// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.videointelligence.v1beta2;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1Beta2";
option go_package = "cloud.google.com/go/videointelligence/apiv1beta2/videointelligencepb;videointelligencepb";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1beta2";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1beta2";
option ruby_package = "Google::Cloud::VideoIntelligence::V1beta2";

// Service that implements the Google Cloud Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
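  //
  // As a rough sketch (not part of the upstream documentation), a call to the
  // HTTP binding below could carry a JSON body like the following; the bucket
  // and object names are hypothetical:
  //
  //     {
  //       "inputUri": "gs://example-bucket/example-video.mp4",
  //       "features": ["LABEL_DETECTION", "SHOT_CHANGE_DETECTION"]
  //     }
  //
  // The returned `Operation` can then be polled via
  // `google.longrunning.Operations.GetOperation` until `done` is true.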
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1beta2/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints). A video URI
  // may include wildcards in `object-id`, and thus identify multiple videos.
  // Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` should be unset.
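  // For example (hypothetical bucket and object names),
  // `gs://example-bucket/videos/*.mp4` would match every `.mp4` object under
  // the `videos/` prefix in `example-bucket`.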
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via `input_uri`.
  // If set, `input_uri` should be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
  // is specified, a region will be determined based on video file location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}

// Video context and/or feature-specific parameters.
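// As an illustrative sketch (field values are hypothetical), a request's
// `video_context` that restricts analysis to the first minute and asks for
// shot-level labels could look like this in proto3 JSON form:
//
//     {
//       "segments": [
//         { "startTimeOffset": "0s", "endTimeOffset": "60s" }
//       ],
//       "labelDetectionConfig": { "labelDetectionMode": "SHOT_MODE" }
//     }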
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for FACE_DETECTION.
  FaceDetectionConfig face_detection_config = 5;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e. non-moving) camera.
  // When set to true, might improve detection accuracy for moving objects.
  // Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;
}

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for FACE_DETECTION.
message FaceDetectionConfig {
  // Model to use for face detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;

  // Whether bounding boxes should be included in the face annotation output.
  bool include_bounding_boxes = 2;
}

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g. `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // E.g., when the label is `Terrier`, the category is likely `dog`. In some
  // cases there might be more than one category; e.g., `Terrier` could also
  // be a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of pornography content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
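// For example (values are illustrative), a box covering the left half of a
// frame would have `left = 0.0`, `top = 0.0`, `right = 0.5`, `bottom = 1.0`.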
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;

  // Top Y coordinate.
  float top = 2;

  // Right X coordinate.
  float right = 3;

  // Bottom Y coordinate.
  float bottom = 4;
}

// Video segment level annotation results for face detection.
message FaceSegment {
  // Video segment where a face was detected.
  VideoSegment segment = 1;
}

// Video frame level annotation results for face detection.
message FaceFrame {
  // Normalized bounding boxes in a frame.
  // There can be more than one box if the same face is detected in multiple
  // locations within the current frame.
  repeated NormalizedBoundingBox normalized_bounding_boxes = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this location.
  google.protobuf.Duration time_offset = 2;
}

// Face annotation.
message FaceAnnotation {
  // Thumbnail of a representative face view (in JPEG format).
  bytes thumbnail = 1;

  // All video segments where a face was detected.
  repeated FaceSegment segments = 2;

  // All video frames where a face was detected.
  repeated FaceFrame frames = 3;
}

// Annotation results for a single video.
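// As a hypothetical illustration of the shape of these results (entity ID,
// times, and confidence are made up), a single `segment_label_annotations`
// element might look like this in proto3 JSON form:
//
//     {
//       "entity": {
//         "entityId": "example-entity-id",
//         "description": "dog",
//         "languageCode": "en-US"
//       },
//       "segments": [
//         {
//           "segment": { "startTimeOffset": "0s", "endTimeOffset": "60s" },
//           "confidence": 0.87
//         }
//       ]
//     }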
message VideoAnnotationResults {
  // Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Face annotations. There is exactly one element for each unique face.
  repeated FaceAnnotation face_annotations = 5;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Approximate percentage processed thus far.
  // Guaranteed to be 100 when fully processed.
  int32 progress_percent = 2;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Human face detection and tracking.
  FACE_DETECTION = 4;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}