// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.videointelligence.v1p2beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P2Beta1";
option go_package = "cloud.google.com/go/videointelligence/apiv1p2beta1/videointelligencepb;videointelligencepb";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p2beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p2beta1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1p2beta1";

// Service that implements Google Cloud Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p2beta1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}
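
// A minimal illustrative sketch (not taken from the official documentation;
// bucket, object, and feature values are placeholders) of the JSON body
// POSTed to `/v1p2beta1/videos:annotate` for the `AnnotateVideo` RPC above:
//
//   {
//     "inputUri": "gs://example-bucket/example-video.mp4",
//     "features": ["LABEL_DETECTION", "SHOT_CHANGE_DETECTION"]
//   }
//
// The long-running operation it returns can then be polled through
// `google.longrunning.Operations.GetOperation`.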

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
  // [Request URIs](https://cloud.google.com/storage/docs/request-endpoints).
  // A video URI may include wildcards in `object-id`, and thus identify
  // multiple videos. Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` should be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via `input_uri`.
  // If set, `input_uri` should be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
  // [Request URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
  // is specified, a region will be determined based on video file location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for TEXT_DETECTION.
  TextDetectionConfig text_detection_config = 8;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e. non-moving) camera.
  // When set to true, might improve detection accuracy for moving objects.
  // Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;
}

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // Language hint can be specified if the language to be detected is known a
  // priori. It can increase the accuracy of the detection. The language hint
  // must be a language code in BCP-47 format.
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;
}
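
// As a hedged illustration (values invented, not from the official docs), a
// request hinting that the video contains English and French text could set,
// in the JSON mapping:
//
//   "videoContext": {
//     "textDetectionConfig": { "languageHints": ["en-US", "fr-FR"] }
//   }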

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}
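
// For a rough, assumed example (proto3 JSON mapping, values invented), a
// segment spanning seconds 2 through 9 of the video would serialize as:
//
//   { "startTimeOffset": "2s", "endTimeOffset": "9s" }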

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g. `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // For example, when the label is `Terrier`, the category is likely `dog`. In
  // some cases there might be more than one category, e.g. `Terrier` could
  // also be a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of the pornography content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;

  // Top Y coordinate.
  float top = 2;

  // Right X coordinate.
  float right = 3;

  // Bottom Y coordinate.
  float bottom = 4;
}
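
// As an illustrative assumption (values invented), a box covering the left
// half of the frame would be, in the JSON mapping:
//
//   { "left": 0.0, "top": 0.0, "right": 0.5, "bottom": 1.0 }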

// Annotation results for a single video.
message VideoAnnotationResults {
  // Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Label annotations on video level or user specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // OCR text detection and tracking.
  // Annotations for the list of detected text snippets. Each snippet has a
  // list of frame-level information associated with it.
  repeated TextAnnotation text_annotations = 12;

  // Annotations for the list of objects detected and tracked in the video.
  repeated ObjectTrackingAnnotation object_annotations = 14;

  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;

  // Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains a list of the corner points in clockwise order starting from the
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's rotated 180 degrees clockwise around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be
// less than 0, or greater than 1, due to trigonometric calculations for the
// location of the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}
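
// As a sketch under assumed values (not from the official docs), the
// horizontal case drawn above could serialize in the JSON mapping as:
//
//   { "vertices": [ { "x": 0.1, "y": 0.1 }, { "x": 0.9, "y": 0.1 },
//                   { "x": 0.9, "y": 0.3 }, { "x": 0.1, "y": 0.3 } ] }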

// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text. It is calculated as the highest
  // over all frames where OCR detected text appears.
  float confidence = 2;

  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}

// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
  // The detected text.
  string text = 1;

  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;
}

// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // The timestamp of the frame in microseconds.
  google.protobuf.Duration time_offset = 2;
}

// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Different representations of tracking info in non-streaming batch
  // and streaming modes.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a uniquely identifiable integer `track_id` so that
    // customers can correlate the results of the ongoing
    // `ObjectTrackingAnnotation` of the same `track_id` over time.
    int64 track_id = 5;
  }

  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;

  // Object category's labeling confidence of this track.
  float confidence = 4;

  // Information corresponding to all frames where this object track appears.
  repeated ObjectTrackingFrame frames = 2;
}

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // OCR text detection and tracking.
  TEXT_DETECTION = 7;

  // Object detection and tracking.
  OBJECT_TRACKING = 9;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}