// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.videointelligence.v1p3beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P3Beta1";
option go_package = "cloud.google.com/go/videointelligence/apiv1p3beta1/videointelligencepb;videointelligencepb";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p3beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p3beta1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1p3beta1";

// Service that implements the Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p3beta1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}

// Service that implements the streaming Video Intelligence API.
service StreamingVideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs video annotation with bidirectional streaming: emitting results
  // while sending video/audio bytes.
  // This method is only available via the gRPC API (not REST).
  rpc StreamingAnnotateVideo(stream StreamingAnnotateVideoRequest)
      returns (stream StreamingAnnotateVideoResponse) {}
}

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported. URIs must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints). To identify
  // multiple videos, a video URI may include wildcards in the `object-id`.
  // Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` must be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via the `input_uri`.
  // If set, `input_uri` must be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported. These must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions are: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no
  // region is specified, the region will be determined based on video file
  // location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
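
// Example for `AnnotateVideoRequest` above (illustrative only, not part of the
// API definition): a minimal JSON request body for
// `POST /v1p3beta1/videos:annotate`, using the proto3 JSON field names and a
// hypothetical bucket and object:
//
//   {
//     "inputUri": "gs://example-bucket/example-video.mp4",
//     "features": ["LABEL_DETECTION", "SHOT_CHANGE_DETECTION"],
//     "videoContext": {
//       "labelDetectionConfig": { "labelDetectionMode": "SHOT_AND_FRAME_MODE" }
//     }
//   }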

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for FACE_DETECTION.
  FaceDetectionConfig face_detection_config = 5;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;

  // Config for TEXT_DETECTION.
  TextDetectionConfig text_detection_config = 8;

  // Config for PERSON_DETECTION.
  PersonDetectionConfig person_detection_config = 11;

  // Config for OBJECT_TRACKING.
  ObjectTrackingConfig object_tracking_config = 13;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e., non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;

  // The confidence threshold used to filter labels from frame-level
  // detection. If not set, it defaults to 0.4. The valid range for this
  // threshold is [0.1, 0.9]. Any value outside of this range will be clipped.
  // Note: For best results, use the default threshold. We will update the
  // default threshold every time we release a new model.
  float frame_confidence_threshold = 4;

  // The confidence threshold used to filter labels from video-level and
  // shot-level detections. If not set, it defaults to 0.3. The valid range
  // for this threshold is [0.1, 0.9]. Any value outside of this range will be
  // clipped.
  // Note: For best results, use the default threshold. We will update the
  // default threshold every time we release a new model.
  float video_confidence_threshold = 5;
}

// Streaming video annotation feature.
enum StreamingFeature {
  // Unspecified.
  STREAMING_FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  STREAMING_LABEL_DETECTION = 1;

  // Shot change detection.
  STREAMING_SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  STREAMING_EXPLICIT_CONTENT_DETECTION = 3;

  // Object detection and tracking.
  STREAMING_OBJECT_TRACKING = 4;

  // Action recognition based on AutoML model.
  STREAMING_AUTOML_ACTION_RECOGNITION = 23;

  // Video classification based on AutoML model.
  STREAMING_AUTOML_CLASSIFICATION = 21;

  // Object detection and tracking based on AutoML model.
  STREAMING_AUTOML_OBJECT_TRACKING = 22;
}

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Human face detection.
  FACE_DETECTION = 4;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;

  // OCR text detection and tracking.
  TEXT_DETECTION = 7;

  // Object detection and tracking.
  OBJECT_TRACKING = 9;

  // Logo detection, tracking, and recognition.
  LOGO_RECOGNITION = 12;

  // Celebrity recognition.
  CELEBRITY_RECOGNITION = 13;

  // Person detection.
  PERSON_DETECTION = 14;
}

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for OBJECT_TRACKING.
message ObjectTrackingConfig {
  // Model to use for object tracking.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for FACE_DETECTION.
message FaceDetectionConfig {
  // Model to use for face detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;

  // Whether bounding boxes are included in the face annotation output.
  bool include_bounding_boxes = 2;

  // Whether to enable face attribute detection, such as glasses, dark_glasses,
  // mouth_open, etc. Ignored if 'include_bounding_boxes' is set to false.
  bool include_attributes = 5;
}

// Config for PERSON_DETECTION.
message PersonDetectionConfig {
  // Whether bounding boxes are included in the person detection annotation
  // output.
  bool include_bounding_boxes = 1;

  // Whether to enable pose landmark detection. Ignored if
  // 'include_bounding_boxes' is set to false.
  bool include_pose_landmarks = 2;

  // Whether to enable person attribute detection, such as clothing color
  // (black, blue, etc.), type (coat, dress, etc.), pattern (plain, floral,
  // etc.), hair, etc.
  // Ignored if 'include_bounding_boxes' is set to false.
  bool include_attributes = 3;
}

// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // A language hint can be specified if the language to be detected is known
  // a priori. It can increase the accuracy of the detection. The hint must be
  // a language code in BCP-47 format.
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;

  // Model to use for text detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 2;
}

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}
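
// Example for `VideoSegment` above (illustrative only): in the proto3 JSON
// mapping, `google.protobuf.Duration` is encoded as a string with an "s"
// suffix, so a segment covering the first 30 seconds of a video would be:
//
//   { "startTimeOffset": "0s", "endTimeOffset": "30s" }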

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g., `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // For example, when the label is `Terrier`, the category is likely `dog`. In
  // some cases there might be more than one category, e.g., `Terrier` could
  // also be a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of the pornography content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;

  // Top Y coordinate.
  float top = 2;

  // Right X coordinate.
  float right = 3;

  // Bottom Y coordinate.
  float bottom = 4;
}
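
// Example for `NormalizedBoundingBox` above (illustrative only): a box
// covering the center quarter of a frame, in proto3 JSON form:
//
//   { "left": 0.25, "top": 0.25, "right": 0.75, "bottom": 0.75 }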

// For tracking related features.
// An object at `time_offset`, with attributes, located by
// `normalized_bounding_box`.
message TimestampedObject {
  // Normalized bounding box in a frame, where the object is located.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this object.
  google.protobuf.Duration time_offset = 2;

  // Optional. The attributes of the object in the bounding box.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The detected landmarks.
  repeated DetectedLandmark landmarks = 4
      [(google.api.field_behavior) = OPTIONAL];
}

// A track of an object instance.
message Track {
  // Video segment of a track.
  VideoSegment segment = 1;

  // The object with timestamp and attributes per frame in the track.
  repeated TimestampedObject timestamped_objects = 2;

  // Optional. Attributes at the track level.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The confidence score of the tracked object.
  float confidence = 4 [(google.api.field_behavior) = OPTIONAL];
}

// A generic detected attribute represented by name in string format.
message DetectedAttribute {
  // The name of the attribute, for example, glasses, dark_glasses, mouth_open.
  // A full list of supported type names will be provided in the document.
  string name = 1;

  // Detected attribute confidence. Range [0, 1].
  float confidence = 2;

  // Text value of the detection result. For example, the value for "HairColor"
  // can be "black", "blonde", etc.
  string value = 3;
}

// Celebrity definition.
message Celebrity {
  // The resource name of the celebrity. Has the format
  // `video-intelligence/kg-mid`, indicating a celebrity from the preloaded
  // gallery. `kg-mid` is the ID in the Google Knowledge Graph, which is unique
  // for the celebrity.
  string name = 1;

  // The celebrity name.
  string display_name = 2;

  // Textual description of additional information about the celebrity, if
  // applicable.
  string description = 3;
}

// The annotation result of a celebrity face track. RecognizedCelebrity field
// could be empty if the face track does not have any matched celebrities.
message CelebrityTrack {
  // The recognized celebrity with confidence score.
  message RecognizedCelebrity {
    // The recognized celebrity.
    Celebrity celebrity = 1;

    // Recognition confidence. Range [0, 1].
    float confidence = 2;
  }

  // Top N match of the celebrities for the face in this track.
  repeated RecognizedCelebrity celebrities = 1;

  // A track of a person's face.
  Track face_track = 3;
}

// Celebrity recognition annotation per video.
message CelebrityRecognitionAnnotation {
  // The tracks detected from the input video, including recognized celebrities
  // and other detected faces in the video.
  repeated CelebrityTrack celebrity_tracks = 1;
}

// A generic detected landmark represented by name in string format and a 2D
// location.
message DetectedLandmark {
  // The name of this landmark, for example, left_hand, right_shoulder.
  string name = 1;

  // The 2D point of the detected landmark using the normalized image
  // coordinate system. The normalized coordinates have the range from 0 to 1.
  NormalizedVertex point = 2;

  // The confidence score of the detected landmark. Range [0, 1].
  float confidence = 3;
}

// Face detection annotation.
message FaceDetectionAnnotation {
  // The face tracks with attributes.
  repeated Track tracks = 3;

  // The thumbnail of a person's face.
  bytes thumbnail = 4;
}

// Person detection annotation per video.
message PersonDetectionAnnotation {
  // The detected tracks of a person.
  repeated Track tracks = 1;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Video segment on which the annotation is run.
  VideoSegment segment = 10;

  // Topical label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Presence label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label. Compared to the
  // existing topical `segment_label_annotations`, this field presents more
  // fine-grained, segment-level labels detected in video content and is made
  // available only when the client sets `LabelDetectionConfig.model` to
  // "builtin/latest" in the request.
  repeated LabelAnnotation segment_presence_label_annotations = 23;

  // Topical label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Presence label annotations on shot level. There is exactly one element for
  // each unique label. Compared to the existing topical
  // `shot_label_annotations`, this field presents more fine-grained, shot-level
  // labels detected in video content and is made available only when the client
  // sets `LabelDetectionConfig.model` to "builtin/latest" in the request.
  repeated LabelAnnotation shot_presence_label_annotations = 24;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Face detection annotations.
  repeated FaceDetectionAnnotation face_detection_annotations = 13;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;

  // OCR text detection and tracking.
  // Annotations for a list of detected text snippets. Each will have a list of
  // frame information associated with it.
  repeated TextAnnotation text_annotations = 12;

  // Annotations for a list of objects detected and tracked in the video.
  repeated ObjectTrackingAnnotation object_annotations = 14;

  // Annotations for a list of logos detected, tracked, and recognized in the
  // video.
  repeated LogoRecognitionAnnotation logo_recognition_annotations = 19;

  // Person detection annotations.
  repeated PersonDetectionAnnotation person_detection_annotations = 20;

  // Celebrity recognition annotations.
  CelebrityRecognitionAnnotation celebrity_recognition_annotations = 21;

  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;

  // Specifies which feature is being tracked if the request contains more than
  // one feature.
  Feature feature = 5;

  // Specifies which segment is being tracked if the request contains more than
  // one segment.
  VideoSegment segment = 6;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // Required. The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
  // return a maximum of one. If omitted, a maximum of one is returned.
  int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default 'false' value
  // does not add punctuation to result hypotheses. NOTE: "This is currently
  // offered as an experimental service, complimentary to all users. In the
  // future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. For file formats, such as MXF or MKV, supporting multiple audio
  // tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', enables speaker detection for each recognized word in
  // the top alternative of the recognition result using a speaker_tag provided
  // in the WordInfo.
  // Note: When this is true, we send all the words from the beginning of the
  // audio for the top alternative in every consecutive response.
  // This is done in order to improve our speaker tags as our models learn to
  // identify the speakers in the conversation over time.
  bool enable_speaker_diarization = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set, specifies the estimated number of speakers in the
  // conversation. If not set, defaults to '2'. Ignored unless
  // enable_speaker_diarization is set to true.
  int32 diarization_speaker_count = 8 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If `true`, the top result includes a list of words and the
  // confidence for those words. If `false`, no word-level confidence
  // information is returned. The default is `false`.
  bool enable_word_confidence = 9 [(google.api.field_behavior) = OPTIONAL];
}
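
// Example for `SpeechTranscriptionConfig` above (illustrative only): a config
// requesting an English transcript with automatic punctuation and speaker
// diarization, in proto3 JSON form (values are hypothetical):
//
//   {
//     "languageCode": "en-US",
//     "enableAutomaticPunctuation": true,
//     "enableSpeakerDiarization": true,
//     "diarizationSpeakerCount": 2
//   }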

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // Optional. A list of strings containing word and phrase "hints" so that
  // speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum specified
  // in `max_alternatives`).  These alternatives are ordered in terms of
  // accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;

  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
  // language tag of the language in this result. This language code was
  // detected to have the most likelihood of being spoken in the audio.
  string language_code = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Transcript text representing the words that the user spoke.
  string transcript = 1;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A list of word-specific information for each recognized word.
  // Note: When `enable_speaker_diarization` is set to true, you will see all
  // the words from the beginning of the audio.
  repeated WordInfo words = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // The word corresponding to this set of information.
  string word = 3;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 4 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A distinct integer value is assigned for every speaker within
  // the audio. This field specifies which one of those speakers was detected to
  // have spoken this word. Value ranges from 1 up to diarization_speaker_count,
  // and is only set if speaker diarization is enabled.
  int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;

  // Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains a list of the corner points in clockwise order starting from the
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's rotated 180 degrees clockwise around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be less
// than 0 or greater than 1 due to trigonometric calculations for the location
// of the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}

// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text. It is calculated as the highest
  // over all frames where OCR detected text appears.
  float confidence = 2;

  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}

// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
  // The detected text.
  string text = 1;

  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;
}

// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // The timestamp of the frame in microseconds.
  google.protobuf.Duration time_offset = 2;
}

// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Different representation of tracking info in non-streaming batch
  // and streaming modes.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a unique, identifiable integer track_id so that
    // customers can correlate the results of the ongoing
    // ObjectTrackingAnnotation of the same track_id over time.
    int64 track_id = 5;
  }

  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;

  // Object category's labeling confidence of this track.
  float confidence = 4;

  // Information corresponding to all frames where this object track appears.
  // Non-streaming batch mode: there may be one or multiple ObjectTrackingFrame
  // messages in frames.
  // Streaming mode: there can only be one ObjectTrackingFrame message in
  // frames.
  repeated ObjectTrackingFrame frames = 2;
}

// Annotation corresponding to one detected, tracked and recognized logo class.
message LogoRecognitionAnnotation {
  // Entity category information to specify the logo class that all the logo
  // tracks within this LogoRecognitionAnnotation are recognized as.
  Entity entity = 1;

  // All logo tracks where the recognized logo appears. Each track corresponds
  // to one logo instance appearing in consecutive frames.
  repeated Track tracks = 2;

  // All video segments where the recognized logo appears. There might be
  // multiple instances of the same logo class appearing in one VideoSegment.
  repeated VideoSegment segments = 3;
}

// The top-level message sent by the client for the `StreamingAnnotateVideo`
// method. Multiple `StreamingAnnotateVideoRequest` messages are sent.
// The first message must only contain a `StreamingVideoConfig` message.
// All subsequent messages must only contain `input_content` data.
message StreamingAnnotateVideoRequest {
  // *Required* The streaming request, which is either a streaming config or
  // video content.
  oneof streaming_request {
    // Provides information to the annotator, specifying how to process the
    // request. The first `StreamingAnnotateVideoRequest` message must only
    // contain a `video_config` message.
    StreamingVideoConfig video_config = 1;

    // The video data to be annotated. Chunks of video data are sequentially
    // sent in `StreamingAnnotateVideoRequest` messages. Except for the initial
    // `StreamingAnnotateVideoRequest` message containing only
    // `video_config`, all subsequent `StreamingAnnotateVideoRequest`
    // messages must only contain the `input_content` field.
    // Note: as with all bytes fields, protocol buffers use a pure binary
    // representation (not base64).
    bytes input_content = 2;
  }
}
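
// Illustrative only: the expected sequence of messages on the request stream
// is one config-only message followed by content-only messages, e.g.:
//
//   1.    StreamingAnnotateVideoRequest { video_config: { ... } }
//   2..N. StreamingAnnotateVideoRequest { input_content: <chunk of bytes> }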

// Provides information to the annotator that specifies how to process the
// request.
message StreamingVideoConfig {
  // Config for requested annotation feature.
  oneof streaming_config {
    // Config for STREAMING_SHOT_CHANGE_DETECTION.
    StreamingShotChangeDetectionConfig shot_change_detection_config = 2;

    // Config for STREAMING_LABEL_DETECTION.
    StreamingLabelDetectionConfig label_detection_config = 3;

    // Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
    StreamingExplicitContentDetectionConfig explicit_content_detection_config =
        4;

    // Config for STREAMING_OBJECT_TRACKING.
    StreamingObjectTrackingConfig object_tracking_config = 5;

    // Config for STREAMING_AUTOML_ACTION_RECOGNITION.
    StreamingAutomlActionRecognitionConfig automl_action_recognition_config =
        23;

    // Config for STREAMING_AUTOML_CLASSIFICATION.
    StreamingAutomlClassificationConfig automl_classification_config = 21;

    // Config for STREAMING_AUTOML_OBJECT_TRACKING.
    StreamingAutomlObjectTrackingConfig automl_object_tracking_config = 22;
  }

  // Requested annotation feature.
  StreamingFeature feature = 1;

  // Streaming storage option. By default: storage is disabled.
  StreamingStorageConfig storage_config = 30;
}
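
// Example for `StreamingVideoConfig` above (illustrative only), in proto text
// format, selecting streaming label detection with a stationary-camera hint:
//
//   feature: STREAMING_LABEL_DETECTION
//   label_detection_config { stationary_camera: true }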

// `StreamingAnnotateVideoResponse` is the only message returned to the client
// by `StreamingAnnotateVideo`. A series of zero or more
// `StreamingAnnotateVideoResponse` messages are streamed back to the client.
message StreamingAnnotateVideoResponse {
  // If set, returns a [google.rpc.Status][google.rpc.Status] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // Streaming annotation results.
  StreamingVideoAnnotationResults annotation_results = 2;

  // Cloud Storage (GCS) URI that stores annotation results of one
  // streaming session in JSON format.
  // It is the `annotation_result_storage_directory`
  // from the request, followed by '/cloud_project_number-session_id'.
  string annotation_results_uri = 3;
}

// Streaming annotation results corresponding to a portion of the video
// that is currently being processed.
message StreamingVideoAnnotationResults {
  // Shot annotation results. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 1;

  // Label annotation results.
  repeated LabelAnnotation label_annotations = 2;

  // Explicit content annotation results.
  ExplicitContentAnnotation explicit_annotation = 3;

  // Object tracking results.
  repeated ObjectTrackingAnnotation object_annotations = 4;
}

// Config for STREAMING_SHOT_CHANGE_DETECTION.
message StreamingShotChangeDetectionConfig {}

// Config for STREAMING_LABEL_DETECTION.
message StreamingLabelDetectionConfig {
  // Whether the video has been captured from a stationary (i.e. non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Default: false.
  bool stationary_camera = 1;
}

// Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
message StreamingExplicitContentDetectionConfig {}

// Config for STREAMING_OBJECT_TRACKING.
message StreamingObjectTrackingConfig {}

// Config for STREAMING_AUTOML_ACTION_RECOGNITION.
message StreamingAutomlActionRecognitionConfig {
  // Resource name of AutoML model.
  // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for STREAMING_AUTOML_CLASSIFICATION.
message StreamingAutomlClassificationConfig {
  // Resource name of AutoML model.
  // Format:
  // `projects/{project_number}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for STREAMING_AUTOML_OBJECT_TRACKING.
message StreamingAutomlObjectTrackingConfig {
  // Resource name of AutoML model.
  // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for streaming storage option.
message StreamingStorageConfig {
  // Enable streaming storage. Default: false.
  bool enable_storage_annotation_result = 1;

  // Cloud Storage URI to store all annotation results for one client. The
  // client should specify this field as the top-level storage directory.
  // Annotation results of different sessions will be put into different
  // sub-directories denoted by project_name and session_id. All
  // sub-directories will be auto-generated and made accessible to the client
  // in the response proto. URIs must be specified in the following format:
  // `gs://bucket-id/object-id`, where `bucket-id` should be a valid Cloud
  // Storage bucket created by the client and bucket permissions should be
  // configured properly. `object-id` can be an arbitrary string that makes
  // sense to the client. Other URI formats will return an error and cause the
  // Cloud Storage write to fail.
  string annotation_result_storage_directory = 3;
}