// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.videointelligence.v1p3beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P3Beta1";
option go_package = "cloud.google.com/go/videointelligence/apiv1p3beta1/videointelligencepb;videointelligencepb";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p3beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p3beta1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1p3beta1";

// Service that implements the Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p3beta1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}

// Service that implements the streaming Video Intelligence API.
service StreamingVideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs video annotation with bidirectional streaming: emitting results
  // while sending video/audio bytes.
  // This method is only available via the gRPC API (not REST).
  rpc StreamingAnnotateVideo(stream StreamingAnnotateVideoRequest)
      returns (stream StreamingAnnotateVideoResponse) {}
}

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported. URIs must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints). To identify
  // multiple videos, a video URI may include wildcards in the `object-id`.
  // Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` must be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via the `input_uri`.
  // If set, `input_uri` must be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported. These must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions are: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no
  // region is specified, the region will be determined based on video file
  // location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
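
// Illustrative example (not part of the API definition): a minimal
// `AnnotateVideoRequest`, shown in protobuf text format. The bucket and object
// names are placeholders; the fields come from the message defined above.
//
//   input_uri: "gs://bucket-id/object-id"
//   features: LABEL_DETECTION
//   features: SHOT_CHANGE_DETECTION
//   output_uri: "gs://bucket-id/output-object-id"
//   video_context {
//     label_detection_config {
//       label_detection_mode: SHOT_AND_FRAME_MODE
//       stationary_camera: true
//     }
//   }
//
// As documented on the fields above, exactly one of `input_uri` and
// `input_content` may be set; this example uses `input_uri`.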

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for FACE_DETECTION.
  FaceDetectionConfig face_detection_config = 5;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;

  // Config for TEXT_DETECTION.
  TextDetectionConfig text_detection_config = 8;

  // Config for PERSON_DETECTION.
  PersonDetectionConfig person_detection_config = 11;

  // Config for OBJECT_TRACKING.
  ObjectTrackingConfig object_tracking_config = 13;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e., non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;

  // The confidence threshold used to filter labels from frame-level
  // detection. If not set, it is set to 0.4 by default. The valid range for
  // this threshold is [0.1, 0.9]. Any value set outside of this range will be
  // clipped.
  // Note: For best results, use the default threshold. The default threshold
  // is updated every time a new model is released.
  float frame_confidence_threshold = 4;

  // The confidence threshold used to filter labels from video-level and
  // shot-level detections. If not set, it is set to 0.3 by default. The valid
  // range for this threshold is [0.1, 0.9]. Any value set outside of this
  // range will be clipped.
  // Note: For best results, use the default threshold. The default threshold
  // is updated every time a new model is released.
  float video_confidence_threshold = 5;
}

// Streaming video annotation feature.
enum StreamingFeature {
  // Unspecified.
  STREAMING_FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  STREAMING_LABEL_DETECTION = 1;

  // Shot change detection.
  STREAMING_SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  STREAMING_EXPLICIT_CONTENT_DETECTION = 3;

  // Object detection and tracking.
  STREAMING_OBJECT_TRACKING = 4;

  // Action recognition based on AutoML model.
  STREAMING_AUTOML_ACTION_RECOGNITION = 23;

  // Video classification based on AutoML model.
  STREAMING_AUTOML_CLASSIFICATION = 21;

  // Object detection and tracking based on AutoML model.
  STREAMING_AUTOML_OBJECT_TRACKING = 22;
}

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Human face detection.
  FACE_DETECTION = 4;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;

  // OCR text detection and tracking.
  TEXT_DETECTION = 7;

  // Object detection and tracking.
  OBJECT_TRACKING = 9;

  // Logo detection, tracking, and recognition.
  LOGO_RECOGNITION = 12;

  // Celebrity recognition.
  CELEBRITY_RECOGNITION = 13;

  // Person detection.
  PERSON_DETECTION = 14;
}

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for OBJECT_TRACKING.
message ObjectTrackingConfig {
  // Model to use for object tracking.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for FACE_DETECTION.
message FaceDetectionConfig {
  // Model to use for face detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;

  // Whether bounding boxes are included in the face annotation output.
  bool include_bounding_boxes = 2;

  // Whether to enable face attribute detection, such as glasses, dark_glasses,
  // mouth_open, etc. Ignored if 'include_bounding_boxes' is set to false.
  bool include_attributes = 5;
}

// Config for PERSON_DETECTION.
message PersonDetectionConfig {
  // Whether bounding boxes are included in the person detection annotation
  // output.
  bool include_bounding_boxes = 1;

  // Whether to enable pose landmark detection. Ignored if
  // 'include_bounding_boxes' is set to false.
  bool include_pose_landmarks = 2;

  // Whether to enable person attribute detection, such as cloth color (black,
  // blue, etc.), type (coat, dress, etc.), pattern (plain, floral, etc.),
  // hair, etc.
  // Ignored if 'include_bounding_boxes' is set to false.
  bool include_attributes = 3;
}

// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // Language hints can be specified if the language to be detected is known a
  // priori. They can increase the accuracy of the detection. Language hints
  // must be language codes in BCP-47 format.
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;

  // Model to use for text detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 2;
}
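
// Illustrative example (not part of the API definition): a `VideoContext`, in
// protobuf text format, combining the detection configs defined above. The
// values are placeholders chosen for illustration only.
//
//   face_detection_config {
//     model: "builtin/stable"
//     include_bounding_boxes: true
//     include_attributes: true
//   }
//   person_detection_config {
//     include_bounding_boxes: true
//     include_pose_landmarks: true
//     include_attributes: true
//   }
//   text_detection_config {
//     language_hints: "en-US"
//   }
//
// As documented above, the attribute and landmark flags are ignored unless
// `include_bounding_boxes` is set to true.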

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g., `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // For example, when the label is `Terrier`, the category is likely `dog`.
  // In some cases there might be more than one category, e.g., `Terrier` could
  // also be a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of the pornography content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;

  // Top Y coordinate.
  float top = 2;

  // Right X coordinate.
  float right = 3;

  // Bottom Y coordinate.
  float bottom = 4;
}

// For tracking related features.
// An object at time_offset with attributes, and located with
// normalized_bounding_box.
message TimestampedObject {
  // Normalized bounding box in a frame, where the object is located.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this object.
  google.protobuf.Duration time_offset = 2;

  // Optional. The attributes of the object in the bounding box.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The detected landmarks.
  repeated DetectedLandmark landmarks = 4
      [(google.api.field_behavior) = OPTIONAL];
}

// A track of an object instance.
message Track {
  // Video segment of a track.
  VideoSegment segment = 1;

  // The object with timestamp and attributes per frame in the track.
  repeated TimestampedObject timestamped_objects = 2;

  // Optional. Attributes at the track level.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The confidence score of the tracked object.
  float confidence = 4 [(google.api.field_behavior) = OPTIONAL];
}

// A generic detected attribute represented by name in string format.
message DetectedAttribute {
  // The name of the attribute, for example, glasses, dark_glasses, mouth_open.
  // A full list of supported type names will be provided in the documentation.
  string name = 1;

  // Detected attribute confidence. Range [0, 1].
  float confidence = 2;

  // Text value of the detection result. For example, the value for "HairColor"
  // can be "black", "blonde", etc.
  string value = 3;
}

// Celebrity definition.
message Celebrity {
  // The resource name of the celebrity. Has the format
  // `video-intelligence/kg-mid`, indicating a celebrity from the preloaded
  // gallery. kg-mid is the id in the Google Knowledge Graph, which is unique
  // for the celebrity.
  string name = 1;

  // The celebrity name.
  string display_name = 2;

  // Textual description of additional information about the celebrity, if
  // applicable.
  string description = 3;
}

// The annotation result of a celebrity face track. RecognizedCelebrity field
// could be empty if the face track does not have any matched celebrities.
message CelebrityTrack {
  // The recognized celebrity with confidence score.
  message RecognizedCelebrity {
    // The recognized celebrity.
    Celebrity celebrity = 1;

    // Recognition confidence. Range [0, 1].
    float confidence = 2;
  }

  // Top N match of the celebrities for the face in this track.
  repeated RecognizedCelebrity celebrities = 1;

  // A track of a person's face.
  Track face_track = 3;
}

// Celebrity recognition annotation per video.
message CelebrityRecognitionAnnotation {
  // The tracks detected from the input video, including recognized celebrities
  // and other detected faces in the video.
  repeated CelebrityTrack celebrity_tracks = 1;
}

// A generic detected landmark represented by name in string format and a 2D
// location.
message DetectedLandmark {
  // The name of this landmark, for example, left_hand, right_shoulder.
  string name = 1;

  // The 2D point of the detected landmark using the normalized image
  // coordinate system. The normalized coordinates have the range from 0 to 1.
  NormalizedVertex point = 2;

  // The confidence score of the detected landmark. Range [0, 1].
  float confidence = 3;
}

// Face detection annotation.
message FaceDetectionAnnotation {
  // The face tracks with attributes.
  repeated Track tracks = 3;

  // The thumbnail of a person's face.
  bytes thumbnail = 4;
}

// Person detection annotation per video.
message PersonDetectionAnnotation {
  // The detected tracks of a person.
  repeated Track tracks = 1;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Video segment on which the annotation is run.
  VideoSegment segment = 10;

  // Topical label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Presence label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label. Compared to the
  // existing topical `segment_label_annotations`, this field presents more
  // fine-grained, segment-level labels detected in video content and is made
  // available only when the client sets `LabelDetectionConfig.model` to
  // "builtin/latest" in the request.
  repeated LabelAnnotation segment_presence_label_annotations = 23;

  // Topical label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Presence label annotations on shot level. There is exactly one element for
  // each unique label. Compared to the existing topical
  // `shot_label_annotations`, this field presents more fine-grained, shot-level
  // labels detected in video content and is made available only when the client
  // sets `LabelDetectionConfig.model` to "builtin/latest" in the request.
  repeated LabelAnnotation shot_presence_label_annotations = 24;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Face detection annotations.
  repeated FaceDetectionAnnotation face_detection_annotations = 13;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;

  // OCR text detection and tracking.
  // Annotations for the list of detected text snippets. Each will have a list
  // of frame information associated with it.
  repeated TextAnnotation text_annotations = 12;

  // Annotations for the list of objects detected and tracked in the video.
  repeated ObjectTrackingAnnotation object_annotations = 14;

  // Annotations for the list of logos detected, tracked, and recognized in the
  // video.
  repeated LogoRecognitionAnnotation logo_recognition_annotations = 19;

  // Person detection annotations.
  repeated PersonDetectionAnnotation person_detection_annotations = 20;

  // Celebrity recognition annotations.
  CelebrityRecognitionAnnotation celebrity_recognition_annotations = 21;

  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;

  // Specifies which feature is being tracked if the request contains more than
  // one feature.
  Feature feature = 5;

  // Specifies which segment is being tracked if the request contains more than
  // one segment.
  VideoSegment segment = 6;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // Required. The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
  // return a maximum of one. If omitted, will return a maximum of one.
  int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default 'false' value
  // does not add punctuation to result hypotheses. NOTE: "This is currently
  // offered as an experimental service, complimentary to all users. In the
  // future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. For file formats, such as MXF or MKV, supporting multiple audio
  // tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', enables speaker detection for each recognized word in
  // the top alternative of the recognition result using a speaker_tag provided
  // in the WordInfo.
  // Note: When this is true, we send all the words from the beginning of the
  // audio for the top alternative in every consecutive response.
  // This is done in order to improve our speaker tags as our models learn to
  // identify the speakers in the conversation over time.
  bool enable_speaker_diarization = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set, specifies the estimated number of speakers in the
  // conversation. If not set, defaults to '2'. Ignored unless
  // enable_speaker_diarization is set to true.
  int32 diarization_speaker_count = 8 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If `true`, the top result includes a list of words and the
  // confidence for those words. If `false`, no word-level confidence
  // information is returned. The default is `false`.
  bool enable_word_confidence = 9 [(google.api.field_behavior) = OPTIONAL];
}
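
// Illustrative example (not part of the API definition): a
// `SpeechTranscriptionConfig`, in protobuf text format, for English audio with
// automatic punctuation and speaker diarization enabled. The values are
// placeholders; only `language_code` is required.
//
//   language_code: "en-US"
//   max_alternatives: 1
//   enable_automatic_punctuation: true
//   enable_speaker_diarization: true
//   diarization_speaker_count: 2
//   enable_word_confidence: true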

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // Optional. A list of strings containing words and phrases "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum specified
  // in `max_alternatives`). These alternatives are ordered in terms of
  // accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;

  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
  // language tag of the language in this result. This language code was
  // detected to have the most likelihood of being spoken in the audio.
  string language_code = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Transcript text representing the words that the user spoke.
  string transcript = 1;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A list of word-specific information for each recognized word.
  // Note: When `enable_speaker_diarization` is set to true, you will see all
  // the words from the beginning of the audio.
  repeated WordInfo words = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // The word corresponding to this set of information.
  string word = 3;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 4 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A distinct integer value is assigned for every speaker within
  // the audio. This field specifies which one of those speakers was detected to
  // have spoken this word. Value ranges from 1 up to diarization_speaker_count,
  // and is only set if speaker diarization is enabled.
  int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;

  // Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains a list of the corner points in clockwise order starting from
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's clockwise rotated 180 degrees around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be less
// than 0, or greater than 1 due to trigonometric calculations for location of
// the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}

// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text. It is calculated as the highest
  // over all frames where OCR detected text appears.
  float confidence = 2;

  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}

// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
  // The detected text.
  string text = 1;

  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;
}

// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // The timestamp of the frame in microseconds.
  google.protobuf.Duration time_offset = 2;
}

// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Different representation of tracking info in non-streaming batch
  // and streaming modes.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a uniquely identifiable integer track_id so that
    // the customers can correlate the results of the ongoing
    // ObjectTrackAnnotation of the same track_id over time.
    int64 track_id = 5;
  }

  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;

  // Object category's labeling confidence of this track.
  float confidence = 4;

  // Information corresponding to all frames where this object track appears.
  // Non-streaming batch mode: it may be one or multiple ObjectTrackingFrame
  // messages in frames.
  // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
  repeated ObjectTrackingFrame frames = 2;
}

// Annotation corresponding to one detected, tracked, and recognized logo class.
message LogoRecognitionAnnotation {
  // Entity category information to specify the logo class that all the logo
  // tracks within this LogoRecognitionAnnotation are recognized as.
  Entity entity = 1;

  // All logo tracks where the recognized logo appears. Each track corresponds
  // to one logo instance appearing in consecutive frames.
  repeated Track tracks = 2;

  // All video segments where the recognized logo appears. There might be
  // multiple instances of the same logo class appearing in one VideoSegment.
  repeated VideoSegment segments = 3;
}

// The top-level message sent by the client for the `StreamingAnnotateVideo`
// method. Multiple `StreamingAnnotateVideoRequest` messages are sent.
// The first message must only contain a `StreamingVideoConfig` message.
// All subsequent messages must only contain `input_content` data.
message StreamingAnnotateVideoRequest {
  // *Required* The streaming request, which is either a streaming config or
  // video content.
  oneof streaming_request {
    // Provides information to the annotator, specifying how to process the
    // request. The first `StreamingAnnotateVideoRequest` message must only
    // contain a `video_config` message.
    StreamingVideoConfig video_config = 1;

    // The video data to be annotated. Chunks of video data are sequentially
    // sent in `StreamingAnnotateVideoRequest` messages. Except for the initial
    // `StreamingAnnotateVideoRequest` message containing only
    // `video_config`, all subsequent `StreamingAnnotateVideoRequest`
    // messages must only contain the `input_content` field.
    // Note: as with all bytes fields, protocol buffers use a pure binary
    // representation (not base64).
    bytes input_content = 2;
  }
}
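
// Illustrative example (not part of the API definition): the message sequence
// on one `StreamingAnnotateVideo` stream, in protobuf text format. The first
// request carries only `video_config`; every later request carries only
// `input_content` (raw video bytes, represented here by a placeholder string).
// The `StreamingVideoConfig` fields are defined below.
//
//   # First StreamingAnnotateVideoRequest on the stream:
//   video_config {
//     feature: STREAMING_LABEL_DETECTION
//     label_detection_config {
//       stationary_camera: false
//     }
//   }
//
//   # Every subsequent StreamingAnnotateVideoRequest:
//   input_content: "<chunk of video bytes>"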

// Provides information to the annotator that specifies how to process the
// request.
message StreamingVideoConfig {
  // Config for requested annotation feature.
  oneof streaming_config {
    // Config for STREAMING_SHOT_CHANGE_DETECTION.
    StreamingShotChangeDetectionConfig shot_change_detection_config = 2;

    // Config for STREAMING_LABEL_DETECTION.
    StreamingLabelDetectionConfig label_detection_config = 3;

    // Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
    StreamingExplicitContentDetectionConfig explicit_content_detection_config =
        4;

    // Config for STREAMING_OBJECT_TRACKING.
    StreamingObjectTrackingConfig object_tracking_config = 5;

    // Config for STREAMING_AUTOML_ACTION_RECOGNITION.
    StreamingAutomlActionRecognitionConfig automl_action_recognition_config =
        23;

    // Config for STREAMING_AUTOML_CLASSIFICATION.
    StreamingAutomlClassificationConfig automl_classification_config = 21;

    // Config for STREAMING_AUTOML_OBJECT_TRACKING.
    StreamingAutomlObjectTrackingConfig automl_object_tracking_config = 22;
  }

  // Requested annotation feature.
  StreamingFeature feature = 1;

  // Streaming storage option. By default: storage is disabled.
  StreamingStorageConfig storage_config = 30;
}

// `StreamingAnnotateVideoResponse` is the only message returned to the client
// by `StreamingAnnotateVideo`. A series of zero or more
// `StreamingAnnotateVideoResponse` messages are streamed back to the client.
message StreamingAnnotateVideoResponse {
  // If set, returns a [google.rpc.Status][google.rpc.Status] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // Streaming annotation results.
  StreamingVideoAnnotationResults annotation_results = 2;

  // Google Cloud Storage (GCS) URI that stores annotation results of one
  // streaming session in JSON format.
  // It is the annotation_result_storage_directory
  // from the request followed by '/cloud_project_number-session_id'.
  string annotation_results_uri = 3;
}

// Streaming annotation results corresponding to a portion of the video
// that is currently being processed.
message StreamingVideoAnnotationResults {
  // Shot annotation results. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 1;

  // Label annotation results.
  repeated LabelAnnotation label_annotations = 2;

  // Explicit content annotation results.
  ExplicitContentAnnotation explicit_annotation = 3;

  // Object tracking results.
  repeated ObjectTrackingAnnotation object_annotations = 4;
}

// Config for STREAMING_SHOT_CHANGE_DETECTION.
message StreamingShotChangeDetectionConfig {}

// Config for STREAMING_LABEL_DETECTION.
message StreamingLabelDetectionConfig {
  // Whether the video has been captured from a stationary (i.e. non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Default: false.
  bool stationary_camera = 1;
}

// Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
message StreamingExplicitContentDetectionConfig {}

// Config for STREAMING_OBJECT_TRACKING.
message StreamingObjectTrackingConfig {}

// Config for STREAMING_AUTOML_ACTION_RECOGNITION.
message StreamingAutomlActionRecognitionConfig {
  // Resource name of AutoML model.
  // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for STREAMING_AUTOML_CLASSIFICATION.
message StreamingAutomlClassificationConfig {
  // Resource name of AutoML model.
  // Format:
  // `projects/{project_number}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for STREAMING_AUTOML_OBJECT_TRACKING.
message StreamingAutomlObjectTrackingConfig {
  // Resource name of AutoML model.
  // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for streaming storage option.
message StreamingStorageConfig {
  // Enable streaming storage. Default: false.
  bool enable_storage_annotation_result = 1;

  // Cloud Storage URI to store all annotation results for one client. The
  // client should specify this field as the top-level storage directory.
  // Annotation results of different sessions will be put into different
  // sub-directories denoted by project_name and session_id. All
  // sub-directories will be auto generated by the program and will be made
  // accessible to the client in the response proto. URIs must be specified in
  // the following format: `gs://bucket-id/object-id`. `bucket-id` should be a
  // valid Cloud Storage bucket created by the client, and the bucket
  // permissions shall also be configured properly. `object-id` can be an
  // arbitrary string that makes sense to the client.
  // Other URI formats will return an error and cause Cloud Storage write
  // failure.
  string annotation_result_storage_directory = 3;
}
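
// Illustrative example (not part of the API definition): a
// `StreamingVideoConfig`, in protobuf text format, that requests streaming
// object tracking and enables result storage through `StreamingStorageConfig`.
// The Cloud Storage path is a placeholder.
//
//   feature: STREAMING_OBJECT_TRACKING
//   object_tracking_config {}
//   storage_config {
//     enable_storage_annotation_result: true
//     annotation_result_storage_directory: "gs://bucket-id/object-id"
//   }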