// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.videointelligence.v1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1";
option go_package = "cloud.google.com/go/videointelligence/apiv1/videointelligencepb;videointelligencepb";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1";

// Service that implements the Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported. URIs must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints). To identify
  // multiple videos, a video URI may include wildcards in the `object-id`.
  // Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` must be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via the `input_uri`.
  // If set, `input_uri` must be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported. These must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions are: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no
  // region is specified, the region will be determined based on video file
  // location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
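
// Illustrative only: a minimal JSON body for the REST mapping above
// (`POST /v1/videos:annotate`), using the proto3 JSON names of the
// `AnnotateVideoRequest` fields. The bucket and object names are placeholders.
//
//     {
//       "inputUri": "gs://YOUR_BUCKET/YOUR_VIDEO.mp4",
//       "features": ["LABEL_DETECTION", "SHOT_CHANGE_DETECTION"],
//       "outputUri": "gs://YOUR_BUCKET/output.json"
//     }
//
// A successful call returns a `google.longrunning.Operation`; poll it until
// `done` is true, then read `AnnotateVideoResponse` from its `response` field.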

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for FACE_DETECTION.
  FaceDetectionConfig face_detection_config = 5;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;

  // Config for TEXT_DETECTION.
  TextDetectionConfig text_detection_config = 8;

  // Config for PERSON_DETECTION.
  PersonDetectionConfig person_detection_config = 11;

  // Config for OBJECT_TRACKING.
  ObjectTrackingConfig object_tracking_config = 13;
}

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Human face detection.
  FACE_DETECTION = 4;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;

  // OCR text detection and tracking.
  TEXT_DETECTION = 7;

  // Object detection and tracking.
  OBJECT_TRACKING = 9;

  // Logo detection, tracking, and recognition.
  LOGO_RECOGNITION = 12;

  // Person detection.
  PERSON_DETECTION = 14;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e., non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;

  // The confidence threshold used to filter labels from frame-level
  // detection. If not set, it is set to 0.4 by default. The valid range for
  // this threshold is [0.1, 0.9]. Any value set outside of this range will be
  // clipped.
  // Note: For best results, use the default threshold. We will update the
  // default threshold every time we release a new model.
  float frame_confidence_threshold = 4;

  // The confidence threshold used to filter labels from video-level and
  // shot-level detections. If not set, it's set to 0.3 by default. The valid
  // range for this threshold is [0.1, 0.9]. Any value set outside of this
  // range will be clipped.
  // Note: For best results, use the default threshold. We will update the
  // default threshold every time we release a new model.
  float video_confidence_threshold = 5;
}

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for OBJECT_TRACKING.
message ObjectTrackingConfig {
  // Model to use for object tracking.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for FACE_DETECTION.
message FaceDetectionConfig {
  // Model to use for face detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;

  // Whether bounding boxes are included in the face annotation output.
  bool include_bounding_boxes = 2;

  // Whether to enable face attributes detection, such as glasses,
  // dark_glasses, mouth_open, etc. Ignored if 'include_bounding_boxes' is set
  // to false.
  bool include_attributes = 5;
}

// Config for PERSON_DETECTION.
message PersonDetectionConfig {
  // Whether bounding boxes are included in the person detection annotation
  // output.
  bool include_bounding_boxes = 1;

  // Whether to enable pose landmarks detection. Ignored if
  // 'include_bounding_boxes' is set to false.
  bool include_pose_landmarks = 2;

  // Whether to enable person attributes detection, such as cloth color (black,
  // blue, etc.), type (coat, dress, etc.), pattern (plain, floral, etc.),
  // hair, etc.
  // Ignored if 'include_bounding_boxes' is set to false.
  bool include_attributes = 3;
}
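
// Illustrative only: a `VideoContext`, in proto text format, that restricts
// annotation to the first 30 seconds of the video and tunes LABEL_DETECTION.
// The values shown are placeholders, not recommendations.
//
//     segments {
//       start_time_offset { seconds: 0 }
//       end_time_offset { seconds: 30 }
//     }
//     label_detection_config {
//       label_detection_mode: SHOT_AND_FRAME_MODE
//       stationary_camera: true
//       model: "builtin/stable"
//     }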

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // Language hint can be specified if the language to be detected is known a
  // priori. It can increase the accuracy of the detection. Language hint must
  // be a language code in BCP-47 format.
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;

  // Model to use for text detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 2;
}

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g., `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // For example, when the label is `Terrier`, the category is likely `dog`. In
  // some cases there might be more than one category, e.g., `Terrier` could
  // also be a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;

  // Feature version.
  string version = 5;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of the pornography content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;

  // Feature version.
  string version = 2;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;

  // Top Y coordinate.
  float top = 2;

  // Right X coordinate.
  float right = 3;

  // Bottom Y coordinate.
  float bottom = 4;
}
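
// Illustrative only: for a detection covering the left half of a frame,
// regardless of the frame's pixel resolution, the normalized box would be
// approximately:
//
//     left: 0.0
//     top: 0.0
//     right: 0.5
//     bottom: 1.0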

// Face detection annotation.
message FaceDetectionAnnotation {
  // The face tracks with attributes.
  repeated Track tracks = 3;

  // The thumbnail of a person's face.
  bytes thumbnail = 4;

  // Feature version.
  string version = 5;
}

// Person detection annotation per video.
message PersonDetectionAnnotation {
  // The detected tracks of a person.
  repeated Track tracks = 1;

  // Feature version.
  string version = 2;
}

// Video segment level annotation results for face detection.
message FaceSegment {
  // Video segment where a face was detected.
  VideoSegment segment = 1;
}

// Deprecated. No effect.
message FaceFrame {
  option deprecated = true;

  // Normalized bounding boxes in a frame.
  // There can be more than one box if the same face is detected in multiple
  // locations within the current frame.
  repeated NormalizedBoundingBox normalized_bounding_boxes = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this location.
  google.protobuf.Duration time_offset = 2;
}

// Deprecated. No effect.
message FaceAnnotation {
  option deprecated = true;

  // Thumbnail of a representative face view (in JPEG format).
  bytes thumbnail = 1;

  // All video segments where a face was detected.
  repeated FaceSegment segments = 2;

  // All video frames where a face was detected.
  repeated FaceFrame frames = 3;
}

// For tracking related features.
// An object at time_offset with attributes, and located with
// normalized_bounding_box.
message TimestampedObject {
  // Normalized bounding box in a frame, where the object is located.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this object.
  google.protobuf.Duration time_offset = 2;

  // Optional. The attributes of the object in the bounding box.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The detected landmarks.
  repeated DetectedLandmark landmarks = 4
      [(google.api.field_behavior) = OPTIONAL];
}

// A track of an object instance.
message Track {
  // Video segment of a track.
  VideoSegment segment = 1;

  // The object with timestamp and attributes per frame in the track.
  repeated TimestampedObject timestamped_objects = 2;

  // Optional. Attributes in the track level.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The confidence score of the tracked object.
  float confidence = 4 [(google.api.field_behavior) = OPTIONAL];
}

// A generic detected attribute represented by name in string format.
message DetectedAttribute {
  // The name of the attribute, for example, glasses, dark_glasses, mouth_open.
  // A full list of supported type names will be provided in the document.
  string name = 1;

  // Detected attribute confidence. Range [0, 1].
  float confidence = 2;

  // Text value of the detection result. For example, the value for "HairColor"
  // can be "black", "blonde", etc.
  string value = 3;
}

// A generic detected landmark represented by name in string format and a 2D
// location.
message DetectedLandmark {
  // The name of this landmark, for example, left_hand, right_shoulder.
  string name = 1;

  // The 2D point of the detected landmark using the normalized image
  // coordinate system. The normalized coordinates have the range from 0 to 1.
  NormalizedVertex point = 2;

  // The confidence score of the detected landmark. Range [0, 1].
  float confidence = 3;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Video segment on which the annotation is run.
  VideoSegment segment = 10;

  // Topical label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Presence label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label. Compared to the
  // existing topical `segment_label_annotations`, this field presents more
  // fine-grained, segment-level labels detected in video content and is made
  // available only when the client sets `LabelDetectionConfig.model` to
  // "builtin/latest" in the request.
  repeated LabelAnnotation segment_presence_label_annotations = 23;

  // Topical label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Presence label annotations on shot level. There is exactly one element for
  // each unique label. Compared to the existing topical
  // `shot_label_annotations`, this field presents more fine-grained, shot-level
  // labels detected in video content and is made available only when the client
  // sets `LabelDetectionConfig.model` to "builtin/latest" in the request.
  repeated LabelAnnotation shot_presence_label_annotations = 24;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Deprecated. Please use `face_detection_annotations` instead.
  repeated FaceAnnotation face_annotations = 5 [deprecated = true];

  // Face detection annotations.
  repeated FaceDetectionAnnotation face_detection_annotations = 13;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;

  // OCR text detection and tracking.
  // Annotations for the list of detected text snippets. Each will have a list
  // of frame information associated with it.
  repeated TextAnnotation text_annotations = 12;

  // Annotations for the list of objects detected and tracked in video.
  repeated ObjectTrackingAnnotation object_annotations = 14;

  // Annotations for the list of logos detected, tracked and recognized in video.
  repeated LogoRecognitionAnnotation logo_recognition_annotations = 19;

  // Person detection annotations.
  repeated PersonDetectionAnnotation person_detection_annotations = 20;

  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;

  // Specifies which feature is being tracked if the request contains more than
  // one feature.
  Feature feature = 5;

  // Specifies which segment is being tracked if the request contains more than
  // one segment.
  VideoSegment segment = 6;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}
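
// Illustrative only: the JSON shape of an `Operation` polled while a request
// is still running, with `AnnotateVideoProgress` carried in `metadata`. Once
// `done` is true, the `response` field carries `AnnotateVideoResponse`
// instead. The operation name and URI are placeholders.
//
//     {
//       "name": "OPERATION_NAME",
//       "metadata": {
//         "@type": "type.googleapis.com/google.cloud.videointelligence.v1.AnnotateVideoProgress",
//         "annotationProgress": [{
//           "inputUri": "gs://YOUR_BUCKET/YOUR_VIDEO.mp4",
//           "progressPercent": 42
//         }]
//       },
//       "done": false
//     }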

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // Required. The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
  // return a maximum of one. If omitted, will return a maximum of one.
  int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default 'false' value
  // does not add punctuation to result hypotheses. NOTE: "This is currently
  // offered as an experimental service, complimentary to all users. In the
  // future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. For file formats, such as MXF or MKV, supporting multiple audio
  // tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', enables speaker detection for each recognized word in
  // the top alternative of the recognition result using a speaker_tag provided
  // in the WordInfo.
  // Note: When this is true, we send all the words from the beginning of the
  // audio for the top alternative in every consecutive response.
  // This is done in order to improve our speaker tags as our models learn to
  // identify the speakers in the conversation over time.
  bool enable_speaker_diarization = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set, specifies the estimated number of speakers in the
  // conversation. If not set, defaults to '2'. Ignored unless
  // enable_speaker_diarization is set to true.
  int32 diarization_speaker_count = 8 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If `true`, the top result includes a list of words and the
  // confidence for those words. If `false`, no word-level confidence
  // information is returned. The default is `false`.
  bool enable_word_confidence = 9 [(google.api.field_behavior) = OPTIONAL];
}
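
// Illustrative only: a `SpeechTranscriptionConfig` inside a `VideoContext`, in
// proto text format, for English audio with automatic punctuation and
// two-speaker diarization. The values are placeholders, not recommendations.
//
//     speech_transcription_config {
//       language_code: "en-US"
//       enable_automatic_punctuation: true
//       enable_speaker_diarization: true
//       diarization_speaker_count: 2
//     }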

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // Optional. A list of strings containing word and phrase "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum specified
  // in `max_alternatives`). These alternatives are ordered in terms of
  // accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;

  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
  // language tag of the language in this result. This language code was
  // detected to have the most likelihood of being spoken in the audio.
  string language_code = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Transcript text representing the words that the user spoke.
  string transcript = 1;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A list of word-specific information for each recognized word.
  // Note: When `enable_speaker_diarization` is set to true, you will see all
  // the words from the beginning of the audio.
  repeated WordInfo words = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // The word corresponding to this set of information.
  string word = 3;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 4 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A distinct integer value is assigned for every speaker within
  // the audio. This field specifies which one of those speakers was detected to
  // have spoken this word. Value ranges from 1 up to diarization_speaker_count,
  // and is only set if speaker diarization is enabled.
  int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;

  // Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains list of the corner points in clockwise order starting from
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's clockwise rotated 180 degrees around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be less
// than 0, or greater than 1 due to trigonometric calculations for location of
// the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}

// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text. It is calculated as the highest
  // confidence over all frames where OCR detected text appears.
  float confidence = 2;

  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}

// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
  // The detected text.
  string text = 1;

  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;

  // Feature version.
  string version = 3;
}

// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // The timestamp of the frame in microseconds.
  google.protobuf.Duration time_offset = 2;
}

// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Different representation of tracking info in non-streaming batch
  // and streaming modes.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a unique identifiable integer track_id so that
    // the customers can correlate the results of the ongoing
    // ObjectTrackAnnotation of the same track_id over time.
    int64 track_id = 5;
  }

  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;

  // Object category's labeling confidence of this track.
  float confidence = 4;

  // Information corresponding to all frames where this object track appears.
  // Non-streaming batch mode: it may be one or multiple ObjectTrackingFrame
  // messages in frames.
  // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
  repeated ObjectTrackingFrame frames = 2;

  // Feature version.
  string version = 6;
}

// Annotation corresponding to one detected, tracked and recognized logo class.
message LogoRecognitionAnnotation {
  // Entity category information to specify the logo class that all the logo
  // tracks within this LogoRecognitionAnnotation are recognized as.
  Entity entity = 1;

  // All logo tracks where the recognized logo appears. Each track corresponds
  // to one logo instance appearing in consecutive frames.
  repeated Track tracks = 2;

  // All video segments where the recognized logo appears. There might be
  // multiple instances of the same logo class appearing in one VideoSegment.
  repeated VideoSegment segments = 3;
}