// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.videointelligence.v1p1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P1Beta1";
option go_package = "cloud.google.com/go/videointelligence/apiv1p1beta1/videointelligencepb;videointelligencepb";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p1beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p1beta1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1p1beta1";

// Service that implements Google Cloud Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p1beta1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints). A video URI
  // may include wildcards in `object-id`, and thus identify multiple videos.
  // Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` should be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via `input_uri`.
  // If set, `input_uri` should be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
  // is specified, a region will be determined based on video file location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
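
// Illustrative sketch only (not part of the generated API surface): an
// `AnnotateVideoRequest` in proto text format, asking for label and
// shot-change detection on a hypothetical Cloud Storage object. The bucket,
// object, and output names are made up; `us-east1` is one of the supported
// regions listed above.
//
//   input_uri: "gs://example-bucket/example-video.mp4"
//   features: LABEL_DETECTION
//   features: SHOT_CHANGE_DETECTION
//   output_uri: "gs://example-bucket/annotations.json"
//   location_id: "us-east1"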

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e. non-moving) camera.
  // When set to true, might improve detection accuracy for moving objects.
  // Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;
}

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}
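
// Illustrative sketch only: a `VideoContext` in proto text format that limits
// annotation to the first two minutes of the video and tunes label detection.
// The time offsets are hypothetical; "builtin/stable" is the documented
// default model.
//
//   segments {
//     start_time_offset { seconds: 0 }
//     end_time_offset { seconds: 120 }
//   }
//   label_detection_config {
//     label_detection_mode: SHOT_AND_FRAME_MODE
//     stationary_camera: true
//     model: "builtin/stable"
//   }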

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g. `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // E.g. when the label is `Terrier`, the category is likely `dog`. In some
  // cases there might be more than one category, e.g. `Terrier` could also be
  // a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of pornographic content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Output only. Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Label annotations on video level or user specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;

  // Output only. If set, indicates an error. Note that for a single
  // `AnnotateVideoRequest` some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}
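
// Illustrative sketch only: one `LabelAnnotation` element as it might appear
// in `segment_label_annotations` above, in proto text format. The entity ID,
// descriptions, offsets, and confidence are placeholder values.
//
//   entity {
//     entity_id: "/m/0bt9lr"
//     description: "dog"
//     language_code: "en-US"
//   }
//   category_entities { description: "animal" language_code: "en-US" }
//   segments {
//     segment {
//       start_time_offset { seconds: 0 }
//       end_time_offset { seconds: 120 }
//     }
//     confidence: 0.92
//   }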

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Output only. Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Output only. Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Output only. Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Output only. Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // Required. The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
  // return a maximum of one. If omitted, will return a maximum of one.
  int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If 'true', adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default 'false' value
  // does not add punctuation to result hypotheses. NOTE: "This is currently
  // offered as an experimental service, complimentary to all users. In the
  // future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. For file formats, such as MXF or MKV, that support multiple
  // audio tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];
}
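
// Illustrative sketch only: a `SpeechTranscriptionConfig` in proto text format
// for English audio with a phrase hint and automatic punctuation enabled. The
// phrase and numeric values are hypothetical.
//
//   language_code: "en-US"
//   max_alternatives: 2
//   filter_profanity: true
//   speech_contexts { phrases: "Cloud Video Intelligence" }
//   enable_automatic_punctuation: true
//   audio_tracks: 0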

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // Optional. A list of strings containing word and phrase "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum specified
  // in `max_alternatives`). These alternatives are ordered in terms of
  // accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Output only. Transcript text representing the words that the user spoke.
  string transcript = 1;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A list of word-specific information for each recognized word.
  repeated WordInfo words = 3;
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Output only. Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Output only. Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // Output only. The word corresponding to this set of information.
  string word = 3;
}

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}