// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.videointelligence.v1p1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P1Beta1";
option go_package = "cloud.google.com/go/videointelligence/apiv1p1beta1/videointelligencepb;videointelligencepb";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p1beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p1beta1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1p1beta1";

// Service that implements the Google Cloud Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p1beta1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}
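
// Example (illustrative, not part of the API definition): invoking
// `AnnotateVideo` through the REST binding above. The bucket and object
// names are hypothetical; the request body uses the standard proto3 JSON
// mapping (lowerCamelCase field names, enum values as strings).
//
//   curl -X POST \
//     -H "Authorization: Bearer $(gcloud auth print-access-token)" \
//     -H "Content-Type: application/json" \
//     https://videointelligence.googleapis.com/v1p1beta1/videos:annotate \
//     -d '{
//       "inputUri": "gs://my-bucket/my-video.mp4",
//       "features": ["LABEL_DETECTION", "SHOT_CHANGE_DETECTION"]
//     }'
//
// The call returns a `google.longrunning.Operation`, which can be polled
// until `done` is true.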

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints). A video URI
  // may include wildcards in `object-id`, and thus identify multiple videos.
  // Supported wildcards: `*` to match 0 or more characters;
  // `?` to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` should be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via `input_uri`.
  // If set, `input_uri` should be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
  // is specified, a region will be determined based on video file location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
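
// Example (illustrative): embedding the video bytes directly instead of
// referencing Cloud Storage. In the JSON mapping, `bytes` fields are
// base64-encoded; the placeholder below stands in for real data.
//
//   {
//     "inputContent": "<base64-encoded video bytes>",
//     "features": ["EXPLICIT_CONTENT_DETECTION"]
//   }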

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;
}
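
// Example (illustrative): a `VideoContext` in proto3 JSON form, restricting
// annotation to the first 30 seconds of the video. `google.protobuf.Duration`
// values are encoded as decimal seconds with an "s" suffix.
//
//   "videoContext": {
//     "segments": [
//       { "startTimeOffset": "0s", "endTimeOffset": "30s" }
//     ],
//     "labelDetectionConfig": { "labelDetectionMode": "SHOT_MODE" }
//   }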

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e. non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;
}

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g. `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Common categories for the detected entity. For example, when the label is
  // `Terrier`, the category is likely `dog`. In some cases there might be more
  // than one category, e.g. `Terrier` could also be a `pet`.
  Entity entity = 1;

  // Common categories for the detected entity.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}
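
// Example (illustrative): a `LabelAnnotation` as it might appear in the JSON
// response. The entity IDs and confidence value are hypothetical.
//
//   {
//     "entity": { "entityId": "/m/0bt9lr", "description": "dog",
//                 "languageCode": "en-US" },
//     "categoryEntities": [
//       { "entityId": "/m/068hy", "description": "pet",
//         "languageCode": "en-US" }
//     ],
//     "segments": [
//       { "segment": { "startTimeOffset": "0s", "endTimeOffset": "14.500s" },
//         "confidence": 0.92 }
//     ]
//   }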

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of pornographic content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}
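
// Example (illustrative): an `ExplicitContentAnnotation` in JSON form.
// `Likelihood` values are serialized as enum value names; the time offsets
// below are hypothetical.
//
//   "explicitAnnotation": {
//     "frames": [
//       { "timeOffset": "1.024s", "pornographyLikelihood": "VERY_UNLIKELY" },
//       { "timeOffset": "2.048s", "pornographyLikelihood": "UNLIKELY" }
//     ]
//   }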

// Annotation results for a single video.
message VideoAnnotationResults {
  // Output only. Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // Speech transcriptions.
  repeated SpeechTranscription speech_transcriptions = 11;

  // Output only. If set, indicates an error. Note that for a single
  // `AnnotateVideoRequest` some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Output only. Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Output only. Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Output only. Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Output only. Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}
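
// Example (illustrative): a long-running `Operation` while annotation is in
// flight. The operation name is a placeholder; `metadata` carries
// `AnnotateVideoProgress`, and `Timestamp` values use RFC 3339 strings.
//
//   {
//     "name": "<operation-name>",
//     "metadata": {
//       "@type": "type.googleapis.com/google.cloud.videointelligence.v1p1beta1.AnnotateVideoProgress",
//       "annotationProgress": [
//         { "inputUri": "gs://my-bucket/my-video.mp4",
//           "progressPercent": 75,
//           "startTime": "2019-01-01T12:00:00Z",
//           "updateTime": "2019-01-01T12:01:30Z" }
//       ]
//     },
//     "done": false
//   }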

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // Required. The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1`, or
  // omitting the field, will return a maximum of one.
  int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If `true`, adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default `false` value
  // does not add punctuation to result hypotheses. NOTE: this is currently
  // offered as an experimental service, complimentary to all users. In the
  // future it may be exclusively available as a premium feature.
  bool enable_automatic_punctuation = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. For file formats, such as MXF or MKV, that support multiple audio
  // tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];
}
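
// Example (illustrative): a speech transcription request fragment in JSON
// form. The phrase hints are hypothetical.
//
//   "videoContext": {
//     "speechTranscriptionConfig": {
//       "languageCode": "en-US",
//       "maxAlternatives": 2,
//       "enableAutomaticPunctuation": true,
//       "speechContexts": [
//         { "phrases": ["Cloud Video Intelligence", "v1p1beta1"] }
//       ]
//     }
//   }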

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // Optional. A list of strings containing word and phrase "hints" so that the
  // speech recognizer is more likely to recognize them. This can be used to
  // improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum specified
  // in `max_alternatives`). These alternatives are ordered in terms of
  // accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Output only. Transcript text representing the words that the user spoke.
  string transcript = 1;

  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A list of word-specific information for each recognized word.
  repeated WordInfo words = 3;
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Output only. Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Output only. Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // Output only. The word corresponding to this set of information.
  string word = 3;
}
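
// Example (illustrative): a `SpeechTranscription` in the JSON response. The
// transcript, confidence, and timings are hypothetical; per the field
// comments above, `words` is only populated when word time offsets are
// enabled.
//
//   {
//     "alternatives": [
//       {
//         "transcript": "welcome to the demo",
//         "confidence": 0.87,
//         "words": [
//           { "startTime": "0s", "endTime": "0.400s", "word": "welcome" },
//           { "startTime": "0.400s", "endTime": "0.600s", "word": "to" }
//         ]
//       }
//     ]
//   }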

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}