// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.videointelligence.v1p2beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P2Beta1";
option go_package = "cloud.google.com/go/videointelligence/apiv1p2beta1/videointelligencepb;videointelligencepb";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p2beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p2beta1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1p2beta1";

// Service that implements Google Cloud Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p2beta1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
  // [Request URIs](https://cloud.google.com/storage/docs/request-endpoints).
  // A video URI may include wildcards in `object-id`, and thus identify
  // multiple videos. Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` should be unset.
  string input_uri = 1;

  // The video data bytes.
  // If unset, the input video(s) should be specified via `input_uri`.
  // If set, `input_uri` should be unset.
  bytes input_content = 6;

  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];

  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;

  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported, which must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
  // [Request URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
  // is specified, a region will be determined based on video file location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
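
// For illustration only: a minimal `AnnotateVideoRequest` sketched in the
// standard proto3 JSON mapping. The URI, feature list, output location, and
// region below are hypothetical placeholder values, not required settings.
//
//     {
//       "inputUri": "gs://bucket-id/object-id",
//       "features": ["LABEL_DETECTION", "SHOT_CHANGE_DETECTION"],
//       "outputUri": "gs://bucket-id/annotation-output.json",
//       "locationId": "us-east1"
//     }
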
// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;

  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;

  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;

  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;

  // Config for TEXT_DETECTION.
  TextDetectionConfig text_detection_config = 8;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;

  // Whether the video has been shot from a stationary (i.e., non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;

  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;
}

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // A language hint can be specified if the language to be detected is known
  // a priori. It can increase the accuracy of the detection. The language
  // hint must be a language code in BCP-47 format.
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;
}
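
// For illustration only: a hypothetical `video_context` in the proto3 JSON
// mapping, restricting annotation to a single segment and requesting both
// shot-level and frame-level labels for a video shot from a fixed camera.
// All values are placeholders.
//
//     {
//       "segments": [
//         { "startTimeOffset": "0s", "endTimeOffset": "30s" }
//       ],
//       "labelDetectionConfig": {
//         "labelDetectionMode": "SHOT_AND_FRAME_MODE",
//         "stationaryCamera": true
//       }
//     }
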
// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;

  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;

  // Textual description, e.g. `Fixed-gear bicycle`.
  string description = 2;

  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;

  // Common categories for the detected entity.
  // E.g., when the label is `Terrier`, the category is likely `dog`. In some
  // cases there might be more than one category; e.g., `Terrier` could also
  // be a `pet`.
  repeated Entity category_entities = 2;

  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;

  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;

  // Likelihood of pornographic content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;

  // Top Y coordinate.
  float top = 2;

  // Right X coordinate.
  float right = 3;

  // Bottom Y coordinate.
  float bottom = 4;
}
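
// For illustration only (the exact normalization step is an assumption here,
// not spelled out above): normalized coordinates correspond to pixel
// positions divided by the frame width and height. E.g., a box spanning
// pixels (left=192, top=108, right=960, bottom=540) in a 1920x1080 frame
// would be reported as left=0.1, top=0.1, right=0.5, bottom=0.5.
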
// Annotation results for a single video.
message VideoAnnotationResults {
  // Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Label annotations on video level or user specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;

  // Label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;

  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;

  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;

  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;

  // OCR text detection and tracking.
  // Annotations for the list of detected text snippets. Each snippet has a
  // list of frame information associated with it.
  repeated TextAnnotation text_annotations = 12;

  // Annotations for the list of objects detected and tracked in the video.
  repeated ObjectTrackingAnnotation object_annotations = 14;

  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Google Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;

  // Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;

  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;

  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}
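
// For illustration only: a hypothetical `Operation.metadata` snapshot in the
// proto3 JSON mapping, observed via `GetOperation` while one video is still
// being processed. URI, percentage, and timestamps are placeholder values.
//
//     {
//       "@type": "type.googleapis.com/google.cloud.videointelligence.v1p2beta1.AnnotateVideoProgress",
//       "annotationProgress": [
//         {
//           "inputUri": "gs://bucket-id/object-id",
//           "progressPercent": 42,
//           "startTime": "2019-01-01T12:00:00Z",
//           "updateTime": "2019-01-01T12:01:30Z"
//         }
//       ]
//     }
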
// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;

  // Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains a list of the corner points in clockwise order starting from the
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's clockwise rotated 180 degrees around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be
// less than 0 or greater than 1 due to trigonometric calculations for the
// location of the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}

// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text. It is calculated as the
  // highest confidence over all frames where OCR detected text appears.
  float confidence = 2;

  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}

// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
  // The detected text.
  string text = 1;

  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;
}

// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // The timestamp of the frame in microseconds.
  google.protobuf.Duration time_offset = 2;
}

// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Different representation of tracking info in non-streaming batch
  // and streaming modes.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a unique identifiable integer track_id so that
    // customers can correlate the results of the ongoing
    // ObjectTrackingAnnotation with the same track_id over time.
    int64 track_id = 5;
  }

  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;

  // Object category's labeling confidence of this track.
  float confidence = 4;

  // Information corresponding to all frames where this object track appears.
  repeated ObjectTrackingFrame frames = 2;
}
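
// For illustration only: a hypothetical `ObjectTrackingAnnotation` from
// non-streaming batch mode, in the proto3 JSON mapping, with one tracked
// frame. The entity, confidence, times, and coordinates are all placeholder
// values.
//
//     {
//       "entity": { "entityId": "/m/01yrx", "description": "cat", "languageCode": "en-US" },
//       "confidence": 0.92,
//       "segment": { "startTimeOffset": "2s", "endTimeOffset": "5s" },
//       "frames": [
//         {
//           "timeOffset": "2s",
//           "normalizedBoundingBox": { "left": 0.1, "top": 0.2, "right": 0.4, "bottom": 0.6 }
//         }
//       ]
//     }
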
// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;

  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;

  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;

  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;

  // OCR text detection and tracking.
  TEXT_DETECTION = 7;

  // Object detection and tracking.
  OBJECT_TRACKING = 9;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;

  // Detect shot-level labels.
  SHOT_MODE = 1;

  // Detect frame-level labels.
  FRAME_MODE = 2;

  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Very unlikely.
  VERY_UNLIKELY = 1;

  // Unlikely.
  UNLIKELY = 2;

  // Possible.
  POSSIBLE = 3;

  // Likely.
  LIKELY = 4;

  // Very likely.
  VERY_LIKELY = 5;
}
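
// For illustration only: how the `Likelihood` values above might appear in an
// `ExplicitContentAnnotation`, shown in the proto3 JSON mapping with
// placeholder time offsets.
//
//     {
//       "frames": [
//         { "timeOffset": "1.5s", "pornographyLikelihood": "VERY_UNLIKELY" },
//         { "timeOffset": "2.5s", "pornographyLikelihood": "UNLIKELY" }
//       ]
//     }
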