// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.vision.v1p1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/cloud/vision/v1p1beta1/geometry.proto";
import "google/cloud/vision/v1p1beta1/text_annotation.proto";
import "google/cloud/vision/v1p1beta1/web_detection.proto";
import "google/rpc/status.proto";
import "google/type/color.proto";
import "google/type/latlng.proto";

option cc_enable_arenas = true;
option go_package = "cloud.google.com/go/vision/v2/apiv1p1beta1/visionpb;visionpb";
option java_multiple_files = true;
option java_outer_classname = "ImageAnnotatorProto";
option java_package = "com.google.cloud.vision.v1p1beta1";

// Service that performs Google Cloud Vision API detection tasks over client
// images, such as face, landmark, logo, label, and text detection. The
// ImageAnnotator service returns detected entities from the images.
service ImageAnnotator {
  option (google.api.default_host) = "vision.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform,"
      "https://www.googleapis.com/auth/cloud-vision";

  // Run image detection and annotation for a batch of images.
  rpc BatchAnnotateImages(BatchAnnotateImagesRequest)
      returns (BatchAnnotateImagesResponse) {
    option (google.api.http) = {
      post: "/v1p1beta1/images:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "requests";
  }
}
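// As a rough illustration (not part of the API definition), the HTTP binding
// above accepts a JSON body whose shape follows `BatchAnnotateImagesRequest`
// under the proto3 JSON mapping; the bucket, object, and feature choices
// below are invented examples:
//
//     POST https://vision.googleapis.com/v1p1beta1/images:annotate
//     {
//       "requests": [
//         {
//           "image": { "source": { "imageUri": "gs://my-bucket/photo.jpg" } },
//           "features": [
//             { "type": "LABEL_DETECTION", "maxResults": 5 },
//             { "type": "TEXT_DETECTION" }
//           ]
//         }
//       ]
//     }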
// Users describe the type of Google Cloud Vision API tasks to perform over
// images by using *Feature*s. Each Feature indicates a type of image
// detection task to perform. Features encode the Cloud Vision API
// vertical to operate on and the number of top-scoring results to return.
message Feature {
  // Type of image feature.
  enum Type {
    // Unspecified feature type.
    TYPE_UNSPECIFIED = 0;

    // Run face detection.
    FACE_DETECTION = 1;

    // Run landmark detection.
    LANDMARK_DETECTION = 2;

    // Run logo detection.
    LOGO_DETECTION = 3;

    // Run label detection.
    LABEL_DETECTION = 4;

    // Run OCR.
    TEXT_DETECTION = 5;

    // Run dense text document OCR. Takes precedence when both
    // DOCUMENT_TEXT_DETECTION and TEXT_DETECTION are present.
    DOCUMENT_TEXT_DETECTION = 11;

    // Run computer vision models to compute image safe-search properties.
    SAFE_SEARCH_DETECTION = 6;

    // Compute a set of image properties, such as the image's dominant colors.
    IMAGE_PROPERTIES = 7;

    // Run crop hints.
    CROP_HINTS = 9;

    // Run web detection.
    WEB_DETECTION = 10;
  }

  // The feature type.
  Type type = 1;

  // Maximum number of results of this type.
  int32 max_results = 2;

  // Model to use for the feature.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest". `DOCUMENT_TEXT_DETECTION` and `TEXT_DETECTION` also
  // support "builtin/weekly" for the bleeding edge release updated weekly.
  string model = 3;
}

// External image source (Google Cloud Storage image location).
message ImageSource {
  // NOTE: For new code `image_uri` below is preferred.
  // Google Cloud Storage image URI, which must be in the following form:
  // `gs://bucket_name/object_name` (for details, see
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris)).
  // NOTE: Cloud Storage object versioning is not supported.
  string gcs_image_uri = 1;

  // Image URI which supports:
  // 1) Google Cloud Storage image URI, which must be in the following form:
  //    `gs://bucket_name/object_name` (for details, see
  //    [Google Cloud Storage Request
  //    URIs](https://cloud.google.com/storage/docs/reference-uris)).
  //    NOTE: Cloud Storage object versioning is not supported.
  // 2) Publicly accessible image HTTP/HTTPS URL.
  // This is preferred over the legacy `gcs_image_uri` above. When both
  // `gcs_image_uri` and `image_uri` are specified, `image_uri` takes
  // precedence.
  string image_uri = 2;
}

// Client image to perform Google Cloud Vision API tasks over.
message Image {
  // Image content, represented as a stream of bytes.
  // Note: as with all `bytes` fields, protocol buffers use a pure binary
  // representation, whereas JSON representations use base64.
  bytes content = 1;

  // Google Cloud Storage image location. If both `content` and `source`
  // are provided for an image, `content` takes precedence and is
  // used to perform the image annotation request.
  ImageSource source = 2;
}
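// Illustrative only: in the proto3 JSON mapping, `content` carries
// base64-encoded bytes, so the same image can be sent inline or by reference
// (the URI below is a hypothetical example):
//
//     { "image": { "content": "/9j/4AAQSkZJRg...base64..." } }
//     { "image": { "source": { "imageUri": "https://example.com/photo.jpg" } } }
//
// If both were set, `content` would take precedence, per the comment above.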
// A face annotation object contains the results of face detection.
message FaceAnnotation {
  // A face-specific landmark (for example, a face feature).
  message Landmark {
    // Face landmark (feature) type.
    // Left and right are defined from the vantage of the viewer of the image
    // without considering mirror projections typical of photos. So, `LEFT_EYE`,
    // typically, is the person's right eye.
    enum Type {
      // Unknown face landmark detected. Should not be filled.
      UNKNOWN_LANDMARK = 0;

      // Left eye.
      LEFT_EYE = 1;

      // Right eye.
      RIGHT_EYE = 2;

      // Left of left eyebrow.
      LEFT_OF_LEFT_EYEBROW = 3;

      // Right of left eyebrow.
      RIGHT_OF_LEFT_EYEBROW = 4;

      // Left of right eyebrow.
      LEFT_OF_RIGHT_EYEBROW = 5;

      // Right of right eyebrow.
      RIGHT_OF_RIGHT_EYEBROW = 6;

      // Midpoint between eyes.
      MIDPOINT_BETWEEN_EYES = 7;

      // Nose tip.
      NOSE_TIP = 8;

      // Upper lip.
      UPPER_LIP = 9;

      // Lower lip.
      LOWER_LIP = 10;

      // Mouth left.
      MOUTH_LEFT = 11;

      // Mouth right.
      MOUTH_RIGHT = 12;

      // Mouth center.
      MOUTH_CENTER = 13;

      // Nose, bottom right.
      NOSE_BOTTOM_RIGHT = 14;

      // Nose, bottom left.
      NOSE_BOTTOM_LEFT = 15;

      // Nose, bottom center.
      NOSE_BOTTOM_CENTER = 16;

      // Left eye, top boundary.
      LEFT_EYE_TOP_BOUNDARY = 17;

      // Left eye, right corner.
      LEFT_EYE_RIGHT_CORNER = 18;

      // Left eye, bottom boundary.
      LEFT_EYE_BOTTOM_BOUNDARY = 19;

      // Left eye, left corner.
      LEFT_EYE_LEFT_CORNER = 20;

      // Right eye, top boundary.
      RIGHT_EYE_TOP_BOUNDARY = 21;

      // Right eye, right corner.
      RIGHT_EYE_RIGHT_CORNER = 22;

      // Right eye, bottom boundary.
      RIGHT_EYE_BOTTOM_BOUNDARY = 23;

      // Right eye, left corner.
      RIGHT_EYE_LEFT_CORNER = 24;

      // Left eyebrow, upper midpoint.
      LEFT_EYEBROW_UPPER_MIDPOINT = 25;

      // Right eyebrow, upper midpoint.
      RIGHT_EYEBROW_UPPER_MIDPOINT = 26;

      // Left ear tragion.
      LEFT_EAR_TRAGION = 27;

      // Right ear tragion.
      RIGHT_EAR_TRAGION = 28;

      // Left eye pupil.
      LEFT_EYE_PUPIL = 29;

      // Right eye pupil.
      RIGHT_EYE_PUPIL = 30;

      // Forehead glabella.
      FOREHEAD_GLABELLA = 31;

      // Chin gnathion.
      CHIN_GNATHION = 32;

      // Chin left gonion.
      CHIN_LEFT_GONION = 33;

      // Chin right gonion.
      CHIN_RIGHT_GONION = 34;
    }

    // Face landmark type.
    Type type = 3;

    // Face landmark position.
    Position position = 4;
  }

  // The bounding polygon around the face. The coordinates of the bounding box
  // are in the original image's scale, as returned in `ImageParams`.
  // The bounding box is computed to "frame" the face in accordance with human
  // expectations. It is based on the landmarker results.
  // Note that one or more x and/or y coordinates may not be generated in the
  // `BoundingPoly` (the polygon will be unbounded) if only a partial face
  // appears in the image to be annotated.
  BoundingPoly bounding_poly = 1;

  // The `fd_bounding_poly` bounding polygon is tighter than the
  // `boundingPoly`, and encloses only the skin part of the face. Typically, it
  // is used to eliminate the face from any image analysis that detects the
  // "amount of skin" visible in an image. It is not based on the
  // landmarker results, only on the initial face detection, hence
  // the <code>fd</code> (face detection) prefix.
  BoundingPoly fd_bounding_poly = 2;

  // Detected face landmarks.
  repeated Landmark landmarks = 3;

  // Roll angle, which indicates the amount of clockwise/anti-clockwise
  // rotation of the face relative to the image vertical about the axis
  // perpendicular to the face. Range [-180,180].
  float roll_angle = 4;

  // Yaw angle, which indicates the leftward/rightward angle that the face is
  // pointing relative to the vertical plane perpendicular to the image. Range
  // [-180,180].
  float pan_angle = 5;

  // Pitch angle, which indicates the upwards/downwards angle that the face is
  // pointing relative to the image's horizontal plane. Range [-180,180].
  float tilt_angle = 6;

  // Detection confidence. Range [0, 1].
  float detection_confidence = 7;

  // Face landmarking confidence. Range [0, 1].
  float landmarking_confidence = 8;

  // Joy likelihood.
  Likelihood joy_likelihood = 9;

  // Sorrow likelihood.
  Likelihood sorrow_likelihood = 10;

  // Anger likelihood.
  Likelihood anger_likelihood = 11;

  // Surprise likelihood.
  Likelihood surprise_likelihood = 12;

  // Under-exposed likelihood.
  Likelihood under_exposed_likelihood = 13;

  // Blurred likelihood.
  Likelihood blurred_likelihood = 14;

  // Headwear likelihood.
  Likelihood headwear_likelihood = 15;
}

// Detected entity location information.
message LocationInfo {
  // lat/long location coordinates.
  google.type.LatLng lat_lng = 1;
}

// A `Property` consists of a user-supplied name/value pair.
message Property {
  // Name of the property.
  string name = 1;

  // Value of the property.
  string value = 2;

  // Value of numeric properties.
  uint64 uint64_value = 3;
}
// Set of detected entity features.
message EntityAnnotation {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string mid = 1;

  // The language code for the locale in which the entity textual
  // `description` is expressed.
  string locale = 2;

  // Entity textual description, expressed in its `locale` language.
  string description = 3;

  // Overall score of the result. Range [0, 1].
  float score = 4;

  // The accuracy of the entity detection in an image.
  // For example, for an image in which the "Eiffel Tower" entity is detected,
  // this field represents the confidence that there is a tower in the query
  // image. Range [0, 1].
  float confidence = 5;

  // The relevancy of the ICA (Image Content Annotation) label to the
  // image. For example, the relevancy of "tower" is likely higher to an image
  // containing the detected "Eiffel Tower" than to an image containing a
  // detected distant towering building, even though the confidence that
  // there is a tower in each image may be the same. Range [0, 1].
  float topicality = 6;

  // Image region to which this entity belongs. Not produced
  // for `LABEL_DETECTION` features.
  BoundingPoly bounding_poly = 7;

  // The location information for the detected entity. Multiple
  // `LocationInfo` elements can be present because one location may
  // indicate the location of the scene in the image, and another location
  // may indicate the location of the place where the image was taken.
  // Location information is usually present for landmarks.
  repeated LocationInfo locations = 8;

  // Some entities may have optional user-supplied `Property` (name/value)
  // fields, such as a score or string that qualifies the entity.
  repeated Property properties = 9;
}

// Set of features pertaining to the image, computed by computer vision
// methods over safe-search verticals (for example, adult, spoof, medical,
// violence).
message SafeSearchAnnotation {
  // Represents the adult content likelihood for the image. Adult content may
  // contain elements such as nudity, pornographic images or cartoons, or
  // sexual activities.
  Likelihood adult = 1;

  // Spoof likelihood. The likelihood that a modification
  // was made to the image's canonical version to make it appear
  // funny or offensive.
  Likelihood spoof = 2;

  // Likelihood that this is a medical image.
  Likelihood medical = 3;

  // Likelihood that this image contains violent content.
  Likelihood violence = 4;

  // Likelihood that the request image contains racy content. Racy content may
  // include (but is not limited to) skimpy or sheer clothing, strategically
  // covered nudity, lewd or provocative poses, or close-ups of sensitive
  // body areas.
  Likelihood racy = 9;
}

// Rectangle determined by min and max `LatLng` pairs.
message LatLongRect {
  // Min lat/long pair.
  google.type.LatLng min_lat_lng = 1;

  // Max lat/long pair.
  google.type.LatLng max_lat_lng = 2;
}
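// A purely illustrative landmark result in the proto3 JSON mapping (the mid
// and all values are invented), showing how `score`, `confidence`, and
// `topicality` appear side by side on one entity:
//
//     {
//       "mid": "/m/0123x",
//       "description": "Eiffel Tower",
//       "score": 0.97,
//       "confidence": 0.95,
//       "topicality": 0.97,
//       "locations": [
//         { "latLng": { "latitude": 48.8584, "longitude": 2.2945 } }
//       ]
//     }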
// Color information consists of RGB channels, score, and the fraction of
// the image that the color occupies in the image.
message ColorInfo {
  // RGB components of the color.
  google.type.Color color = 1;

  // Image-specific score for this color. Value in range [0, 1].
  float score = 2;

  // The fraction of pixels the color occupies in the image.
  // Value in range [0, 1].
  float pixel_fraction = 3;
}

// Set of dominant colors and their corresponding scores.
message DominantColorsAnnotation {
  // RGB color values with their score and pixel fraction.
  repeated ColorInfo colors = 1;
}

// Stores image properties, such as dominant colors.
message ImageProperties {
  // If present, dominant colors completed successfully.
  DominantColorsAnnotation dominant_colors = 1;
}

// Single crop hint that is used to generate a new crop when serving an image.
message CropHint {
  // The bounding polygon for the crop region. The coordinates of the bounding
  // box are in the original image's scale, as returned in `ImageParams`.
  BoundingPoly bounding_poly = 1;

  // Confidence of this being a salient region. Range [0, 1].
  float confidence = 2;

  // Fraction of importance of this salient region with respect to the original
  // image.
  float importance_fraction = 3;
}

// Set of crop hints that are used to generate new crops when serving images.
message CropHintsAnnotation {
  // Crop hint results.
  repeated CropHint crop_hints = 1;
}

// Parameters for crop hints annotation request.
message CropHintsParams {
  // Aspect ratios in floats, representing the ratio of the width to the height
  // of the image. For example, if the desired aspect ratio is 4/3, the
  // corresponding float value should be 1.33333. If not specified, the
  // best possible crop is returned. The number of provided aspect ratios is
  // limited to a maximum of 16; any aspect ratios provided after the 16th are
  // ignored.
  repeated float aspect_ratios = 1;
}
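// Illustrative only: requesting crop hints for 4:3 and 16:9 output, with each
// ratio expressed as width/height in floats (1.33333 and 1.77778), set via the
// `ImageContext` defined further below:
//
//     "imageContext": {
//       "cropHintsParams": { "aspectRatios": [1.33333, 1.77778] }
//     }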
// Parameters for web detection request.
message WebDetectionParams {
  // Whether to include results derived from the geo information in the image.
  bool include_geo_results = 2;
}

// Parameters for text detections. This is used to control TEXT_DETECTION and
// DOCUMENT_TEXT_DETECTION features.
message TextDetectionParams {
  // By default, the Cloud Vision API only includes a confidence score for
  // DOCUMENT_TEXT_DETECTION results. Set the flag to true to include a
  // confidence score for TEXT_DETECTION as well.
  bool enable_text_detection_confidence_score = 9;

  // A list of advanced OCR options to fine-tune OCR behavior.
  repeated string advanced_ocr_options = 11;
}

// Image context and/or feature-specific parameters.
message ImageContext {
  // Lat/long rectangle that specifies the location of the image.
  LatLongRect lat_long_rect = 1;

  // List of languages to use for TEXT_DETECTION. In most cases, an empty value
  // yields the best results since it enables automatic language detection. For
  // languages based on the Latin alphabet, setting `language_hints` is not
  // needed. In rare cases, when the language of the text in the image is known,
  // setting a hint will help get better results (although it will be a
  // significant hindrance if the hint is wrong). Text detection returns an
  // error if one or more of the specified languages is not one of the
  // [supported languages](https://cloud.google.com/vision/docs/languages).
  repeated string language_hints = 2;

  // Parameters for crop hints annotation request.
  CropHintsParams crop_hints_params = 4;

  // Parameters for web detection.
  WebDetectionParams web_detection_params = 6;

  // Parameters for text detection and document text detection.
  TextDetectionParams text_detection_params = 12;
}
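// Illustrative only: hinting that the text in the image is Italian ("it" is
// an invented example of a language code; per the comment above, hints should
// be used sparingly, since a wrong hint hurts results):
//
//     "imageContext": { "languageHints": ["it"] }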
// Request for performing Google Cloud Vision API tasks over a user-provided
// image, with user-requested features.
message AnnotateImageRequest {
  // The image to be processed.
  Image image = 1;

  // Requested features.
  repeated Feature features = 2;

  // Additional context that may accompany the image.
  ImageContext image_context = 3;
}

// Response to an image annotation request.
message AnnotateImageResponse {
  // If present, face detection has completed successfully.
  repeated FaceAnnotation face_annotations = 1;

  // If present, landmark detection has completed successfully.
  repeated EntityAnnotation landmark_annotations = 2;

  // If present, logo detection has completed successfully.
  repeated EntityAnnotation logo_annotations = 3;

  // If present, label detection has completed successfully.
  repeated EntityAnnotation label_annotations = 4;

  // If present, text (OCR) detection has completed successfully.
  repeated EntityAnnotation text_annotations = 5;

  // If present, text (OCR) detection or document (OCR) text detection has
  // completed successfully.
  // This annotation provides the structural hierarchy for the OCR detected
  // text.
  TextAnnotation full_text_annotation = 12;

  // If present, safe-search annotation has completed successfully.
  SafeSearchAnnotation safe_search_annotation = 6;

  // If present, image properties were extracted successfully.
  ImageProperties image_properties_annotation = 8;

  // If present, crop hints have completed successfully.
  CropHintsAnnotation crop_hints_annotation = 11;

  // If present, web detection has completed successfully.
  WebDetection web_detection = 13;

  // If set, represents the error message for the operation.
  // Note that filled-in image annotations are guaranteed to be
  // correct, even when `error` is set.
  google.rpc.Status error = 9;
}

// Multiple image annotation requests are batched into a single service call.
message BatchAnnotateImagesRequest {
  // Required. Individual image annotation requests for this batch.
  repeated AnnotateImageRequest requests = 1
      [(google.api.field_behavior) = REQUIRED];
}

// Response to a batch image annotation request.
message BatchAnnotateImagesResponse {
  // Individual responses to image annotation requests within the batch.
  repeated AnnotateImageResponse responses = 1;
}

// A bucketized representation of likelihood, which is intended to give clients
// highly stable results across model upgrades.
enum Likelihood {
  // Unknown likelihood.
  UNKNOWN = 0;

  // It is very unlikely that the image belongs to the specified vertical.
  VERY_UNLIKELY = 1;

  // It is unlikely that the image belongs to the specified vertical.
  UNLIKELY = 2;

  // It is possible that the image belongs to the specified vertical.
  POSSIBLE = 3;

  // It is likely that the image belongs to the specified vertical.
  LIKELY = 4;

  // It is very likely that the image belongs to the specified vertical.
  VERY_LIKELY = 5;
}
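// Illustrative only: a batch response where the first image annotated
// successfully and the second failed with a per-image `error` (the code
// follows google.rpc.Code, 3 = INVALID_ARGUMENT; all values are invented):
//
//     {
//       "responses": [
//         { "labelAnnotations": [ { "description": "Dog", "score": 0.98 } ] },
//         { "error": { "code": 3, "message": "Bad image data." } }
//       ]
//     }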