// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.vision.v1p1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/cloud/vision/v1p1beta1/geometry.proto";
import "google/cloud/vision/v1p1beta1/text_annotation.proto";
import "google/cloud/vision/v1p1beta1/web_detection.proto";
import "google/rpc/status.proto";
import "google/type/color.proto";
import "google/type/latlng.proto";

option cc_enable_arenas = true;
option go_package = "cloud.google.com/go/vision/v2/apiv1p1beta1/visionpb;visionpb";
option java_multiple_files = true;
option java_outer_classname = "ImageAnnotatorProto";
option java_package = "com.google.cloud.vision.v1p1beta1";

// Service that performs Google Cloud Vision API detection tasks over client
// images, such as face, landmark, logo, label, and text detection. The
// ImageAnnotator service returns detected entities from the images.
service ImageAnnotator {
  option (google.api.default_host) = "vision.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform,"
      "https://www.googleapis.com/auth/cloud-vision";

  // Run image detection and annotation for a batch of images.
  rpc BatchAnnotateImages(BatchAnnotateImagesRequest)
      returns (BatchAnnotateImagesResponse) {
    option (google.api.http) = {
      post: "/v1p1beta1/images:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "requests";
  }
}
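
// Illustrative only (not part of the API surface): with the HTTP binding
// above, a minimal BatchAnnotateImages request body in the proto3 JSON
// encoding (lowerCamelCase field names), POSTed to
// /v1p1beta1/images:annotate, could look like:
//
//   {
//     "requests": [{
//       "image": {"source": {"imageUri": "gs://bucket_name/object_name"}},
//       "features": [{"type": "LABEL_DETECTION", "maxResults": 5}]
//     }]
//   }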

// Users describe the type of Google Cloud Vision API tasks to perform over
// images by using *Feature*s. Each Feature indicates a type of image
// detection task to perform. Features encode the Cloud Vision API
// vertical to operate on and the number of top-scoring results to return.
message Feature {
  // Type of image feature.
  enum Type {
    // Unspecified feature type.
    TYPE_UNSPECIFIED = 0;

    // Run face detection.
    FACE_DETECTION = 1;

    // Run landmark detection.
    LANDMARK_DETECTION = 2;

    // Run logo detection.
    LOGO_DETECTION = 3;

    // Run label detection.
    LABEL_DETECTION = 4;

    // Run OCR.
    TEXT_DETECTION = 5;

    // Run dense text document OCR. Takes precedence when both
    // DOCUMENT_TEXT_DETECTION and TEXT_DETECTION are present.
    DOCUMENT_TEXT_DETECTION = 11;

    // Run computer vision models to compute image safe-search properties.
    SAFE_SEARCH_DETECTION = 6;

    // Compute a set of image properties, such as the image's dominant colors.
    IMAGE_PROPERTIES = 7;

    // Run crop hints.
    CROP_HINTS = 9;

    // Run web detection.
    WEB_DETECTION = 10;
  }

  // The feature type.
  Type type = 1;

  // Maximum number of results of this type.
  int32 max_results = 2;

  // Model to use for the feature.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest". `DOCUMENT_TEXT_DETECTION` and `TEXT_DETECTION` also
  // support "builtin/weekly" for the bleeding edge release updated weekly.
  string model = 3;
}
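
// Illustrative only: in the proto3 JSON encoding, a Feature requesting the
// ten best labels from the stable model would be written as:
//
//   {"type": "LABEL_DETECTION", "maxResults": 10, "model": "builtin/stable"}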

// External image source (Google Cloud Storage image location).
message ImageSource {
  // NOTE: For new code `image_uri` below is preferred.
  // Google Cloud Storage image URI, which must be in the following form:
  // `gs://bucket_name/object_name` (for details, see
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris)).
  // NOTE: Cloud Storage object versioning is not supported.
  string gcs_image_uri = 1;

  // Image URI which supports:
  // 1) Google Cloud Storage image URI, which must be in the following form:
  // `gs://bucket_name/object_name` (for details, see
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris)).
  // NOTE: Cloud Storage object versioning is not supported.
  // 2) Publicly accessible image HTTP/HTTPS URL.
  // This is preferred over the legacy `gcs_image_uri` above. When both
  // `gcs_image_uri` and `image_uri` are specified, `image_uri` takes
  // precedence.
  string image_uri = 2;
}
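
// Illustrative only: per the precedence rule above, this JSON ImageSource
// resolves to the HTTPS URL (a hypothetical one), and `gcsImageUri` is
// ignored:
//
//   {
//     "gcsImageUri": "gs://bucket_name/object_name",
//     "imageUri": "https://example.com/photo.jpg"
//   }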

// Client image to perform Google Cloud Vision API tasks over.
message Image {
  // Image content, represented as a stream of bytes.
  // Note: as with all `bytes` fields, protocol buffers use a pure binary
  // representation, whereas JSON representations use base64.
  bytes content = 1;

  // Google Cloud Storage image location. If both `content` and `source`
  // are provided for an image, `content` takes precedence and is
  // used to perform the image annotation request.
  ImageSource source = 2;
}
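
// Illustrative only: because `content` is a `bytes` field, the proto3 JSON
// encoding carries it base64-encoded. A JSON Image with inline content
// (truncated, hypothetical data) looks like:
//
//   {"content": "/9j/4AAQSkZJRg..."}
//
// whereas an Image referencing Cloud Storage uses `source` instead:
//
//   {"source": {"gcsImageUri": "gs://bucket_name/object_name"}}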

// A face annotation object contains the results of face detection.
message FaceAnnotation {
  // A face-specific landmark (for example, a face feature).
  message Landmark {
    // Face landmark (feature) type.
    // Left and right are defined from the vantage of the viewer of the image
    // without considering mirror projections typical of photos. So, `LEFT_EYE`,
    // typically, is the person's right eye.
    enum Type {
      // Unknown face landmark detected. Should not be filled.
      UNKNOWN_LANDMARK = 0;

      // Left eye.
      LEFT_EYE = 1;

      // Right eye.
      RIGHT_EYE = 2;

      // Left of left eyebrow.
      LEFT_OF_LEFT_EYEBROW = 3;

      // Right of left eyebrow.
      RIGHT_OF_LEFT_EYEBROW = 4;

      // Left of right eyebrow.
      LEFT_OF_RIGHT_EYEBROW = 5;

      // Right of right eyebrow.
      RIGHT_OF_RIGHT_EYEBROW = 6;

      // Midpoint between eyes.
      MIDPOINT_BETWEEN_EYES = 7;

      // Nose tip.
      NOSE_TIP = 8;

      // Upper lip.
      UPPER_LIP = 9;

      // Lower lip.
      LOWER_LIP = 10;

      // Mouth left.
      MOUTH_LEFT = 11;

      // Mouth right.
      MOUTH_RIGHT = 12;

      // Mouth center.
      MOUTH_CENTER = 13;

      // Nose, bottom right.
      NOSE_BOTTOM_RIGHT = 14;

      // Nose, bottom left.
      NOSE_BOTTOM_LEFT = 15;

      // Nose, bottom center.
      NOSE_BOTTOM_CENTER = 16;

      // Left eye, top boundary.
      LEFT_EYE_TOP_BOUNDARY = 17;

      // Left eye, right corner.
      LEFT_EYE_RIGHT_CORNER = 18;

      // Left eye, bottom boundary.
      LEFT_EYE_BOTTOM_BOUNDARY = 19;

      // Left eye, left corner.
      LEFT_EYE_LEFT_CORNER = 20;

      // Right eye, top boundary.
      RIGHT_EYE_TOP_BOUNDARY = 21;

      // Right eye, right corner.
      RIGHT_EYE_RIGHT_CORNER = 22;

      // Right eye, bottom boundary.
      RIGHT_EYE_BOTTOM_BOUNDARY = 23;

      // Right eye, left corner.
      RIGHT_EYE_LEFT_CORNER = 24;

      // Left eyebrow, upper midpoint.
      LEFT_EYEBROW_UPPER_MIDPOINT = 25;

      // Right eyebrow, upper midpoint.
      RIGHT_EYEBROW_UPPER_MIDPOINT = 26;

      // Left ear tragion.
      LEFT_EAR_TRAGION = 27;

      // Right ear tragion.
      RIGHT_EAR_TRAGION = 28;

      // Left eye pupil.
      LEFT_EYE_PUPIL = 29;

      // Right eye pupil.
      RIGHT_EYE_PUPIL = 30;

      // Forehead glabella.
      FOREHEAD_GLABELLA = 31;

      // Chin gnathion.
      CHIN_GNATHION = 32;

      // Chin left gonion.
      CHIN_LEFT_GONION = 33;

      // Chin right gonion.
      CHIN_RIGHT_GONION = 34;
    }

    // Face landmark type.
    Type type = 3;

    // Face landmark position.
    Position position = 4;
  }

  // The bounding polygon around the face. The coordinates of the bounding
  // box are in the original image's scale.
  // The bounding box is computed to "frame" the face in accordance with human
  // expectations. It is based on the landmarker results.
  // Note that one or more x and/or y coordinates may not be generated in the
  // `BoundingPoly` (the polygon will be unbounded) if only a partial face
  // appears in the image to be annotated.
  BoundingPoly bounding_poly = 1;

  // The `fd_bounding_poly` bounding polygon is tighter than the
  // `boundingPoly`, and encloses only the skin part of the face. Typically, it
  // is used to eliminate the face from any image analysis that detects the
  // "amount of skin" visible in an image. It is not based on the
  // landmarker results, only on the initial face detection, hence
  // the <code>fd</code> (face detection) prefix.
  BoundingPoly fd_bounding_poly = 2;

  // Detected face landmarks.
  repeated Landmark landmarks = 3;

  // Roll angle, which indicates the amount of clockwise/anti-clockwise rotation
  // of the face relative to the image vertical about the axis perpendicular to
  // the face. Range [-180,180].
  float roll_angle = 4;

  // Yaw angle, which indicates the leftward/rightward angle that the face is
  // pointing relative to the vertical plane perpendicular to the image. Range
  // [-180,180].
  float pan_angle = 5;

  // Pitch angle, which indicates the upwards/downwards angle that the face is
  // pointing relative to the image's horizontal plane. Range [-180,180].
  float tilt_angle = 6;

  // Detection confidence. Range [0, 1].
  float detection_confidence = 7;

  // Face landmarking confidence. Range [0, 1].
  float landmarking_confidence = 8;

  // Joy likelihood.
  Likelihood joy_likelihood = 9;

  // Sorrow likelihood.
  Likelihood sorrow_likelihood = 10;

  // Anger likelihood.
  Likelihood anger_likelihood = 11;

  // Surprise likelihood.
  Likelihood surprise_likelihood = 12;

  // Under-exposed likelihood.
  Likelihood under_exposed_likelihood = 13;

  // Blurred likelihood.
  Likelihood blurred_likelihood = 14;

  // Headwear likelihood.
  Likelihood headwear_likelihood = 15;
}
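
// Illustrative only: a heavily trimmed FaceAnnotation in proto3 JSON with
// hypothetical values. A `panAngle` near 0 means the face points roughly at
// the camera; likelihood enums are encoded as their string names:
//
//   {
//     "rollAngle": -2.1, "panAngle": 8.7, "tiltAngle": 0.5,
//     "detectionConfidence": 0.98,
//     "joyLikelihood": "VERY_LIKELY", "angerLikelihood": "VERY_UNLIKELY"
//   }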

// Detected entity location information.
message LocationInfo {
  // lat/long location coordinates.
  google.type.LatLng lat_lng = 1;
}

// A `Property` consists of a user-supplied name/value pair.
message Property {
  // Name of the property.
  string name = 1;

  // Value of the property.
  string value = 2;

  // Value of numeric properties.
  uint64 uint64_value = 3;
}

// Set of detected entity features.
message EntityAnnotation {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string mid = 1;

  // The language code for the locale in which the entity textual
  // `description` is expressed.
  string locale = 2;

  // Entity textual description, expressed in its `locale` language.
  string description = 3;

  // Overall score of the result. Range [0, 1].
  float score = 4;

  // The accuracy of the entity detection in an image.
  // For example, for an image in which the "Eiffel Tower" entity is detected,
  // this field represents the confidence that there is a tower in the query
  // image. Range [0, 1].
  float confidence = 5;

  // The relevancy of the ICA (Image Content Annotation) label to the
  // image. For example, the relevancy of "tower" is likely higher to an image
  // containing the detected "Eiffel Tower" than to an image containing a
  // detected distant towering building, even though the confidence that
  // there is a tower in each image may be the same. Range [0, 1].
  float topicality = 6;

  // Image region to which this entity belongs. Not produced
  // for `LABEL_DETECTION` features.
  BoundingPoly bounding_poly = 7;

  // The location information for the detected entity. Multiple
  // `LocationInfo` elements can be present because one location may
  // indicate the location of the scene in the image, and another location
  // may indicate the location of the place where the image was taken.
  // Location information is usually present for landmarks.
  repeated LocationInfo locations = 8;

  // Some entities may have optional user-supplied `Property` (name/value)
  // fields, such as a score or string that qualifies the entity.
  repeated Property properties = 9;
}
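
// Illustrative only: a hypothetical landmark EntityAnnotation in proto3
// JSON (the mid and the numeric values are made up), showing how `score`,
// `confidence`, and `topicality` coexist on one result:
//
//   {
//     "mid": "/m/0abc123",
//     "description": "Eiffel Tower",
//     "score": 0.92,
//     "confidence": 0.95,
//     "topicality": 0.92,
//     "locations": [{"latLng": {"latitude": 48.8584, "longitude": 2.2945}}]
//   }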

// Set of features pertaining to the image, computed by computer vision
// methods over safe-search verticals (for example, adult, spoof, medical,
// violence).
message SafeSearchAnnotation {
  // Represents the adult content likelihood for the image. Adult content may
  // contain elements such as nudity, pornographic images or cartoons, or
  // sexual activities.
  Likelihood adult = 1;

  // Spoof likelihood. The likelihood that a modification
  // was made to the image's canonical version to make it appear
  // funny or offensive.
  Likelihood spoof = 2;

  // Likelihood that this is a medical image.
  Likelihood medical = 3;

  // Likelihood that this image contains violent content.
  Likelihood violence = 4;

  // Likelihood that the request image contains racy content. Racy content may
  // include (but is not limited to) skimpy or sheer clothing, strategically
  // covered nudity, lewd or provocative poses, or close-ups of sensitive
  // body areas.
  Likelihood racy = 9;
}

// Rectangle determined by min and max `LatLng` pairs.
message LatLongRect {
  // Min lat/long pair.
  google.type.LatLng min_lat_lng = 1;

  // Max lat/long pair.
  google.type.LatLng max_lat_lng = 2;
}

// Color information consists of RGB channels, score, and the fraction of
// the image that the color occupies in the image.
message ColorInfo {
  // RGB components of the color.
  google.type.Color color = 1;

  // Image-specific score for this color. Value in range [0, 1].
  float score = 2;

  // The fraction of pixels the color occupies in the image.
  // Value in range [0, 1].
  float pixel_fraction = 3;
}

// Set of dominant colors and their corresponding scores.
message DominantColorsAnnotation {
  // RGB color values with their score and pixel fraction.
  repeated ColorInfo colors = 1;
}

// Stores image properties, such as dominant colors.
message ImageProperties {
  // If present, dominant colors completed successfully.
  DominantColorsAnnotation dominant_colors = 1;
}
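
// Illustrative only: an ImageProperties result in proto3 JSON with a single
// dominant color (all values hypothetical). A `pixelFraction` of 0.25 means
// the color covers roughly a quarter of the image's pixels:
//
//   {
//     "dominantColors": {
//       "colors": [{
//         "color": {"red": 12, "green": 87, "blue": 153},
//         "score": 0.61,
//         "pixelFraction": 0.25
//       }]
//     }
//   }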

// Single crop hint that is used to generate a new crop when serving an image.
message CropHint {
  // The bounding polygon for the crop region. The coordinates of the
  // bounding box are in the original image's scale.
  BoundingPoly bounding_poly = 1;

  // Confidence of this being a salient region. Range [0, 1].
  float confidence = 2;

  // Fraction of importance of this salient region with respect to the original
  // image.
  float importance_fraction = 3;
}

// Set of crop hints that are used to generate new crops when serving images.
message CropHintsAnnotation {
  // Crop hint results.
  repeated CropHint crop_hints = 1;
}

// Parameters for crop hints annotation request.
message CropHintsParams {
  // Aspect ratios in floats, representing the ratio of the width to the height
  // of the image. For example, if the desired aspect ratio is 4/3, the
  // corresponding float value should be 1.33333. If not specified, the
  // best possible crop is returned. The number of provided aspect ratios is
  // limited to a maximum of 16; any aspect ratios provided after the 16th are
  // ignored.
  repeated float aspect_ratios = 1;
}
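
// Illustrative only: CropHintsParams in proto3 JSON asking for 4:3 and 16:9
// crops (4/3 = 1.33333, 16/9 = 1.77778):
//
//   {"aspectRatios": [1.33333, 1.77778]}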

// Parameters for web detection request.
message WebDetectionParams {
  // Whether to include results derived from the geo information in the image.
  bool include_geo_results = 2;
}

// Parameters for text detections. This is used to control TEXT_DETECTION and
// DOCUMENT_TEXT_DETECTION features.
message TextDetectionParams {

  // By default, the Cloud Vision API only includes a confidence score for
  // DOCUMENT_TEXT_DETECTION results. Set this flag to true to include a
  // confidence score for TEXT_DETECTION as well.
  bool enable_text_detection_confidence_score = 9;

  // A list of advanced OCR options to fine-tune OCR behavior.
  repeated string advanced_ocr_options = 11;
}

// Image context and/or feature-specific parameters.
message ImageContext {
  // lat/long rectangle that specifies the location of the image.
  LatLongRect lat_long_rect = 1;

  // List of languages to use for TEXT_DETECTION. In most cases, an empty value
  // yields the best results since it enables automatic language detection. For
  // languages based on the Latin alphabet, setting `language_hints` is not
  // needed. In rare cases, when the language of the text in the image is known,
  // setting a hint will help get better results (although it will be a
  // significant hindrance if the hint is wrong). Text detection returns an
  // error if one or more of the specified languages is not one of the
  // [supported languages](https://cloud.google.com/vision/docs/languages).
  repeated string language_hints = 2;

  // Parameters for crop hints annotation request.
  CropHintsParams crop_hints_params = 4;

  // Parameters for web detection.
  WebDetectionParams web_detection_params = 6;

  // Parameters for text detection and document text detection.
  TextDetectionParams text_detection_params = 12;
}
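
// Illustrative only: an ImageContext in proto3 JSON hinting that the text in
// the image is Japanese ("ja" is a BCP-47 language code):
//
//   {"languageHints": ["ja"]}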

// Request for performing Google Cloud Vision API tasks over a user-provided
// image, with user-requested features.
message AnnotateImageRequest {
  // The image to be processed.
  Image image = 1;

  // Requested features.
  repeated Feature features = 2;

  // Additional context that may accompany the image.
  ImageContext image_context = 3;
}
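
// Illustrative only: a single AnnotateImageRequest in proto3 JSON combining
// an image, two features, and a context (all values hypothetical):
//
//   {
//     "image": {"source": {"imageUri": "gs://bucket_name/object_name"}},
//     "features": [
//       {"type": "TEXT_DETECTION"},
//       {"type": "CROP_HINTS"}
//     ],
//     "imageContext": {
//       "languageHints": ["en"],
//       "cropHintsParams": {"aspectRatios": [1.33333]}
//     }
//   }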

// Response to an image annotation request.
message AnnotateImageResponse {
  // If present, face detection has completed successfully.
  repeated FaceAnnotation face_annotations = 1;

  // If present, landmark detection has completed successfully.
  repeated EntityAnnotation landmark_annotations = 2;

  // If present, logo detection has completed successfully.
  repeated EntityAnnotation logo_annotations = 3;

  // If present, label detection has completed successfully.
  repeated EntityAnnotation label_annotations = 4;

  // If present, text (OCR) detection has completed successfully.
  repeated EntityAnnotation text_annotations = 5;

  // If present, text (OCR) detection or document (OCR) text detection has
  // completed successfully.
  // This annotation provides the structural hierarchy for the OCR detected
  // text.
  TextAnnotation full_text_annotation = 12;

  // If present, safe-search annotation has completed successfully.
  SafeSearchAnnotation safe_search_annotation = 6;

  // If present, image properties were extracted successfully.
  ImageProperties image_properties_annotation = 8;

  // If present, crop hints have completed successfully.
  CropHintsAnnotation crop_hints_annotation = 11;

  // If present, web detection has completed successfully.
  WebDetection web_detection = 13;

  // If set, represents the error message for the operation.
  // Note that filled-in image annotations are guaranteed to be
  // correct, even when `error` is set.
  google.rpc.Status error = 9;
}
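
// Illustrative only: a trimmed AnnotateImageResponse in proto3 JSON for a
// request whose label detection succeeded (hypothetical mid and score). A
// failed operation would instead populate `error` with a google.rpc.Status
// (`code`, `message`) alongside any annotations that did succeed:
//
//   {
//     "labelAnnotations": [
//       {"mid": "/m/0abc123", "description": "Tower", "score": 0.87}
//     ]
//   }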

// Multiple image annotation requests are batched into a single service call.
message BatchAnnotateImagesRequest {
  // Required. Individual image annotation requests for this batch.
  repeated AnnotateImageRequest requests = 1
      [(google.api.field_behavior) = REQUIRED];
}

// Response to a batch image annotation request.
message BatchAnnotateImagesResponse {
  // Individual responses to image annotation requests within the batch.
  repeated AnnotateImageResponse responses = 1;
}

// A bucketized representation of likelihood, which is intended to give clients
// highly stable results across model upgrades.
enum Likelihood {
  // Unknown likelihood.
  UNKNOWN = 0;

  // It is very unlikely that the image belongs to the specified vertical.
  VERY_UNLIKELY = 1;

  // It is unlikely that the image belongs to the specified vertical.
  UNLIKELY = 2;

  // It is possible that the image belongs to the specified vertical.
  POSSIBLE = 3;

  // It is likely that the image belongs to the specified vertical.
  LIKELY = 4;

  // It is very likely that the image belongs to the specified vertical.
  VERY_LIKELY = 5;
}