xref: /aosp_15_r20/external/googleapis/google/cloud/documentai/v1beta1/document_understanding.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
1// Copyright 2019 Google LLC.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15
// Schema declared with proto3 syntax; everything below is defined in the
// versioned package `google.cloud.documentai.v1beta1`.
16syntax = "proto3";
17
18package google.cloud.documentai.v1beta1;
19
20import "google/api/annotations.proto";
21import "google/api/client.proto";
22import "google/api/field_behavior.proto";
23import "google/cloud/documentai/v1beta1/geometry.proto";
24import "google/longrunning/operations.proto";
25import "google/protobuf/timestamp.proto";
26
// Per-language code generation settings: the namespace or package that the
// generated types are placed in, plus the Java output layout (one file per
// top-level type, with `DocumentAiProto` as the outer class name).
27option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta1";
28option go_package = "cloud.google.com/go/documentai/apiv1beta1/documentaipb;documentaipb";
29option java_multiple_files = true;
30option java_outer_classname = "DocumentAiProto";
31option java_package = "com.google.cloud.documentai.v1beta1";
32option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta1";
33option ruby_package = "Google::Cloud::DocumentAI::V1beta1";
34
35// Service to parse structured information from unstructured or semi-structured
36// documents using state-of-the-art Google AI such as natural language,
37// computer vision, and translation.
38service DocumentUnderstandingService {
39  option (google.api.default_host) = "documentai.googleapis.com";
40  option (google.api.oauth_scopes) =
41      "https://www.googleapis.com/auth/cloud-platform";
42
43  // Long-running operation (LRO) endpoint to batch process many documents.
  //
  // The HTTP bindings accept a `parent` of either `projects/*/locations/*`
  // or `projects/*`. The returned operation declares
  // `BatchProcessDocumentsResponse` as its response type and
  // `OperationMetadata` as its metadata type.
44  rpc BatchProcessDocuments(BatchProcessDocumentsRequest)
45      returns (google.longrunning.Operation) {
46    option (google.api.http) = {
47      post: "/v1beta1/{parent=projects/*/locations/*}/documents:batchProcess"
48      body: "*"
49      additional_bindings {
50        post: "/v1beta1/{parent=projects/*}/documents:batchProcess"
51        body: "*"
52      }
53    };
54    option (google.api.method_signature) = "requests";
55    option (google.longrunning.operation_info) = {
56      response_type: "BatchProcessDocumentsResponse"
57      metadata_type: "OperationMetadata"
58    };
59  }
60}
61
62// Request to process a batch of documents as one asynchronous operation.
63message BatchProcessDocumentsRequest {
64  // Required. One processing request for each document in the batch.
65  repeated ProcessDocumentRequest requests = 1
66      [(google.api.field_behavior) = REQUIRED];
67
68  // Target project and location in which to make the call.
69  //
70  // Format: `projects/{project-id}/locations/{location-id}`.
71  //
72  // If no location is specified, a region will be chosen automatically.
73  string parent = 2;
74}
75
76// Request to process one document.
77message ProcessDocumentRequest {
78  // Required. Information about the input file.
79  InputConfig input_config = 1 [(google.api.field_behavior) = REQUIRED];
80
81  // Required. The desired output location.
82  OutputConfig output_config = 2 [(google.api.field_behavior) = REQUIRED];
83
84  // Specifies a known document type for deeper structure detection. Valid
85  // values are currently "general" and "invoice". If not provided, "general"
86  // is used as default. If any other value is given, the request is rejected.
87  string document_type = 3;
88
89  // Controls table extraction behavior. If not specified, the system will
90  // decide reasonable defaults.
91  TableExtractionParams table_extraction_params = 4;
92
93  // Controls form extraction behavior. If not specified, the system will
94  // decide reasonable defaults.
95  FormExtractionParams form_extraction_params = 5;
96
97  // Controls entity extraction behavior. If not specified, the system will
98  // decide reasonable defaults.
99  EntityExtractionParams entity_extraction_params = 6;
100
101  // Controls OCR behavior. If not specified, the system will decide reasonable
102  // defaults.
103  OcrParams ocr_params = 7;
104}
105
106// Response to a batch document processing request. This is returned in
107// the LRO Operation after the operation is complete.
108message BatchProcessDocumentsResponse {
109  // Responses for each individual document.
110  repeated ProcessDocumentResponse responses = 1;
111}
112
113// Response to a single document processing request.
114message ProcessDocumentResponse {
115  // Information about the input file. This is the same as the corresponding
116  // input config in the request.
117  InputConfig input_config = 1;
118
119  // The output location of the parsed responses. The responses are written to
120  // this location as JSON-serialized `Document` objects.
121  OutputConfig output_config = 2;
122}
123
124// Parameters to control Optical Character Recognition (OCR) behavior.
125message OcrParams {
126  // List of languages to use for OCR. In most cases, an empty value
127  // yields the best results since it enables automatic language detection. For
128  // languages based on the Latin alphabet, setting `language_hints` is not
129  // needed. In rare cases, when the language of the text in the image is known,
130  // setting a hint will help get better results (although it will be a
131  // significant hindrance if the hint is wrong). Document processing returns an
132  // error if one or more of the specified languages is not one of the
133  // supported languages.
134  repeated string language_hints = 1;
135}
136
137// Parameters to control table extraction behavior.
138message TableExtractionParams {
139  // Whether to enable table extraction.
140  bool enabled = 1;
141
142  // Optional. Table bounding box hints that can be provided for complex cases
143  // in which our algorithm cannot locate the table(s).
144  repeated TableBoundHint table_bound_hints = 2
145      [(google.api.field_behavior) = OPTIONAL];
146
147  // Optional. Table header hints. The extraction will bias towards producing
148  // these terms as table headers, which may improve accuracy.
149  repeated string header_hints = 3 [(google.api.field_behavior) = OPTIONAL];
150
151  // Model version of the table extraction system. Default is "builtin/stable".
152  // Specify "builtin/latest" for the latest model.
153  string model_version = 4;
154}
155
156// A hint for a table bounding box on the page for table parsing.
157message TableBoundHint {
158  // Optional. Page number, for multi-paged inputs, that this hint applies to.
159  // If not provided, this hint will apply to all pages by default. This value
160  // is 1-based.
161  int32 page_number = 1 [(google.api.field_behavior) = OPTIONAL];
162
163  // Bounding box hint for a table on this page. The coordinates must be
164  // normalized to [0,1] and the bounding box must be an axis-aligned rectangle.
165  BoundingPoly bounding_box = 2;
166}
167
168// Parameters to control form extraction behavior.
169message FormExtractionParams {
170  // Whether to enable form extraction.
171  bool enabled = 1;
172
173  // Users can provide pairs of (key text, value type) to improve the parsing
174  // result.
175  //
176  // For example, if a document has a field called "Date" that holds a date
177  // value and a field called "Amount" that may hold either a currency value
178  // (e.g., "$500.00") or a simple number value (e.g., "20"), you could use the
179  // following hints: [ {"key": "Date", "value_types": [ "DATE" ]}, {"key":
180  // "Amount", "value_types": [ "PRICE", "NUMBER" ]} ]
181  //
182  // If the value type is unknown, but you want to provide hints for the keys,
183  // you can leave the value_types field blank, e.g. {"key": "Date",
184  // "value_types": []}
185  repeated KeyValuePairHint key_value_pair_hints = 2;
186
187  // Model version of the form extraction system. Default is
188  // "builtin/stable". Specify "builtin/latest" for the latest model.
189  string model_version = 3;
190}
191
192// User-provided hint for a key-value pair.
193message KeyValuePairHint {
194  // The key text for the hint.
195  string key = 1;
196
197  // Types of the value. Each is case-insensitive, and could be one of:
198  // ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER,
199  // ID, NUMBER, EMAIL, PRICE, TERMS, DATE, NAME. Types not in this list will
200  // be ignored.
201  repeated string value_types = 2;
202}
203
204// Parameters to control entity extraction behavior.
205message EntityExtractionParams {
206  // Whether to enable entity extraction.
207  bool enabled = 1;
208
209  // Model version of the entity extraction system. Default is
210  // "builtin/stable". Specify "builtin/latest" for the latest model.
211  string model_version = 2;
212}
213
214// The desired input location and metadata.
215message InputConfig {
216  // Required. Where the input is read from.
217  oneof source {
218    // The Google Cloud Storage location to read the input from. This must be a
219    // single file.
220    GcsSource gcs_source = 1;
221  }
222
223  // Required. Mimetype of the input. Currently supported mimetypes are
224  // application/pdf, image/tiff, and image/gif.
225  string mime_type = 2 [(google.api.field_behavior) = REQUIRED];
226}
227
228// The desired output location and metadata.
229message OutputConfig {
230  // Required. Where the output is written to.
231  oneof destination {
232    // The Google Cloud Storage location to write the output to.
233    GcsDestination gcs_destination = 1;
234  }
235
236  // The max number of pages to include in each output Document shard JSON on
237  // Google Cloud Storage.
238  //
239  // The valid range is [1, 100]. If not specified, the default value is 20.
240  //
241  // For example, for one pdf file with 100 pages, 100 parsed pages will be
242  // produced. If `pages_per_shard` = 20, then 5 Document shard JSON files each
243  // containing 20 parsed pages will be written under the prefix
244  // [OutputConfig.gcs_destination.uri][] and suffix pages-x-to-y.json where
245  // x and y are 1-indexed page numbers.
246  //
247  // Example GCS outputs with 157 pages and pages_per_shard = 50:
248  //
249  // <prefix>pages-001-to-050.json
250  // <prefix>pages-051-to-100.json
251  // <prefix>pages-101-to-150.json
252  // <prefix>pages-151-to-157.json
253  int32 pages_per_shard = 2;
254}
255
256// The Google Cloud Storage location where the input file will be read from.
257message GcsSource {
  // Required. URI of the Google Cloud Storage location to read the input
  // file from.
258  string uri = 1 [(google.api.field_behavior) = REQUIRED];
259}
260
261// The Google Cloud Storage location where the output file will be written to.
262message GcsDestination {
  // Required. URI of the Google Cloud Storage location to write the output
  // file to.
263  string uri = 1 [(google.api.field_behavior) = REQUIRED];
264}
265
266// Contains metadata for the BatchProcessDocuments operation.
267message OperationMetadata {
  // Possible states of the batch processing operation.
268  enum State {
269    // The default value. This value is used if the state is omitted.
270    STATE_UNSPECIFIED = 0;
271
272    // Request is received.
273    ACCEPTED = 1;
274
275    // Request operation is waiting for scheduling.
276    WAITING = 2;
277
278    // Request is being processed.
279    RUNNING = 3;
280
281    // The batch processing completed successfully.
282    SUCCEEDED = 4;
283
284    // The batch processing was cancelled.
285    CANCELLED = 5;
286
287    // The batch processing has failed.
288    FAILED = 6;
289  }
290
291  // The state of the current batch processing.
292  State state = 1;
293
294  // A message providing more details about the current state of processing.
295  string state_message = 2;
296
297  // The creation time of the operation.
298  google.protobuf.Timestamp create_time = 3;
299
300  // The last update time of the operation.
301  google.protobuf.Timestamp update_time = 4;
302}
303