// xref: /aosp_15_r20/external/googleapis/google/cloud/documentai/v1beta3/document_service.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
// Schema language version used by this file.
syntax = "proto3";

// Versioned package; all messages, enums and the service below are defined
// in the `google.cloud.documentai.v1beta3` namespace.
package google.cloud.documentai.v1beta3;
18
19import "google/api/annotations.proto";
20import "google/api/client.proto";
21import "google/api/field_behavior.proto";
22import "google/api/resource.proto";
23import "google/cloud/documentai/v1beta3/dataset.proto";
24import "google/cloud/documentai/v1beta3/document.proto";
25import "google/cloud/documentai/v1beta3/document_io.proto";
26import "google/cloud/documentai/v1beta3/operation_metadata.proto";
27import "google/longrunning/operations.proto";
28import "google/protobuf/field_mask.proto";
29import "google/rpc/status.proto";
30
// Language-specific code generation options. They control the
// namespace/package (and, for Java, the outer class and file layout) of the
// code generated from this file.
option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentAiDocumentService";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";
38
// Service to call Cloud DocumentAI to manage document collection (dataset).
//
// Every HTTP binding below is rooted at
// `/v1beta3/projects/*/locations/*/processors/*/dataset`.
service DocumentService {
  option (google.api.default_host) = "documentai.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Updates metadata associated with a dataset.
  //
  // Returns a long-running operation whose response type is `Dataset` and
  // whose metadata type is `UpdateDatasetOperationMetadata`.
  rpc UpdateDataset(UpdateDatasetRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      patch: "/v1beta3/{dataset.name=projects/*/locations/*/processors/*/dataset}"
      body: "dataset"
    };
    option (google.api.method_signature) = "dataset,update_mask";
    option (google.longrunning.operation_info) = {
      response_type: "Dataset"
      metadata_type: "UpdateDatasetOperationMetadata"
    };
  }

  // Import documents into a dataset.
  //
  // Returns a long-running operation whose response type is
  // `ImportDocumentsResponse` and whose metadata type is
  // `ImportDocumentsMetadata`.
  rpc ImportDocuments(ImportDocumentsRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:importDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
    option (google.longrunning.operation_info) = {
      response_type: "ImportDocumentsResponse"
      metadata_type: "ImportDocumentsMetadata"
    };
  }

  // Returns relevant fields present in the requested document.
  rpc GetDocument(GetDocumentRequest) returns (GetDocumentResponse) {
    option (google.api.http) = {
      get: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:getDocument"
    };
    option (google.api.method_signature) = "dataset";
  }

  // Returns a list of documents present in the dataset.
  //
  // Note that this method is bound to HTTP POST with the whole request as the
  // body, not to HTTP GET.
  rpc ListDocuments(ListDocumentsRequest) returns (ListDocumentsResponse) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:listDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
  }

  // Deletes a set of documents.
  //
  // Returns a long-running operation whose response type is
  // `BatchDeleteDocumentsResponse` and whose metadata type is
  // `BatchDeleteDocumentsMetadata`.
  rpc BatchDeleteDocuments(BatchDeleteDocumentsRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:batchDeleteDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
    option (google.longrunning.operation_info) = {
      response_type: "BatchDeleteDocumentsResponse"
      metadata_type: "BatchDeleteDocumentsMetadata"
    };
  }

  // Gets the `DatasetSchema` of a `Dataset`.
  rpc GetDatasetSchema(GetDatasetSchemaRequest) returns (DatasetSchema) {
    option (google.api.http) = {
      get: "/v1beta3/{name=projects/*/locations/*/processors/*/dataset/datasetSchema}"
    };
    option (google.api.method_signature) = "name";
  }

  // Updates a `DatasetSchema`.
  //
  // The schema to update is identified by `dataset_schema.name`, which is
  // bound to the URL path.
  rpc UpdateDatasetSchema(UpdateDatasetSchemaRequest) returns (DatasetSchema) {
    option (google.api.http) = {
      patch: "/v1beta3/{dataset_schema.name=projects/*/locations/*/processors/*/dataset/datasetSchema}"
      body: "dataset_schema"
    };
    option (google.api.method_signature) = "dataset_schema,update_mask";
  }
}
121
// Documents belonging to a dataset will be split into different groups
// referred to as splits: train, test, or unassigned.
//
// NOTE(review): value names use the `DATASET_SPLIT_` prefix rather than the
// full `DATASET_SPLIT_TYPE_` type-name prefix; renaming them would break
// JSON/text-format and generated code, so they are kept as published.
enum DatasetSplitType {
  // Default value if the enum is not set.
  DATASET_SPLIT_TYPE_UNSPECIFIED = 0;

  // Identifies the train documents.
  DATASET_SPLIT_TRAIN = 1;

  // Identifies the test documents.
  DATASET_SPLIT_TEST = 2;

  // Identifies the unassigned documents.
  DATASET_SPLIT_UNASSIGNED = 3;
}
137
// Describes the labeling status of a document.
//
// Reported per document in `DocumentMetadata.labeling_state`.
enum DocumentLabelingState {
  // Default value if the enum is not set.
  DOCUMENT_LABELING_STATE_UNSPECIFIED = 0;

  // Document has been labeled.
  DOCUMENT_LABELED = 1;

  // Document has not been labeled.
  DOCUMENT_UNLABELED = 2;

  // Document has been auto-labeled.
  DOCUMENT_AUTO_LABELED = 3;
}
152
// Request for `UpdateDataset`.
message UpdateDatasetRequest {
  // Required. The `name` field of the `Dataset` is used to identify the
  // resource to be updated.
  Dataset dataset = 1 [(google.api.field_behavior) = REQUIRED];

  // The update mask applies to the resource.
  google.protobuf.FieldMask update_mask = 2;
}
161
// Metadata of the long-running operation returned by `UpdateDataset`.
message UpdateDatasetOperationMetadata {
  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;
}
166
// Request for `ImportDocuments`.
message ImportDocumentsRequest {
  // Config for importing documents.
  // Each batch can have its own dataset split type.
  message BatchDocumentsImportConfig {
    // The config for auto-split.
    message AutoSplitConfig {
      // Ratio of training dataset split.
      float training_split_ratio = 1;
    }

    // How the documents of this batch are assigned to a dataset split.
    // As a oneof, at most one of the fields below can be set.
    oneof split_type_config {
      // Target dataset split where the documents must be stored.
      DatasetSplitType dataset_split = 2;

      // If set, documents will be automatically split into training and test
      // split category with the specified ratio.
      AutoSplitConfig auto_split_config = 3;
    }

    // The common config to specify a set of documents used as input.
    BatchDocumentsInputConfig batch_input_config = 1;
  }

  // Required. The dataset resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Required. The Cloud Storage URI containing raw documents that must be
  // imported.
  repeated BatchDocumentsImportConfig batch_documents_import_configs = 4
      [(google.api.field_behavior) = REQUIRED];
}
205
// Response of the import document operation.
//
// Currently carries no fields; per-document results are reported through
// `ImportDocumentsMetadata`.
message ImportDocumentsResponse {}
208
// Metadata of the import document operation.
message ImportDocumentsMetadata {
  // The status of each individual document in the import process.
  message IndividualImportStatus {
    // The source Cloud Storage URI of the document.
    string input_gcs_source = 1;

    // The status of the importing of the document.
    google.rpc.Status status = 2;

    // The document id of the imported document if it was successful,
    // otherwise empty.
    DocumentId output_document_id = 4;
  }

  // The validation status of each import config. Status is set to an error if
  // there are no documents to import in the `import_config`, or `OK` if the
  // operation will try to proceed with at least one document.
  message ImportConfigValidationResult {
    // The source Cloud Storage URI specified in the import config.
    string input_gcs_source = 1;

    // The validation status of import config.
    google.rpc.Status status = 2;
  }

  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;

  // The list of response details of each document.
  repeated IndividualImportStatus individual_import_statuses = 2;

  // Validation statuses of the batch documents import config.
  repeated ImportConfigValidationResult import_config_validation_results = 4;

  // Total number of the documents that are qualified for importing.
  int32 total_document_count = 3;
}
247
// Request for `GetDocument`.
message GetDocumentRequest {
  // Required. The resource name of the dataset that the document belongs to.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Required. Document identifier.
  DocumentId document_id = 2 [(google.api.field_behavior) = REQUIRED];

  // If set, only fields listed here will be returned. Otherwise, all fields
  // will be returned by default.
  google.protobuf.FieldMask read_mask = 3;

  // List of pages for which the fields specified in the `read_mask` must
  // be served.
  DocumentPageRange page_range = 4;
}
270
// Response for `GetDocument`.
message GetDocumentResponse {
  // The requested document.
  Document document = 1;
}
274
// Request for `ListDocuments`.
message ListDocumentsRequest {
  // Required. The resource name of the dataset to be listed.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // The maximum number of documents to return. The service may return
  // fewer than this value.
  // If unspecified, at most 20 documents will be returned.
  // The maximum value is 100; values above 100 will be coerced to 100.
  int32 page_size = 2;

  // A page token, received from a previous `ListDocuments` call.
  // Provide this to retrieve the subsequent page.
  //
  // When paginating, all other parameters provided to `ListDocuments`
  // must match the call that provided the page token.
  string page_token = 3;

  // Optional. Query to filter the documents based on
  // https://google.aip.dev/160.
  // ## Currently supported query strings are:
  //
  // - `SplitType=DATASET_SPLIT_TEST|DATASET_SPLIT_TRAIN|DATASET_SPLIT_UNASSIGNED`
  // - `LabelingState=DOCUMENT_LABELED|DOCUMENT_UNLABELED|DOCUMENT_AUTO_LABELED`
  // - `DisplayName=\"file_name.pdf\"`
  // - `EntityType=abc/def`
  // - `TagName=\"auto-labeling-running\"|\"sampled\"`
  //
  // Note:
  // - Only `AND`, `=` and `!=` are supported.
  //     e.g. `DisplayName=file_name AND EntityType!=abc` IS supported.
  // - Wildcard `*` is supported only in `DisplayName` filter
  // - No duplicate filter keys are allowed,
  //     e.g. `EntityType=a AND EntityType=b` is NOT supported.
  // - String match is case sensitive (for filter `DisplayName` & `EntityType`).
  string filter = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Controls if the request requires a total size of matched
  // documents. See
  // [ListDocumentsResponse.total_size][google.cloud.documentai.v1beta3.ListDocumentsResponse.total_size].
  //
  // Enabling this flag may adversely impact performance.
  //
  // Defaults to false.
  bool return_total_size = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Number of results to skip beginning from the `page_token` if
  // provided. https://google.aip.dev/158#skipping-results. It must be a
  // non-negative integer. Negative values will be rejected. Note that this is
  // not the number of pages to skip. If this value causes the cursor to move
  // past the end of results,
  // [ListDocumentsResponse.document_metadata][google.cloud.documentai.v1beta3.ListDocumentsResponse.document_metadata]
  // and
  // [ListDocumentsResponse.next_page_token][google.cloud.documentai.v1beta3.ListDocumentsResponse.next_page_token]
  // will be empty.
  int32 skip = 8 [(google.api.field_behavior) = OPTIONAL];
}
338
// Response for `ListDocuments`.
message ListDocumentsResponse {
  // Document metadata corresponding to the listed documents.
  repeated DocumentMetadata document_metadata = 1;

  // A token, which can be sent as
  // [ListDocumentsRequest.page_token][google.cloud.documentai.v1beta3.ListDocumentsRequest.page_token]
  // to retrieve the next page. If this field is omitted, there are no
  // subsequent pages.
  string next_page_token = 2;

  // Total count of documents queried. See
  // [ListDocumentsRequest.return_total_size][google.cloud.documentai.v1beta3.ListDocumentsRequest.return_total_size].
  int32 total_size = 3;
}
352
// Request for `BatchDeleteDocuments`.
message BatchDeleteDocumentsRequest {
  // Required. The dataset resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  //
  // Annotated as a reference to the `Dataset` resource, consistent with the
  // `dataset` field of the other request messages in this file.
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Required. Dataset documents input. If given `filter`, all documents
  // satisfying the filter will be deleted. If given documentIds, a maximum of
  // 50 documents can be deleted in a batch. The request will be rejected if
  // more than 50 document_ids are provided.
  BatchDatasetDocuments dataset_documents = 3
      [(google.api.field_behavior) = REQUIRED];
}
366
// Response of the delete documents operation.
//
// Currently carries no fields; per-document results are reported through
// `BatchDeleteDocumentsMetadata`.
message BatchDeleteDocumentsResponse {}
369
// Metadata of the batch delete documents operation.
message BatchDeleteDocumentsMetadata {
  // The status of each individual document in the batch delete process.
  message IndividualBatchDeleteStatus {
    // The document id of the document.
    DocumentId document_id = 1;

    // The status of deleting the document in storage.
    google.rpc.Status status = 2;
  }

  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;

  // The list of response details of each document.
  repeated IndividualBatchDeleteStatus individual_batch_delete_statuses = 2;

  // Total number of documents being deleted from the dataset.
  int32 total_document_count = 3;

  // Total number of documents that failed to be deleted in storage.
  int32 error_document_count = 4;
}
392
// Request for `GetDatasetSchema`.
message GetDatasetSchemaRequest {
  // Required. The dataset schema resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/DatasetSchema"
    }
  ];

  // If set, only returns the visible fields of the schema.
  // Being a proto3 `bool`, it is false when not set.
  bool visible_fields_only = 2;
}
408
// Request for `UpdateDatasetSchema`.
message UpdateDatasetSchemaRequest {
  // Required. The `name` field of the `DatasetSchema` is used to identify the
  // resource to be updated.
  DatasetSchema dataset_schema = 1 [(google.api.field_behavior) = REQUIRED];

  // The update mask applies to the resource.
  google.protobuf.FieldMask update_mask = 2;
}
418
// Range of pages present in a document.
//
// Used by `GetDocumentRequest.page_range` to select the pages to serve.
message DocumentPageRange {
  // First page number (one-based index) to be returned.
  int32 start = 1;

  // Last page number (one-based index) to be returned.
  int32 end = 2;
}
427
// Metadata about a document.
//
// Returned for each listed document in
// `ListDocumentsResponse.document_metadata`.
message DocumentMetadata {
  // Document identifier.
  DocumentId document_id = 1;

  // Number of pages in the document.
  int32 page_count = 2;

  // Type of the dataset split to which the document belongs.
  DatasetSplitType dataset_type = 3;

  // Labeling state of the document.
  DocumentLabelingState labeling_state = 5;

  // The display name of the document.
  string display_name = 6;
}
445