// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/documentai/v1beta3/dataset.proto";
import "google/cloud/documentai/v1beta3/document.proto";
import "google/cloud/documentai/v1beta3/document_io.proto";
import "google/cloud/documentai/v1beta3/operation_metadata.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/field_mask.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentAiDocumentService";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Cloud DocumentAI service for managing a document collection (dataset).
service DocumentService {
  option (google.api.default_host) = "documentai.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Updates the metadata associated with a dataset.
  //
  // Runs as a long-running operation: the response type is `Dataset` and the
  // operation metadata type is `UpdateDatasetOperationMetadata`.
  rpc UpdateDataset(UpdateDatasetRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      patch: "/v1beta3/{dataset.name=projects/*/locations/*/processors/*/dataset}"
      body: "dataset"
    };
    option (google.api.method_signature) = "dataset,update_mask";
    option (google.longrunning.operation_info) = {
      response_type: "Dataset"
      metadata_type: "UpdateDatasetOperationMetadata"
    };
  }

  // Imports documents into a dataset.
  //
  // Runs as a long-running operation: the response type is
  // `ImportDocumentsResponse` and the operation metadata type is
  // `ImportDocumentsMetadata`.
  rpc ImportDocuments(ImportDocumentsRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:importDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
    option (google.longrunning.operation_info) = {
      response_type: "ImportDocumentsResponse"
      metadata_type: "ImportDocumentsMetadata"
    };
  }

  // Returns the relevant fields present in the requested document.
  rpc GetDocument(GetDocumentRequest) returns (GetDocumentResponse) {
    option (google.api.http) = {
      get: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:getDocument"
    };
    option (google.api.method_signature) = "dataset";
  }

  // Lists the documents present in the dataset.
  rpc ListDocuments(ListDocumentsRequest) returns (ListDocumentsResponse) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:listDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
  }

  // Deletes a set of documents from a dataset.
  //
  // Runs as a long-running operation: the response type is
  // `BatchDeleteDocumentsResponse` and the operation metadata type is
  // `BatchDeleteDocumentsMetadata`.
  rpc BatchDeleteDocuments(BatchDeleteDocumentsRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:batchDeleteDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
    option (google.longrunning.operation_info) = {
      response_type: "BatchDeleteDocumentsResponse"
      metadata_type: "BatchDeleteDocumentsMetadata"
    };
  }

  // Retrieves the `DatasetSchema` of a `Dataset`.
  rpc GetDatasetSchema(GetDatasetSchemaRequest) returns (DatasetSchema) {
    option (google.api.http) = {
      get: "/v1beta3/{name=projects/*/locations/*/processors/*/dataset/datasetSchema}"
    };
    option (google.api.method_signature) = "name";
  }

  // Applies an update to a `DatasetSchema`.
  rpc UpdateDatasetSchema(UpdateDatasetSchemaRequest) returns (DatasetSchema) {
    option (google.api.http) = {
      patch: "/v1beta3/{dataset_schema.name=projects/*/locations/*/processors/*/dataset/datasetSchema}"
      body: "dataset_schema"
    };
    option (google.api.method_signature) = "dataset_schema,update_mask";
  }
}

// The groups ("splits") that the documents of a dataset are divided into,
// such as train and test.
enum DatasetSplitType {
  // Default value, used when the enum is not set.
  DATASET_SPLIT_TYPE_UNSPECIFIED = 0;

  // Marks the documents of the train split.
  DATASET_SPLIT_TRAIN = 1;

  // Marks the documents of the test split.
  DATASET_SPLIT_TEST = 2;

  // Marks the documents that are not assigned to a split.
  DATASET_SPLIT_UNASSIGNED = 3;
}

// The labeling status of a document.
enum DocumentLabelingState {
  // Default value, used when the enum is not set.
  DOCUMENT_LABELING_STATE_UNSPECIFIED = 0;

  // The document has been labeled.
  DOCUMENT_LABELED = 1;

  // The document has not been labeled.
  DOCUMENT_UNLABELED = 2;

  // The document has been auto-labeled.
  DOCUMENT_AUTO_LABELED = 3;
}

// Request for `UpdateDataset`.
message UpdateDatasetRequest {
  // Required. The dataset to update. Its `name` field identifies the resource
  // to be updated.
  Dataset dataset = 1 [(google.api.field_behavior) = REQUIRED];

  // The update mask that applies to the resource.
  google.protobuf.FieldMask update_mask = 2;
}

// Metadata of the `UpdateDataset` long-running operation.
message UpdateDatasetOperationMetadata {
  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;
}

// Request for `ImportDocuments`.
message ImportDocumentsRequest {
  // Configuration for importing one batch of documents.
  // Every batch may specify its own dataset split type.
  message BatchDocumentsImportConfig {
    // Configuration for automatic splitting.
    message AutoSplitConfig {
      // Ratio of the training dataset split.
      float training_split_ratio = 1;
    }

    // How the imported documents are assigned to a dataset split.
    oneof split_type_config {
      // The target dataset split in which the documents must be stored.
      DatasetSplitType dataset_split = 2;

      // When set, the documents are automatically divided between the
      // training and test split categories using the specified ratio.
      AutoSplitConfig auto_split_config = 3;
    }

    // The common config that specifies the set of documents used as input.
    BatchDocumentsInputConfig batch_input_config = 1;
  }

  // Required. The dataset resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Required. The Cloud Storage URI containing the raw documents that must be
  // imported.
  repeated BatchDocumentsImportConfig batch_documents_import_configs = 4
      [(google.api.field_behavior) = REQUIRED];
}

// Response of the import document operation.
message ImportDocumentsResponse {}

// Metadata of the import document operation.
message ImportDocumentsMetadata {
  // The status of each individual document in the import process.
  message IndividualImportStatus {
    // The source Cloud Storage URI of the document.
    string input_gcs_source = 1;

    // The status of the importing of the document.
    google.rpc.Status status = 2;

    // The document id of imported document if it was successful, otherwise
    // empty.
    DocumentId output_document_id = 4;
  }

  // The validation status of each import config. Status is set to an error if
  // there are no documents to import in the `import_config`, or `OK` if the
  // operation will try to proceed with at least one document.
  message ImportConfigValidationResult {
    // The source Cloud Storage URI specified in the import config.
    string input_gcs_source = 1;

    // The validation status of import config.
    google.rpc.Status status = 2;
  }

  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;

  // The list of response details of each document.
  repeated IndividualImportStatus individual_import_statuses = 2;

  // Validation statuses of the batch documents import config.
  repeated ImportConfigValidationResult import_config_validation_results = 4;

  // Total number of the documents that are qualified for importing.
  int32 total_document_count = 3;
}

// Request for `GetDocument`.
message GetDocumentRequest {
  // Required. The resource name of the dataset that the document belongs to.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Required. Document identifier.
  DocumentId document_id = 2 [(google.api.field_behavior) = REQUIRED];

  // If set, only fields listed here will be returned. Otherwise, all fields
  // will be returned by default.
  google.protobuf.FieldMask read_mask = 3;

  // List of pages for which the fields specified in the `read_mask` must
  // be served.
  DocumentPageRange page_range = 4;
}

// Response for `GetDocument`.
message GetDocumentResponse {
  // The requested document.
  Document document = 1;
}

// Request for `ListDocuments`.
message ListDocumentsRequest {
  // Required. The resource name of the dataset to be listed.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // The maximum number of documents to return. The service may return
  // fewer than this value.
  // If unspecified, at most 20 documents will be returned.
  // The maximum value is 100; values above 100 will be coerced to 100.
  int32 page_size = 2;

  // A page token, received from a previous `ListDocuments` call.
  // Provide this to retrieve the subsequent page.
  //
  // When paginating, all other parameters provided to `ListDocuments`
  // must match the call that provided the page token.
  string page_token = 3;

  // Optional. Query to filter the documents based on
  // https://google.aip.dev/160.
  // ## Currently supported query strings are:
  //
  // - `SplitType=DATASET_SPLIT_TEST|DATASET_SPLIT_TRAIN|DATASET_SPLIT_UNASSIGNED`
  // - `LabelingState=DOCUMENT_LABELED|DOCUMENT_UNLABELED|DOCUMENT_AUTO_LABELED`
  // - `DisplayName=\"file_name.pdf\"`
  // - `EntityType=abc/def`
  // - `TagName=\"auto-labeling-running\"|\"sampled\"`
  //
  // Note:
  // - Only `AND`, `=` and `!=` are supported.
  //     e.g. `DisplayName=file_name AND EntityType!=abc` IS supported.
  // - Wildcard `*` is supported only in `DisplayName` filter
  // - No duplicate filter keys are allowed,
  //     e.g. `EntityType=a AND EntityType=b` is NOT supported.
  // - String match is case sensitive (for filter `DisplayName` & `EntityType`).
  string filter = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Controls if the request requires a total size of matched
  // documents. See
  // [ListDocumentsResponse.total_size][google.cloud.documentai.v1beta3.ListDocumentsResponse.total_size].
  //
  // Enabling this flag may adversely impact performance.
  //
  // Defaults to false.
  bool return_total_size = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Number of results to skip beginning from the `page_token` if
  // provided. https://google.aip.dev/158#skipping-results. It must be a
  // non-negative integer. Negative values will be rejected. Note that this is
  // not the number of pages to skip. If this value causes the cursor to move
  // past the end of results,
  // [ListDocumentsResponse.document_metadata][google.cloud.documentai.v1beta3.ListDocumentsResponse.document_metadata]
  // and
  // [ListDocumentsResponse.next_page_token][google.cloud.documentai.v1beta3.ListDocumentsResponse.next_page_token]
  // will be empty.
  int32 skip = 8 [(google.api.field_behavior) = OPTIONAL];
}

// Response for `ListDocuments`.
message ListDocumentsResponse {
  // Document metadata corresponding to the listed documents.
  repeated DocumentMetadata document_metadata = 1;

  // A token, which can be sent as
  // [ListDocumentsRequest.page_token][google.cloud.documentai.v1beta3.ListDocumentsRequest.page_token]
  // to retrieve the next page. If this field is omitted, there are no
  // subsequent pages.
  string next_page_token = 2;

  // Total count of documents queried.
  int32 total_size = 3;
}

// Request for `BatchDeleteDocuments`.
message BatchDeleteDocumentsRequest {
  // Required. The dataset resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  //
  // Carries the same `Dataset` resource reference as the `dataset` field of
  // `GetDocumentRequest` and `ListDocumentsRequest`.
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Required. Dataset documents input. If given `filter`, all documents
  // satisfying the filter will be deleted. If given documentIds, a maximum of
  // 50 documents can be deleted in a batch. The request will be rejected if
  // more than 50 document_ids are provided.
  BatchDatasetDocuments dataset_documents = 3
      [(google.api.field_behavior) = REQUIRED];
}

// Response of the delete documents operation.
message BatchDeleteDocumentsResponse {}

// Metadata of the delete documents operation.
message BatchDeleteDocumentsMetadata {
  // The status of each individual document in the batch delete process.
  message IndividualBatchDeleteStatus {
    // The document id of the document.
    DocumentId document_id = 1;

    // The status of deleting the document in storage.
    google.rpc.Status status = 2;
  }

  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;

  // The list of response details of each document.
  repeated IndividualBatchDeleteStatus individual_batch_delete_statuses = 2;

  // Total number of documents being deleted from the dataset.
  int32 total_document_count = 3;

  // Total number of documents that failed to be deleted in storage.
  int32 error_document_count = 4;
}

// Request for `GetDatasetSchema`.
message GetDatasetSchemaRequest {
  // Required. The dataset schema resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/DatasetSchema"
    }
  ];

  // If set, only returns the visible fields of the schema.
  bool visible_fields_only = 2;
}

// Request for `UpdateDatasetSchema`.
message UpdateDatasetSchemaRequest {
  // Required. The name field of the `DatasetSchema` is used to identify the
  // resource to be updated.
  DatasetSchema dataset_schema = 1 [(google.api.field_behavior) = REQUIRED];

  // The update mask applies to the resource.
  google.protobuf.FieldMask update_mask = 2;
}

// Range of pages present in a document.
message DocumentPageRange {
  // First page number (one-based index) to be returned.
  int32 start = 1;

  // Last page number (one-based index) to be returned.
  int32 end = 2;
}

// Metadata about a document.
message DocumentMetadata {
  // Document identifier.
  DocumentId document_id = 1;

  // Number of pages in the document.
  int32 page_count = 2;

  // Type of the dataset split to which the document belongs.
  DatasetSplitType dataset_type = 3;

  // Labeling state of the document.
  DocumentLabelingState labeling_state = 5;

  // The display name of the document.
  string display_name = 6;
}