documentai/v1beta3/document_service.proto

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/documentai/v1beta3/dataset.proto";
import "google/cloud/documentai/v1beta3/document.proto";
import "google/cloud/documentai/v1beta3/document_io.proto";
import "google/cloud/documentai/v1beta3/operation_metadata.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/field_mask.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentAiDocumentService";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Service to call Cloud DocumentAI to manage document collection (dataset).
service DocumentService {
  option (google.api.default_host) = "documentai.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Updates metadata associated with a dataset.
  rpc UpdateDataset(UpdateDatasetRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      patch: "/v1beta3/{dataset.name=projects/*/locations/*/processors/*/dataset}"
      body: "dataset"
    };
    option (google.api.method_signature) = "dataset,update_mask";
    option (google.longrunning.operation_info) = {
      response_type: "Dataset"
      metadata_type: "UpdateDatasetOperationMetadata"
    };
  }

  // Import documents into a dataset.
  rpc ImportDocuments(ImportDocumentsRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:importDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
    option (google.longrunning.operation_info) = {
      response_type: "ImportDocumentsResponse"
      metadata_type: "ImportDocumentsMetadata"
    };
  }

  // Returns relevant fields present in the requested document.
  rpc GetDocument(GetDocumentRequest) returns (GetDocumentResponse) {
    option (google.api.http) = {
      get: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:getDocument"
    };
    option (google.api.method_signature) = "dataset";
  }

  // Returns a list of documents present in the dataset.
  rpc ListDocuments(ListDocumentsRequest) returns (ListDocumentsResponse) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:listDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
  }

  // Deletes a set of documents.
  rpc BatchDeleteDocuments(BatchDeleteDocumentsRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:batchDeleteDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
    option (google.longrunning.operation_info) = {
      response_type: "BatchDeleteDocumentsResponse"
      metadata_type: "BatchDeleteDocumentsMetadata"
    };
  }

  // Gets the `DatasetSchema` of a `Dataset`.
  rpc GetDatasetSchema(GetDatasetSchemaRequest) returns (DatasetSchema) {
    option (google.api.http) = {
      get: "/v1beta3/{name=projects/*/locations/*/processors/*/dataset/datasetSchema}"
    };
    option (google.api.method_signature) = "name";
  }

  // Updates a `DatasetSchema`.
  rpc UpdateDatasetSchema(UpdateDatasetSchemaRequest) returns (DatasetSchema) {
    option (google.api.http) = {
      patch: "/v1beta3/{dataset_schema.name=projects/*/locations/*/processors/*/dataset/datasetSchema}"
      body: "dataset_schema"
    };
    option (google.api.method_signature) = "dataset_schema,update_mask";
  }
}

// Documents belonging to a dataset will be split into different groups
// referred to as splits: train, test.
enum DatasetSplitType {
  // Default value if the enum is not set.
  DATASET_SPLIT_TYPE_UNSPECIFIED = 0;

  // Identifies the train documents.
  DATASET_SPLIT_TRAIN = 1;

  // Identifies the test documents.
  DATASET_SPLIT_TEST = 2;

  // Identifies the unassigned documents.
  DATASET_SPLIT_UNASSIGNED = 3;
}

// Describes the labeling status of a document.
enum DocumentLabelingState {
  // Default value if the enum is not set.
  DOCUMENT_LABELING_STATE_UNSPECIFIED = 0;

  // Document has been labeled.
  DOCUMENT_LABELED = 1;

  // Document has not been labeled.
  DOCUMENT_UNLABELED = 2;

  // Document has been auto-labeled.
  DOCUMENT_AUTO_LABELED = 3;
}

message UpdateDatasetRequest {
  // Required. The `name` field of the `Dataset` is used to identify the
  // resource to be updated.
  Dataset dataset = 1 [(google.api.field_behavior) = REQUIRED];

  // The update mask applies to the resource.
  google.protobuf.FieldMask update_mask = 2;
}

message UpdateDatasetOperationMetadata {
  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;
}

message ImportDocumentsRequest {
  // Config for importing documents.
  // Each batch can have its own dataset split type.
  message BatchDocumentsImportConfig {
    // The config for auto-split.
    message AutoSplitConfig {
      // Ratio of training dataset split.
      float training_split_ratio = 1;
    }

    oneof split_type_config {
      // Target dataset split where the documents must be stored.
      DatasetSplitType dataset_split = 2;

      // If set, documents will be automatically split into training and test
      // split category with the specified ratio.
      AutoSplitConfig auto_split_config = 3;
    }

    // The common config to specify a set of documents used as input.
    BatchDocumentsInputConfig batch_input_config = 1;
  }

  // Required. The dataset resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Required. The Cloud Storage uri containing raw documents that must be
  // imported.
  repeated BatchDocumentsImportConfig batch_documents_import_configs = 4
      [(google.api.field_behavior) = REQUIRED];
}

// Response of the import document operation.
message ImportDocumentsResponse {}

// Metadata of the import document operation.
message ImportDocumentsMetadata {
  // The status of each individual document in the import process.
  message IndividualImportStatus {
    // The source Cloud Storage URI of the document.
    string input_gcs_source = 1;

    // The status of the importing of the document.
    google.rpc.Status status = 2;

    // The document id of imported document if it was successful, otherwise
    // empty.
    DocumentId output_document_id = 4;
  }

  // The validation status of each import config. Status is set to an error if
  // there are no documents to import in the `import_config`, or `OK` if the
  // operation will try to proceed with at least one document.
  message ImportConfigValidationResult {
    // The source Cloud Storage URI specified in the import config.
    string input_gcs_source = 1;

    // The validation status of import config.
    google.rpc.Status status = 2;
  }

  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;

  // The list of response details of each document.
  repeated IndividualImportStatus individual_import_statuses = 2;

  // Validation statuses of the batch documents import config.
  repeated ImportConfigValidationResult import_config_validation_results = 4;

  // Total number of the documents that are qualified for importing.
  int32 total_document_count = 3;
}

message GetDocumentRequest {
  // Required. The resource name of the dataset that the document belongs to .
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Required. Document identifier.
  DocumentId document_id = 2 [(google.api.field_behavior) = REQUIRED];

  // If set, only fields listed here will be returned. Otherwise, all fields
  // will be returned by default.
  google.protobuf.FieldMask read_mask = 3;

  // List of pages for which the fields specified in the `read_mask` must
  // be served.
  DocumentPageRange page_range = 4;
}

message GetDocumentResponse {
  Document document = 1;
}

message ListDocumentsRequest {
  // Required. The resource name of the dataset to be listed.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // The maximum number of documents to return. The service may return
  // fewer than this value.
  // If unspecified, at most 20 documents will be returned.
  // The maximum value is 100; values above 100 will be coerced to 100.
  int32 page_size = 2;

  // A page token, received from a previous `ListDocuments` call.
  // Provide this to retrieve the subsequent page.
  //
  // When paginating, all other parameters provided to `ListDocuments`
  // must match the call that provided the page token.
  string page_token = 3;

  // Optional. Query to filter the documents based on
  // https://google.aip.dev/160.
  // ## Currently support query strings are:
  //
  // `SplitType=DATASET_SPLIT_TEST|DATASET_SPLIT_TRAIN|DATASET_SPLIT_UNASSIGNED`
  // - `LabelingState=DOCUMENT_LABELED|DOCUMENT_UNLABELED|DOCUMENT_AUTO_LABELED`
  // - `DisplayName=\"file_name.pdf\"`
  // - `EntityType=abc/def`
  // - `TagName=\"auto-labeling-running\"|\"sampled\"`
  //
  // Note:
  // - Only `AND`, `=` and `!=` are supported.
  //     e.g. `DisplayName=file_name AND EntityType!=abc` IS supported.
  // - Wildcard `*` is supported only in `DisplayName` filter
  // - No duplicate filter keys are allowed,
  //     e.g. `EntityType=a AND EntityType=b` is NOT supported.
  // - String match is case sensitive (for filter `DisplayName` & `EntityType`).
  string filter = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Controls if the request requires a total size of matched
  // documents. See
  // [ListDocumentsResponse.total_size][google.cloud.documentai.v1beta3.ListDocumentsResponse.total_size].
  //
  // Enabling this flag may adversely impact performance.
  //
  // Defaults to false.
  bool return_total_size = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Number of results to skip beginning from the `page_token` if
  // provided. https://google.aip.dev/158#skipping-results. It must be a
  // non-negative integer. Negative values will be rejected. Note that this is
  // not the number of pages to skip. If this value causes the cursor to move
  // past the end of results,
  // [ListDocumentsResponse.document_metadata][google.cloud.documentai.v1beta3.ListDocumentsResponse.document_metadata]
  // and
  // [ListDocumentsResponse.next_page_token][google.cloud.documentai.v1beta3.ListDocumentsResponse.next_page_token]
  // will be empty.
  int32 skip = 8 [(google.api.field_behavior) = OPTIONAL];
}

message ListDocumentsResponse {
  // Document metadata corresponding to the listed documents.
  repeated DocumentMetadata document_metadata = 1;

  // A token, which can be sent as
  // [ListDocumentsRequest.page_token][google.cloud.documentai.v1beta3.ListDocumentsRequest.page_token]
  // to retrieve the next page. If this field is omitted, there are no
  // subsequent pages.
  string next_page_token = 2;

  // Total count of documents queried.
  int32 total_size = 3;
}

message BatchDeleteDocumentsRequest {
  // Required. The dataset resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Dataset documents input. If given `filter`, all documents
  // satisfying the filter will be deleted. If given documentIds, a maximum of
  // 50 documents can be deleted in a batch. The request will be rejected if
  // more than 50 document_ids are provided.
  BatchDatasetDocuments dataset_documents = 3
      [(google.api.field_behavior) = REQUIRED];
}

// Response of the delete documents operation.
message BatchDeleteDocumentsResponse {}

message BatchDeleteDocumentsMetadata {
  // The status of each individual document in the batch delete process.
  message IndividualBatchDeleteStatus {
    // The document id of the document.
    DocumentId document_id = 1;

    // The status of deleting the document in storage.
    google.rpc.Status status = 2;
  }

  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;

  // The list of response details of each document.
  repeated IndividualBatchDeleteStatus individual_batch_delete_statuses = 2;

  // Total number of documents deleting from dataset.
  int32 total_document_count = 3;

  // Total number of documents that failed to be deleted in storage.
  int32 error_document_count = 4;
}

// Request for `GetDatasetSchema`.
message GetDatasetSchemaRequest {
  // Required. The dataset schema resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/DatasetSchema"
    }
  ];

  // If set, only returns the visible fields of the schema.
  bool visible_fields_only = 2;
}

// Request for `UpdateDatasetSchema`.
message UpdateDatasetSchemaRequest {
  // Required. The name field of the `DatasetSchema` is used to identify the
  // resource to be updated.
  DatasetSchema dataset_schema = 1 [(google.api.field_behavior) = REQUIRED];

  // The update mask applies to the resource.
  google.protobuf.FieldMask update_mask = 2;
}

// Range of pages present in a document.
message DocumentPageRange {
  // First page number (one-based index) to be returned.
  int32 start = 1;

  // Last page number (one-based index) to be returned.
  int32 end = 2;
}

// Metadata about a document.
message DocumentMetadata {
  // Document identifier.
  DocumentId document_id = 1;

  // Number of pages in the document.
  int32 page_count = 2;

  // Type of the dataset split to which the document belongs.
  DatasetSplitType dataset_type = 3;

  // Labeling state of the document.
  DocumentLabelingState labeling_state = 5;

  // The display name of the document.
  string display_name = 6;
}