// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/documentai/v1beta3/document.proto";
import "google/cloud/documentai/v1beta3/document_io.proto";
import "google/cloud/documentai/v1beta3/document_schema.proto";

option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DatasetProto";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Declares the Document AI Warehouse schema resource type so that
// `DocumentWarehouseConfig.schema` below can reference it.
option (google.api.resource_definition) = {
  type: "contentwarehouse.googleapis.com/Schema"
  pattern: "projects/{project}/locations/{location}/schemas/{schema}"
};

// A singleton resource under a
// [Processor][google.cloud.documentai.v1beta3.Processor] which configures a
// collection of documents.
message Dataset {
  option (google.api.resource) = {
    type: "documentai.googleapis.com/Dataset"
    pattern: "projects/{project}/locations/{location}/processors/{processor}/dataset"
  };

  // Configuration specific to the Cloud Storage-based implementation.
  message GCSManagedConfig {
    // Required. The Cloud Storage URI (a directory) where the documents
    // belonging to the dataset must be stored.
    GcsPrefix gcs_prefix = 1 [(google.api.field_behavior) = REQUIRED];
  }

  // Configuration specific to the Document AI Warehouse-based implementation.
  message DocumentWarehouseConfig {
    // Output only. The collection in Document AI Warehouse associated with the
    // dataset.
    string collection = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. The schema in Document AI Warehouse associated with the
    // dataset.
    string schema = 2 [
      (google.api.field_behavior) = OUTPUT_ONLY,
      (google.api.resource_reference) = {
        type: "contentwarehouse.googleapis.com/Schema"
      }
    ];
  }

  // Configuration specific to an unmanaged dataset.
  message UnmanagedDatasetConfig {}

  // Configuration specific to spanner-based indexing.
  message SpannerIndexingConfig {}

  // Different states of a dataset.
  enum State {
    // Default unspecified enum, should not be used.
    STATE_UNSPECIFIED = 0;

    // Dataset has not been initialized.
    UNINITIALIZED = 1;

    // Dataset is being initialized.
    INITIALIZING = 2;

    // Dataset has been initialized.
    INITIALIZED = 3;
  }

  // Storage configuration for the dataset's documents. The members are
  // mutually exclusive: at most one of them is set at a time.
  oneof storage_source {
    // Optional. User-managed Cloud Storage dataset configuration. Use this
    // configuration if the dataset documents are stored under a user-managed
    // Cloud Storage location.
    GCSManagedConfig gcs_managed_config = 3
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Deprecated. Warehouse-based dataset configuration is not
    // supported.
    DocumentWarehouseConfig document_warehouse_config = 5
        [deprecated = true, (google.api.field_behavior) = OPTIONAL];

    // Optional. Unmanaged dataset configuration. Use this configuration if the
    // dataset documents are managed by the document service internally (not
    // user-managed).
    UnmanagedDatasetConfig unmanaged_dataset_config = 6
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Indexing configuration for the dataset. Spanner-based indexing is the
  // only member currently defined.
  oneof indexing_source {
    // Optional. A lightweight indexing source with low latency and high
    // reliability, but lacking advanced features like CMEK and content-based
    // search.
    SpannerIndexingConfig spanner_indexing_config = 4
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Dataset resource name.
  // Format:
  // `projects/{project}/locations/{location}/processors/{processor}/dataset`
  string name = 1;

  // Required. State of the dataset. Ignored when updating dataset.
  State state = 2 [(google.api.field_behavior) = REQUIRED];
}

// Document Identifier.
message DocumentId {
  // Identifies a document uniquely within the scope of a dataset in the
  // user-managed Cloud Storage option.
  message GCSManagedDocumentId {
    // Required. The Cloud Storage URI where the actual document is stored.
    string gcs_uri = 1 [(google.api.field_behavior) = REQUIRED];

    // Id of the document (indexed) managed by Content Warehouse.
    // Deprecated.
    string cw_doc_id = 2 [deprecated = true];
  }

  // Identifies a document uniquely within the scope of a dataset in unmanaged
  // option.
  message UnmanagedDocumentId {
    // Required. The id of the document.
    string doc_id = 1 [(google.api.field_behavior) = REQUIRED];
  }

  // The kind of identifier carried by this message. The members are mutually
  // exclusive: at most one of them is set at a time.
  oneof type {
    // A document id within user-managed Cloud Storage.
    GCSManagedDocumentId gcs_managed_doc_id = 1;

    // A document id within unmanaged dataset.
    UnmanagedDocumentId unmanaged_doc_id = 4;
  }

  // Points to a specific revision of the document if set.
  RevisionRef revision_ref = 3;
}

// Dataset Schema.
message DatasetSchema {
  option (google.api.resource) = {
    type: "documentai.googleapis.com/DatasetSchema"
    pattern: "projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema"
  };

  // Dataset schema resource name.
  // Format:
  // `projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema`
  string name = 1;

  // Optional. Schema of the dataset.
  DocumentSchema document_schema = 3 [(google.api.field_behavior) = OPTIONAL];
}

// Dataset documents that the batch operation will be applied to.
message BatchDatasetDocuments {
  // List of individual DocumentIds.
  message IndividualDocumentIds {
    // Required. List of Document IDs indicating where the actual documents are
    // stored.
    repeated DocumentId document_ids = 1
        [(google.api.field_behavior) = REQUIRED];
  }

  // Selects the documents the batch operation applies to: either an explicit
  // list of ids or a filter expression. The members are mutually exclusive.
  oneof criteria {
    // Document identifiers.
    IndividualDocumentIds individual_document_ids = 1;

    // A filter matching the documents.
    // Follows the same format and restriction as
    // [ListDocumentsRequest.filter][google.cloud.documentai.v1beta3.ListDocumentsRequest.filter].
    string filter = 2;
  }
}