// xref: /aosp_15_r20/external/googleapis/google/cloud/documentai/v1beta3/dataset.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/documentai/v1beta3/document.proto";
import "google/cloud/documentai/v1beta3/document_io.proto";
import "google/cloud/documentai/v1beta3/document_schema.proto";

option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DatasetProto";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Declares the Document AI Warehouse schema resource type so that
// `resource_reference` annotations in this file (see
// `Dataset.DocumentWarehouseConfig.schema`) can refer to it.
option (google.api.resource_definition) = {
  type: "contentwarehouse.googleapis.com/Schema"
  pattern: "projects/{project}/locations/{location}/schemas/{schema}"
};

// A singleton resource under a
// [Processor][google.cloud.documentai.v1beta3.Processor] which configures a
// collection of documents.
//
// Where the documents live is selected through the `storage_source` oneof and
// how they are indexed through the `indexing_source` oneof; at most one member
// of each oneof can be set at a time.
message Dataset {
  option (google.api.resource) = {
    type: "documentai.googleapis.com/Dataset"
    pattern: "projects/{project}/locations/{location}/processors/{processor}/dataset"
  };

  // Configuration specific to the Cloud Storage-based implementation.
  message GCSManagedConfig {
    // Required. The Cloud Storage URI (a directory) where the documents
    // belonging to the dataset must be stored.
    GcsPrefix gcs_prefix = 1 [(google.api.field_behavior) = REQUIRED];
  }

  // Configuration specific to the Document AI Warehouse-based implementation.
  message DocumentWarehouseConfig {
    // Output only. The collection in Document AI Warehouse associated with the
    // dataset.
    string collection = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. The schema in Document AI Warehouse associated with the
    // dataset. References a `contentwarehouse.googleapis.com/Schema` resource.
    string schema = 2 [
      (google.api.field_behavior) = OUTPUT_ONLY,
      (google.api.resource_reference) = {
        type: "contentwarehouse.googleapis.com/Schema"
      }
    ];
  }

  // Configuration specific to an unmanaged dataset. Currently carries no
  // fields.
  message UnmanagedDatasetConfig {}

  // Configuration specific to spanner-based indexing. Currently carries no
  // fields.
  message SpannerIndexingConfig {}

  // Different states of a dataset.
  enum State {
    // Default unspecified enum, should not be used.
    STATE_UNSPECIFIED = 0;

    // Dataset has not been initialized.
    UNINITIALIZED = 1;

    // Dataset is being initialized.
    INITIALIZING = 2;

    // Dataset has been initialized.
    INITIALIZED = 3;
  }

  // Storage backend holding the dataset's documents.
  oneof storage_source {
    // Optional. User-managed Cloud Storage dataset configuration. Use this
    // configuration if the dataset documents are stored under a user-managed
    // Cloud Storage location.
    GCSManagedConfig gcs_managed_config = 3
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Deprecated. Warehouse-based dataset configuration is not
    // supported.
    DocumentWarehouseConfig document_warehouse_config = 5
        [deprecated = true, (google.api.field_behavior) = OPTIONAL];

    // Optional. Unmanaged dataset configuration. Use this configuration if the
    // dataset documents are managed by the document service internally (not
    // user-managed).
    UnmanagedDatasetConfig unmanaged_dataset_config = 6
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Indexing backend used for the dataset's documents.
  oneof indexing_source {
    // Optional. A lightweight indexing source with low latency and high
    // reliability, but lacking advanced features like CMEK and content-based
    // search.
    SpannerIndexingConfig spanner_indexing_config = 4
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Dataset resource name.
  // Format:
  // `projects/{project}/locations/{location}/processors/{processor}/dataset`
  string name = 1;

  // Required. State of the dataset. Ignored when updating dataset.
  State state = 2 [(google.api.field_behavior) = REQUIRED];
}

// Document Identifier.
//
// The `type` oneof carries the identifier flavor matching the dataset's
// storage option; `revision_ref` optionally narrows the identifier to one
// revision of that document.
message DocumentId {
  // Identifies a document uniquely within the scope of a dataset in the
  // user-managed Cloud Storage option.
  message GCSManagedDocumentId {
    // Required. The Cloud Storage URI where the actual document is stored.
    string gcs_uri = 1 [(google.api.field_behavior) = REQUIRED];

    // Deprecated. Id of the document (indexed) managed by Content Warehouse.
    string cw_doc_id = 2 [deprecated = true];
  }

  // Identifies a document uniquely within the scope of a dataset in unmanaged
  // option.
  message UnmanagedDocumentId {
    // Required. The id of the document.
    string doc_id = 1 [(google.api.field_behavior) = REQUIRED];
  }

  // The kind of identifier; at most one member can be set.
  oneof type {
    // A document id within user-managed Cloud Storage.
    GCSManagedDocumentId gcs_managed_doc_id = 1;

    // A document id within unmanaged dataset.
    UnmanagedDocumentId unmanaged_doc_id = 4;
  }

  // Points to a specific revision of the document if set.
  RevisionRef revision_ref = 3;
}

// Dataset Schema.
//
// Resource wrapping the document schema that applies to a processor's
// dataset.
message DatasetSchema {
  option (google.api.resource) = {
    type: "documentai.googleapis.com/DatasetSchema"
    pattern: "projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema"
  };

  // Dataset schema resource name.
  // Format:
  // `projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema`
  string name = 1;

  // Optional. Schema of the dataset.
  DocumentSchema document_schema = 3 [(google.api.field_behavior) = OPTIONAL];
}

// Dataset documents that the batch operation will be applied to.
//
// The target documents are chosen through the `criteria` oneof: either an
// explicit list of document identifiers or a filter expression.
message BatchDatasetDocuments {
  // List of individual DocumentIds.
  message IndividualDocumentIds {
    // Required. List of Document IDs indicating where the actual documents are
    // stored.
    repeated DocumentId document_ids = 1
        [(google.api.field_behavior) = REQUIRED];
  }

  // How the documents are selected; at most one member can be set.
  oneof criteria {
    // Document identifiers.
    IndividualDocumentIds individual_document_ids = 1;

    // A filter matching the documents.
    // Follows the same format and restriction as
    // [ListDocumentsRequest.filter][google.cloud.documentai.v1beta3.ListDocumentsRequest.filter].
    string filter = 2;
  }
}
193