// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.contentwarehouse.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/contentwarehouse/v1/common.proto";
import "google/iam/v1/policy.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.ContentWarehouse.V1";
option go_package = "cloud.google.com/go/contentwarehouse/apiv1/contentwarehousepb;contentwarehousepb";
option java_multiple_files = true;
option java_outer_classname = "PipelinesProto";
option java_package = "com.google.cloud.contentwarehouse.v1";
option php_namespace = "Google\\Cloud\\ContentWarehouse\\V1";
option ruby_package = "Google::Cloud::ContentWarehouse::V1";
option (google.api.resource_definition) = {
  type: "cloudfunctions.googleapis.com/CloudFunction"
  pattern: "projects/{project}/locations/{location}/functions/{function}"
};

// Response message of RunPipeline method.
message RunPipelineResponse {}

// Metadata message of RunPipeline method.
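//
// For illustration, the metadata for a GcsIngest run over three files, one of
// which failed, might look like the following in text format (all values are
// placeholders, not output from any real run):
//
//   total_file_count: 3
//   failed_file_count: 1
//   gcs_ingest_pipeline_metadata { input_path: "gs://my-bucket/my-folder" }
//   individual_document_statuses {
//     document_id: "some-document-id"
//     status { code: 3 message: "Unsupported file type" }
//   }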
message RunPipelineMetadata {
  // The metadata message for GcsIngest pipeline.
  message GcsIngestPipelineMetadata {
    // The input Cloud Storage folder in this pipeline.
    // Format: `gs://<bucket-name>/<folder-name>`.
    string input_path = 1;
  }

  // The metadata message for Export-to-CDW pipeline.
  message ExportToCdwPipelineMetadata {
    // The input list of all the resource names of the documents to be exported.
    repeated string documents = 1;

    // The output CDW dataset resource name.
    string doc_ai_dataset = 2;

    // The output Cloud Storage folder in this pipeline.
    string output_path = 3;
  }

  // The metadata message for Process-with-DocAi pipeline.
  message ProcessWithDocAiPipelineMetadata {
    // The input list of all the resource names of the documents to be
    // processed.
    repeated string documents = 1;

    // The DocAI processor to process the documents with.
    ProcessorInfo processor_info = 2;
  }

  // The status of processing a document.
  message IndividualDocumentStatus {
    // Document identifier of an existing document.
    string document_id = 1;

    // The status of processing the document.
    google.rpc.Status status = 2;
  }

  // Number of files that were processed by the pipeline.
  int32 total_file_count = 1;

  // Number of files that have failed at some point in the pipeline.
  int32 failed_file_count = 2;

  // The user's unique identification and group information.
  UserInfo user_info = 3;

  // The pipeline metadata.
  oneof pipeline_metadata {
    // The pipeline metadata for GcsIngest pipeline.
    GcsIngestPipelineMetadata gcs_ingest_pipeline_metadata = 4;

    // The pipeline metadata for Export-to-CDW pipeline.
    ExportToCdwPipelineMetadata export_to_cdw_pipeline_metadata = 6;

    // The pipeline metadata for Process-with-DocAi pipeline.
    ProcessWithDocAiPipelineMetadata process_with_doc_ai_pipeline_metadata = 7;
  }

  // The list of response details for each document.
  repeated IndividualDocumentStatus individual_document_statuses = 5;
}

// The DocAI processor information.
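//
// For illustration, a populated ProcessorInfo might look like the following
// in text format (all resource names and the document type are placeholders):
//
//   processor_name: "projects/my-project/locations/us/processors/my-processor"
//   document_type: "invoice"
//   schema_name: "projects/123456/locations/us/documentSchemas/my-schema-id"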
message ProcessorInfo {
  // The processor resource name.
  // Format is `projects/{project}/locations/{location}/processors/{processor}`,
  // or
  // `projects/{project}/locations/{location}/processors/{processor}/processorVersions/{processorVersion}`
  string processor_name = 1;

  // The processor will process the documents with this document type.
  string document_type = 2;

  // The Document schema resource name. All documents processed by this
  // processor will use this schema.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string schema_name = 3;
}

// The ingestion pipeline config.
message IngestPipelineConfig {
  // The document level acl policy config.
  // This refers to an Identity and Access Management (IAM) policy, which
  // specifies access controls for all documents ingested by the pipeline. The
  // [role][google.iam.v1.Binding.role] and
  // [members][google.iam.v1.Binding.members] under the policy need to be
  // specified.
  //
  // The following roles are supported for document level acl control:
  // * roles/contentwarehouse.documentAdmin
  // * roles/contentwarehouse.documentEditor
  // * roles/contentwarehouse.documentViewer
  //
  // The following members are supported for document level acl control:
  // * user:[email protected]
  // * group:[email protected]
  //
  // Note that for documents searched with LLM, only a single-level user or
  // group acl check is supported. See the illustrative policy sketch below.
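  //
  // For illustration, a minimal policy might look like the following in text
  // format (the member emails are placeholders):
  //
  //   bindings {
  //     role: "roles/contentwarehouse.documentViewer"
  //     members: "user:alice@example.com"
  //     members: "group:doc-readers@example.com"
  //   }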
  google.iam.v1.Policy document_acl_policy = 1;

  // The document text extraction enabled flag.
  // If the flag is set to true, DWH will perform text extraction on the raw
  // document.
  bool enable_document_text_extraction = 2;

  // Optional. The name of the folder to which all ingested documents will be
  // linked during the ingestion process. Format is
  // `projects/{project}/locations/{location}/documents/{folder_id}`.
  string folder = 3 [(google.api.field_behavior) = OPTIONAL];

  // The Cloud Function resource name. The Cloud Function needs to live inside
  // the consumer project and be accessible to the Document AI Warehouse P4SA.
  // Only Cloud Functions V2 is supported. Cloud Function execution should
  // complete within 5 minutes or the file ingestion may fail due to timeout.
  // Format: `https://{region}-{project_id}.cloudfunctions.net/{cloud_function}`
  //
  // The following keys are available in the request JSON payload:
  // * display_name
  // * properties
  // * plain_text
  // * reference_id
  // * document_schema_name
  // * raw_document_path
  // * raw_document_file_type
  //
  // The following keys from the Cloud Function JSON response payload will be
  // ingested into the Document AI Warehouse as part of the Document proto
  // content and/or related information. The original values will be overridden
  // if any key is present in the response:
  // * display_name
  // * properties
  // * plain_text
  // * document_acl_policy
  // * folder
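  //
  // For illustration only, a request payload carrying these keys might look
  // like this (every value below is a placeholder, not an API guarantee):
  //
  //   {
  //     "display_name": "invoice-001",
  //     "properties": {...},
  //     "plain_text": "...",
  //     "reference_id": "ref-001",
  //     "document_schema_name": "projects/123456/locations/us/documentSchemas/my-schema-id",
  //     "raw_document_path": "gs://my-bucket/my-folder/invoice-001.pdf",
  //     "raw_document_file_type": "RAW_DOCUMENT_FILE_TYPE_PDF"
  //   }
  //
  // A response that overrides the display name and links the document into a
  // folder might look like (again, all values are placeholders):
  //
  //   {
  //     "display_name": "invoice-001-reviewed",
  //     "folder": "projects/123456/locations/us/documents/my-folder-id"
  //   }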
  string cloud_function = 4 [(google.api.resource_reference) = {
    type: "cloudfunctions.googleapis.com/CloudFunction"
  }];
}

// The configuration of the Cloud Storage Ingestion pipeline.
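//
// For illustration, a minimal GcsIngestPipeline might look like the following
// in text format (paths and names are placeholders):
//
//   input_path: "gs://my-bucket/ingest-folder"
//   schema_name: "projects/123456/locations/us/documentSchemas/my-schema-id"
//   skip_ingested_documents: true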
message GcsIngestPipeline {
  // The input Cloud Storage folder. All files under this folder will be
  // imported to Document Warehouse.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string input_path = 1;

  // The Document Warehouse schema resource name. All documents processed by
  // this pipeline will use this schema.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string schema_name = 2;

  // The Doc AI processor type name. Only used when the ingested files are in
  // the Doc AI Document proto format.
  string processor_type = 3;

  // The flag indicating whether to skip ingested documents.
  // If it is set to true, documents in Cloud Storage whose custom metadata
  // contains the key "status" with the value "ingested" (that is,
  // "status=ingested") will be skipped during ingestion.
  bool skip_ingested_documents = 4;

  // Optional. The config for the Cloud Storage Ingestion pipeline.
  // It provides additional customization options to run the pipeline and can
  // be skipped if it is not applicable.
  IngestPipelineConfig pipeline_config = 5
      [(google.api.field_behavior) = OPTIONAL];
}

// The configuration of the Cloud Storage Ingestion with DocAI Processors
// pipeline.
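//
// For illustration, a pipeline with one classifier and two extract processors
// might look like the following in text format (all names and document types
// are placeholders):
//
//   input_path: "gs://my-bucket/ingest-folder"
//   split_classify_processor_info {
//     processor_name: "projects/my-project/locations/us/processors/classifier"
//   }
//   extract_processor_infos {
//     processor_name: "projects/my-project/locations/us/processors/invoice-parser"
//     document_type: "invoice"
//   }
//   extract_processor_infos {
//     processor_name: "projects/my-project/locations/us/processors/receipt-parser"
//     document_type: "receipt"
//   }
//   processor_results_folder_path: "gs://my-bucket/processor-results"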
message GcsIngestWithDocAiProcessorsPipeline {
  // The input Cloud Storage folder. All files under this folder will be
  // imported to Document Warehouse.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string input_path = 1;

  // The split and classify processor information.
  // The split and classify result will be used to find a matched extract
  // processor.
  ProcessorInfo split_classify_processor_info = 2;

  // The extract processors information.
  // One matched extract processor will be used to process documents based on
  // the classify processor result. If no classify processor is specified, the
  // first extract processor will be used.
  repeated ProcessorInfo extract_processor_infos = 3;

  // The Cloud Storage folder path used to store the raw results from
  // processors.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string processor_results_folder_path = 4;

  // The flag indicating whether to skip ingested documents.
  // If it is set to true, documents in Cloud Storage whose custom metadata
  // contains the key "status" with the value "ingested" (that is,
  // "status=ingested") will be skipped during ingestion.
  bool skip_ingested_documents = 5;

  // Optional. The config for the Cloud Storage Ingestion with DocAI Processors
  // pipeline. It provides additional customization options to run the pipeline
  // and can be skipped if it is not applicable.
  IngestPipelineConfig pipeline_config = 6
      [(google.api.field_behavior) = OPTIONAL];
}

// The configuration of the pipeline for exporting documents from Document
// Warehouse to CDW.
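//
// For illustration, an export of two documents with an 80/20 training/test
// split might look like the following in text format (all names are
// placeholders):
//
//   documents: "projects/123456/locations/us/documents/doc-id-1"
//   documents: "projects/123456/locations/us/documents/doc-id-2"
//   export_folder_path: "gs://my-bucket/export-folder"
//   doc_ai_dataset: "projects/my-project/locations/us/processors/my-processor/dataset"
//   training_split_ratio: 0.8
//
// Here a training_split_ratio of 0.8 sends 80% of the documents to the
// training set and the remaining 20% to the test set.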
message ExportToCdwPipeline {
  // The list of all the resource names of the documents to be processed.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  repeated string documents = 1;

  // The Cloud Storage folder path used to store the exported documents before
  // being sent to CDW.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string export_folder_path = 2;

  // Optional. The CDW dataset resource name. If not set, the documents will be
  // exported to Cloud Storage only. Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string doc_ai_dataset = 3 [(google.api.field_behavior) = OPTIONAL];

  // Ratio of the training dataset split. When importing into Document AI
  // Workbench, documents will be automatically split into training and test
  // categories with the specified ratio. This field is required if
  // doc_ai_dataset is set.
  float training_split_ratio = 4;
}

// The configuration of the pipeline for processing documents in Document
// Warehouse with DocAI processors.
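//
// For illustration, a run that processes two documents with one processor
// might look like the following in text format (all names are placeholders):
//
//   documents: "projects/123456/locations/us/documents/doc-id-1"
//   documents: "projects/123456/locations/us/documents/doc-id-2"
//   export_folder_path: "gs://my-bucket/export-folder"
//   processor_info {
//     processor_name: "projects/my-project/locations/us/processors/my-processor"
//   }
//   processor_results_folder_path: "gs://my-bucket/processor-results"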
message ProcessWithDocAiPipeline {
  // The list of all the resource names of the documents to be processed.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  repeated string documents = 1;

  // The Cloud Storage folder path used to store the exported documents before
  // being sent to CDW.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string export_folder_path = 2;

  // The CDW processor information.
  ProcessorInfo processor_info = 3;

  // The Cloud Storage folder path used to store the raw results from
  // processors.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string processor_results_folder_path = 4;
}