// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.contentwarehouse.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/contentwarehouse/v1/common.proto";
import "google/iam/v1/policy.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.ContentWarehouse.V1";
option go_package = "cloud.google.com/go/contentwarehouse/apiv1/contentwarehousepb;contentwarehousepb";
option java_multiple_files = true;
option java_outer_classname = "PipelinesProto";
option java_package = "com.google.cloud.contentwarehouse.v1";
option php_namespace = "Google\\Cloud\\ContentWarehouse\\V1";
option ruby_package = "Google::Cloud::ContentWarehouse::V1";
option (google.api.resource_definition) = {
  type: "cloudfunctions.googleapis.com/CloudFunction"
  pattern: "projects/{project}/locations/{location}/functions/{function}"
};

// Response message of the RunPipeline method.
message RunPipelineResponse {}

// Metadata message of the RunPipeline method.
message RunPipelineMetadata {
  // The metadata message for the GcsIngest pipeline.
  message GcsIngestPipelineMetadata {
    // The input Cloud Storage folder in this pipeline.
    // Format: `gs://<bucket-name>/<folder-name>`.
    string input_path = 1;
  }

  // The metadata message for the Export-to-CDW pipeline.
  message ExportToCdwPipelineMetadata {
    // The input list of all the resource names of the documents to be
    // exported.
    repeated string documents = 1;

    // The output CDW dataset resource name.
    string doc_ai_dataset = 2;

    // The output Cloud Storage folder in this pipeline.
    string output_path = 3;
  }

  // The metadata message for the Process-with-DocAi pipeline.
  message ProcessWithDocAiPipelineMetadata {
    // The input list of all the resource names of the documents to be
    // processed.
    repeated string documents = 1;

    // The DocAI processor to process the documents with.
    ProcessorInfo processor_info = 2;
  }

  // The status of processing a document.
  message IndividualDocumentStatus {
    // Document identifier of an existing document.
    string document_id = 1;

    // The status of processing the document.
    google.rpc.Status status = 2;
  }

  // Number of files that were processed by the pipeline.
  int32 total_file_count = 1;

  // Number of files that have failed at some point in the pipeline.
  int32 failed_file_count = 2;

  // User unique identification and groups information.
  UserInfo user_info = 3;

  // The pipeline metadata.
  oneof pipeline_metadata {
    // The pipeline metadata for the GcsIngest pipeline.
    GcsIngestPipelineMetadata gcs_ingest_pipeline_metadata = 4;

    // The pipeline metadata for the Export-to-CDW pipeline.
    ExportToCdwPipelineMetadata export_to_cdw_pipeline_metadata = 6;

    // The pipeline metadata for the Process-with-DocAi pipeline.
    ProcessWithDocAiPipelineMetadata process_with_doc_ai_pipeline_metadata = 7;
  }

  // The list of response details of each document.
  repeated IndividualDocumentStatus individual_document_statuses = 5;
}
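// A minimal, illustrative sketch (not part of the API definition) of what a
// RunPipelineMetadata message for a GcsIngest run might look like in textproto
// form; the bucket, folder, and document id below are hypothetical
// placeholders.
//
//   total_file_count: 10
//   failed_file_count: 1
//   gcs_ingest_pipeline_metadata {
//     input_path: "gs://example-bucket/input-folder"
//   }
//   individual_document_statuses {
//     document_id: "example-document-id"
//     status { code: 3 message: "Unsupported file type." }
//   }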
// The DocAI processor information.
message ProcessorInfo {
  // The processor resource name.
  // Format is
  // `projects/{project}/locations/{location}/processors/{processor}`, or
  // `projects/{project}/locations/{location}/processors/{processor}/processorVersions/{processorVersion}`
  string processor_name = 1;

  // The processor will process the documents with this document type.
  string document_type = 2;

  // The Document schema resource name. All documents processed by this
  // processor will use this schema.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string schema_name = 3;
}

// The ingestion pipeline config.
message IngestPipelineConfig {
  // The document level acl policy config.
  // This refers to an Identity and Access Management (IAM) policy, which
  // specifies access controls for all documents ingested by the pipeline. The
  // [role][google.iam.v1.Binding.role] and
  // [members][google.iam.v1.Binding.members] under the policy need to be
  // specified.
  //
  // The following roles are supported for document level acl control:
  // * roles/contentwarehouse.documentAdmin
  // * roles/contentwarehouse.documentEditor
  // * roles/contentwarehouse.documentViewer
  //
  // The following members are supported for document level acl control:
  // * user:[email protected]
  // * group:[email protected]
  //
  // Note that for documents searched with LLM, only a single-level user or
  // group acl check is supported.
  google.iam.v1.Policy document_acl_policy = 1;

  // The document text extraction enabled flag.
  // If the flag is set to true, DWH will perform text extraction on the raw
  // document.
  bool enable_document_text_extraction = 2;

  // Optional. The name of the folder to which all ingested documents will be
  // linked during the ingestion process. Format is
  // `projects/{project}/locations/{location}/documents/{folder_id}`.
  string folder = 3 [(google.api.field_behavior) = OPTIONAL];

  // The Cloud Function resource name. The Cloud Function needs to live inside
  // the consumer project and be accessible to the Document AI Warehouse P4SA.
  // Only Cloud Functions V2 is supported. Cloud Function execution should
  // complete within 5 minutes, or the file ingestion may fail due to timeout.
  // Format: `https://{region}-{project_id}.cloudfunctions.net/{cloud_function}`
  //
  // The following keys are available in the request JSON payload:
  // * display_name
  // * properties
  // * plain_text
  // * reference_id
  // * document_schema_name
  // * raw_document_path
  // * raw_document_file_type
  //
  // The following keys from the Cloud Function JSON response payload will be
  // ingested into Document AI Warehouse as part of the Document proto content
  // and/or related information. The original values will be overridden if any
  // key is present in the response:
  // * display_name
  // * properties
  // * plain_text
  // * document_acl_policy
  // * folder
  string cloud_function = 4 [(google.api.resource_reference) = {
    type: "cloudfunctions.googleapis.com/CloudFunction"
  }];
}
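// A minimal, illustrative sketch (not part of the API definition) of an
// IngestPipelineConfig in textproto form. The group address, project, folder
// id, and Cloud Function URL below are hypothetical placeholders.
//
//   document_acl_policy {
//     bindings {
//       role: "roles/contentwarehouse.documentViewer"
//       members: "group:[email protected]"
//     }
//   }
//   enable_document_text_extraction: true
//   folder: "projects/example-project/locations/us/documents/example-folder-id"
//   cloud_function: "https://us-central1-example-project.cloudfunctions.net/example-enrichment-fn"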
// The configuration of the Cloud Storage Ingestion pipeline.
message GcsIngestPipeline {
  // The input Cloud Storage folder. All files under this folder will be
  // imported to Document Warehouse.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string input_path = 1;

  // The Document Warehouse schema resource name. All documents processed by
  // this pipeline will use this schema.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string schema_name = 2;

  // The Doc AI processor type name. Only used when the format of the ingested
  // files is the Doc AI Document proto format.
  string processor_type = 3;

  // The flag of whether to skip already-ingested documents.
  // If it is set to true, documents in Cloud Storage whose custom metadata
  // contains the key "status" with the value "status=ingested" will be
  // skipped during ingestion.
  bool skip_ingested_documents = 4;

  // Optional. The config for the Cloud Storage Ingestion pipeline.
  // It provides additional customization options to run the pipeline and can
  // be skipped if it is not applicable.
  IngestPipelineConfig pipeline_config = 5
      [(google.api.field_behavior) = OPTIONAL];
}

// The configuration of the Cloud Storage Ingestion with DocAI Processors
// pipeline.
message GcsIngestWithDocAiProcessorsPipeline {
  // The input Cloud Storage folder. All files under this folder will be
  // imported to Document Warehouse.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string input_path = 1;

  // The split and classify processor information.
  // The split and classify result will be used to find a matched extract
  // processor.
  ProcessorInfo split_classify_processor_info = 2;

  // The extract processors information.
  // One matched extract processor will be used to process documents based on
  // the classify processor result. If no classify processor is specified, the
  // first extract processor will be used.
  repeated ProcessorInfo extract_processor_infos = 3;

  // The Cloud Storage folder path used to store the raw results from
  // processors.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string processor_results_folder_path = 4;

  // The flag of whether to skip already-ingested documents.
  // If it is set to true, documents in Cloud Storage whose custom metadata
  // contains the key "status" with the value "status=ingested" will be
  // skipped during ingestion.
  bool skip_ingested_documents = 5;

  // Optional. The config for the Cloud Storage Ingestion with DocAI Processors
  // pipeline. It provides additional customization options to run the pipeline
  // and can be skipped if it is not applicable.
  IngestPipelineConfig pipeline_config = 6
      [(google.api.field_behavior) = OPTIONAL];
}
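// A minimal, illustrative sketch (not part of the API definition) of a
// GcsIngestWithDocAiProcessorsPipeline in textproto form. The bucket, project,
// processor ids, and schema id below are hypothetical placeholders.
//
//   input_path: "gs://example-bucket/invoices"
//   split_classify_processor_info {
//     processor_name: "projects/example-project/locations/us/processors/classifier-id"
//   }
//   extract_processor_infos {
//     processor_name: "projects/example-project/locations/us/processors/invoice-parser-id"
//     document_type: "invoice"
//     schema_name: "projects/123456/locations/us/documentSchemas/invoice-schema-id"
//   }
//   processor_results_folder_path: "gs://example-bucket/processor-results"
//   skip_ingested_documents: true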
// The configuration of the exporting documents from Document Warehouse to CDW
// pipeline.
message ExportToCdwPipeline {
  // The list of all the resource names of the documents to be processed.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  repeated string documents = 1;

  // The Cloud Storage folder path used to store the exported documents before
  // being sent to CDW.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string export_folder_path = 2;

  // Optional. The CDW dataset resource name. If not set, the documents will be
  // exported to Cloud Storage only. Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string doc_ai_dataset = 3 [(google.api.field_behavior) = OPTIONAL];

  // Ratio of the training dataset split. When importing into Document AI
  // Workbench, documents will be automatically split into training and test
  // sets according to the specified ratio. This field is required if
  // doc_ai_dataset is set.
  float training_split_ratio = 4;
}

// The configuration of processing documents in Document Warehouse with the
// DocAi processors pipeline.
message ProcessWithDocAiPipeline {
  // The list of all the resource names of the documents to be processed.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  repeated string documents = 1;

  // The Cloud Storage folder path used to store the exported documents before
  // being sent to CDW.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string export_folder_path = 2;

  // The CDW processor information.
  ProcessorInfo processor_info = 3;

  // The Cloud Storage folder path used to store the raw results from
  // processors.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string processor_results_folder_path = 4;
}
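// A minimal, illustrative sketch (not part of the API definition) of a
// ProcessWithDocAiPipeline in textproto form. The document id, bucket, and
// processor name below are hypothetical placeholders.
//
//   documents: "projects/123456/locations/us/documents/example-document-id"
//   export_folder_path: "gs://example-bucket/export-folder"
//   processor_info {
//     processor_name: "projects/example-project/locations/us/processors/example-processor-id"
//   }
//   processor_results_folder_path: "gs://example-bucket/processor-results"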