// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.contentwarehouse.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/documentai/v1/document.proto";
import "google/protobuf/timestamp.proto";
import "google/type/datetime.proto";

option csharp_namespace = "Google.Cloud.ContentWarehouse.V1";
option go_package = "cloud.google.com/go/contentwarehouse/apiv1/contentwarehousepb;contentwarehousepb";
option java_multiple_files = true;
option java_outer_classname = "DocumentProto";
option java_package = "com.google.cloud.contentwarehouse.v1";
option php_namespace = "Google\\Cloud\\ContentWarehouse\\V1";
option ruby_package = "Google::Cloud::ContentWarehouse::V1";

// Defines the structure for content warehouse document proto.
message Document {
  option (google.api.resource) = {
    type: "contentwarehouse.googleapis.com/Document"
    pattern: "projects/{project}/locations/{location}/documents/{document}"
    pattern: "projects/{project}/locations/{location}/documents/referenceId/{reference_id}"
  };

  // The resource name of the document.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  //
  // The name is ignored when creating a document.
  string name = 1;

  // The reference ID set by customers. Must be unique per project and location.
  string reference_id = 11;

  // Required. Display name of the document given by the user. This name will be
  // displayed in the UI. Customer can populate this field with the name of the
  // document. This differs from the 'title' field as 'title' is optional and
  // stores the top heading in the document.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Title that describes the document.
  // This can be the top heading or text that describes the document.
  string title = 18;

  // Uri to display the document, for example, in the UI.
  string display_uri = 17;

  // The Document schema name.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string document_schema_name = 3 [(google.api.resource_reference) = {
    type: "contentwarehouse.googleapis.com/DocumentSchema"
  }];

  // Structured content of the document. At most one of the following
  // representations is set.
  oneof structured_content {
    // Other document format, such as PPTX, XLSX
    string plain_text = 15;

    // Document AI format to save the structured content, including OCR.
    google.cloud.documentai.v1.Document cloud_ai_document = 4;
  }

  // A path linked to structured content file.
  // This field is deprecated.
  string structured_content_uri = 16 [deprecated = true];

  // Raw document file.
  oneof raw_document {
    // Raw document file in Cloud Storage path.
    string raw_document_path = 5;

    // Raw document content.
    bytes inline_raw_document = 6;
  }

  // List of values that are user supplied metadata.
  repeated Property properties = 7;

  // Output only. The time when the document is last updated.
  google.protobuf.Timestamp update_time = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the document is created.
  google.protobuf.Timestamp create_time = 9
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // This is used when DocAI was not used to load the document and parsing/
  // extracting is needed for the inline_raw_document.  For example, if
  // inline_raw_document is the byte representation of a PDF file, then
  // this should be set to: RAW_DOCUMENT_FILE_TYPE_PDF.
  RawDocumentFileType raw_document_file_type = 10;

  // If true, makes the document visible to asynchronous policies and rules.
  // This field is deprecated.
  bool async_enabled = 12 [deprecated = true];

  // Indicates the category (image, audio, video etc.) of the original content.
  ContentCategory content_category = 20;

  // If true, text extraction will not be performed.
  // This field is deprecated.
  bool text_extraction_disabled = 19 [deprecated = true];

  // If true, text extraction will be performed.
  bool text_extraction_enabled = 21;

  // The user who creates the document.
  string creator = 13;

  // The user who last updated the document.
  string updater = 14;

  // Output only. If linked to a Collection with RetentionPolicy, the date when
  // the document becomes mutable.
  google.protobuf.Timestamp disposition_time = 22
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Indicates if the document has a legal hold on it.
  bool legal_hold = 23 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// References to the documents.
message DocumentReference {
  // Required. Name of the referenced document.
  string document_name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "contentwarehouse.googleapis.com/Document"
    }
  ];

  // display_name of the referenced document; this name does not need to be
  // consistent with the display_name in the Document proto, depending on the
  // ACL constraint.
  string display_name = 2;

  // Stores the subset of the referenced document's content.
  // This is useful to allow the user to peek at the information of the
  // referenced document.
  string snippet = 3;

  // The document type of the document being referenced.
  bool document_is_folder = 4;

  // Output only. The time when the document is last updated.
  google.protobuf.Timestamp update_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the document is created.
  google.protobuf.Timestamp create_time = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the document is deleted.
  google.protobuf.Timestamp delete_time = 7
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Document is a folder with retention policy.
  bool document_is_retention_folder = 8;

  // Document is a folder with legal hold.
  bool document_is_legal_hold_folder = 9;
}

// Property of a document.
message Property {
  // Required. Must match the name of a PropertyDefinition in the
  // DocumentSchema.
  string name = 1 [(google.api.field_behavior) = REQUIRED];

  // Type of the property.
  // Must match the property_options type of the matching PropertyDefinition.
  // Value of the Property parsed into a specific data type.
  // Specific type value(s) obtained from Document AI's Property.mention_text
  // field.
  oneof values {
    // Integer property values.
    IntegerArray integer_values = 2;

    // Float property values.
    FloatArray float_values = 3;

    // String/text property values.
    TextArray text_values = 4;

    // Enum property values.
    EnumArray enum_values = 5;

    // Nested structured data property values.
    PropertyArray property_values = 6;

    // Date time property values.
    // It is not supported by CMEK compliant deployment.
    DateTimeArray date_time_values = 7;

    // Map property values.
    MapProperty map_property = 8;

    // Timestamp property values.
    // It is not supported by CMEK compliant deployment.
    TimestampArray timestamp_values = 9;
  }
}

// Integer values.
message IntegerArray {
  // List of integer values.
  repeated int32 values = 1;
}

// Float values.
message FloatArray {
  // List of float values.
  repeated float values = 1;
}

// String/text values.
message TextArray {
  // List of text values.
  repeated string values = 1;
}

// Enum values.
message EnumArray {
  // List of enum values.
  repeated string values = 1;
}

// DateTime values.
message DateTimeArray {
  // List of datetime values.
  // Both OffsetDateTime and ZonedDateTime are supported.
  repeated google.type.DateTime values = 1;
}

// Timestamp values.
message TimestampArray {
  // List of timestamp values.
  repeated TimestampValue values = 1;
}

// Timestamp value type.
message TimestampValue {
  // The timestamp, given either as a Timestamp message or as text.
  oneof value {
    // Timestamp value
    google.protobuf.Timestamp timestamp_value = 1;

    // The string must represent a valid instant in UTC and is parsed using
    // java.time.format.DateTimeFormatter.ISO_INSTANT.
    // e.g. "2013-09-29T18:46:19Z"
    string text_value = 2;
  }
}

// Property values.
message PropertyArray {
  // List of property values.
  repeated Property properties = 1;
}

// Map property value.
// Represents structured entries of key-value pairs, consisting of field names
// which map to dynamically typed values.
message MapProperty {
  // Unordered map of dynamically typed values.
  map<string, Value> fields = 1;
}

// `Value` represents a dynamically typed value which can be a float, an
// integer, a string, an enum, a datetime, a timestamp, or a boolean value. A
// producer of a value is expected to set one of these variants. Absence of any
// variant indicates an error.
message Value {
  // The kind of value.
  oneof kind {
    // Represents a float value.
    float float_value = 1;

    // Represents an integer value.
    int32 int_value = 2;

    // Represents a string value.
    string string_value = 3;

    // Represents an enum value.
    EnumValue enum_value = 4;

    // Represents a datetime value.
    google.type.DateTime datetime_value = 5;

    // Represents a timestamp value.
    TimestampValue timestamp_value = 6;

    // Represents a boolean value.
    bool boolean_value = 7;
  }
}

// Represents the string value of the enum field.
message EnumValue {
  // String value of the enum field. This must match defined set of enums
  // in document schema using EnumTypeOptions.
  string value = 1;
}

// When a raw document is supplied, this indicates the file format.
enum RawDocumentFileType {
  // No raw document specified or it is non-parsable.
  RAW_DOCUMENT_FILE_TYPE_UNSPECIFIED = 0;

  // Adobe PDF format.
  RAW_DOCUMENT_FILE_TYPE_PDF = 1;

  // Microsoft Word format.
  RAW_DOCUMENT_FILE_TYPE_DOCX = 2;

  // Microsoft Excel format.
  RAW_DOCUMENT_FILE_TYPE_XLSX = 3;

  // Microsoft Powerpoint format.
  RAW_DOCUMENT_FILE_TYPE_PPTX = 4;

  // UTF-8 encoded text format.
  RAW_DOCUMENT_FILE_TYPE_TEXT = 5;

  // TIFF or TIF image file format.
  RAW_DOCUMENT_FILE_TYPE_TIFF = 6;
}

// When a raw document or structured content is supplied, this stores the
// content category.
enum ContentCategory {
  // No category is specified.
  CONTENT_CATEGORY_UNSPECIFIED = 0;

  // Content is of image type.
  CONTENT_CATEGORY_IMAGE = 1;

  // Content is of audio type.
  CONTENT_CATEGORY_AUDIO = 2;

  // Content is of video type.
  CONTENT_CATEGORY_VIDEO = 3;
}