xref: /aosp_15_r20/external/googleapis/google/cloud/contentwarehouse/v1/document.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
1// Copyright 2023 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15syntax = "proto3";
16
17package google.cloud.contentwarehouse.v1;
18
19import "google/api/field_behavior.proto";
20import "google/api/resource.proto";
21import "google/cloud/documentai/v1/document.proto";
22import "google/protobuf/timestamp.proto";
23import "google/type/datetime.proto";
24
25option csharp_namespace = "Google.Cloud.ContentWarehouse.V1";
26option go_package = "cloud.google.com/go/contentwarehouse/apiv1/contentwarehousepb;contentwarehousepb";
27option java_multiple_files = true;
28option java_outer_classname = "DocumentProto";
29option java_package = "com.google.cloud.contentwarehouse.v1";
30option php_namespace = "Google\\Cloud\\ContentWarehouse\\V1";
31option ruby_package = "Google::Cloud::ContentWarehouse::V1";
32
// Defines the structure of a Content Warehouse document.
message Document {
  option (google.api.resource) = {
    type: "contentwarehouse.googleapis.com/Document"
    pattern: "projects/{project}/locations/{location}/documents/{document}"
    pattern: "projects/{project}/locations/{location}/documents/referenceId/{reference_id}"
  };

  // The resource name of the document.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  //
  // The name is ignored when creating a document.
  string name = 1;

  // The reference ID set by customers. Must be unique per project and location.
  string reference_id = 11;

  // Required. Display name of the document given by the user. This name will be
  // displayed in the UI. Customer can populate this field with the name of the
  // document. This differs from the 'title' field as 'title' is optional and
  // stores the top heading in the document.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Title that describes the document.
  // This can be the top heading or text that describes the document.
  string title = 18;

  // Uri to display the document, for example, in the UI.
  string display_uri = 17;

  // The Document schema name.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string document_schema_name = 3 [(google.api.resource_reference) = {
    type: "contentwarehouse.googleapis.com/DocumentSchema"
  }];

  // Structured content of the document. At most one of the members below can
  // be set.
  oneof structured_content {
    // Other document format, such as PPTX, XLSX.
    string plain_text = 15;

    // Document AI format to save the structured content, including OCR.
    google.cloud.documentai.v1.Document cloud_ai_document = 4;
  }

  // A path linked to structured content file.
  //
  // Deprecated: do not use in new code.
  string structured_content_uri = 16 [deprecated = true];

  // Raw document file. At most one of the members below can be set.
  oneof raw_document {
    // Raw document file in Cloud Storage path.
    string raw_document_path = 5;

    // Raw document content.
    bytes inline_raw_document = 6;
  }

  // List of values that are user supplied metadata.
  repeated Property properties = 7;

  // Output only. The time when the document was last updated.
  google.protobuf.Timestamp update_time = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the document was created.
  google.protobuf.Timestamp create_time = 9
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // This is used when DocAI was not used to load the document and parsing/
  // extracting is needed for the inline_raw_document.  For example, if
  // inline_raw_document is the byte representation of a PDF file, then
  // this should be set to: RAW_DOCUMENT_FILE_TYPE_PDF.
  RawDocumentFileType raw_document_file_type = 10;

  // If true, makes the document visible to asynchronous policies and rules.
  //
  // Deprecated: do not use in new code.
  bool async_enabled = 12 [deprecated = true];

  // Indicates the category (image, audio, video etc.) of the original content.
  ContentCategory content_category = 20;

  // If true, text extraction will not be performed.
  //
  // Deprecated: see `text_extraction_enabled`, which expresses the inverse
  // setting.
  bool text_extraction_disabled = 19 [deprecated = true];

  // If true, text extraction will be performed.
  bool text_extraction_enabled = 21;

  // The user who created the document.
  string creator = 13;

  // The user who last updated the document.
  string updater = 14;

  // Output only. If linked to a Collection with RetentionPolicy, the date when
  // the document becomes mutable.
  google.protobuf.Timestamp disposition_time = 22
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Indicates if the document has a legal hold on it.
  bool legal_hold = 23 [(google.api.field_behavior) = OUTPUT_ONLY];
}
134
// A reference to a document.
message DocumentReference {
  // Required. Name of the referenced document.
  string document_name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "contentwarehouse.googleapis.com/Document"
    }
  ];

  // Display name of the referenced document. Depending on the ACL constraint,
  // this name does not need to be consistent with the display_name in the
  // Document proto.
  string display_name = 2;

  // Stores a subset of the referenced document's content.
  // This is useful to let the user peek at the information of the referenced
  // document.
  string snippet = 3;

  // Whether the referenced document is a folder.
  bool document_is_folder = 4;

  // Output only. The time when the document was last updated.
  google.protobuf.Timestamp update_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the document was created.
  google.protobuf.Timestamp create_time = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the document was deleted.
  google.protobuf.Timestamp delete_time = 7
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Document is a folder with retention policy.
  bool document_is_retention_folder = 8;

  // Document is a folder with legal hold.
  bool document_is_legal_hold_folder = 9;
}
176
// A single named property attached to a document.
message Property {
  // Required. Name of the property; it must match the name of a
  // PropertyDefinition in the DocumentSchema.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // The typed value(s) of the property.
  // The member that is set must match the property_options type of the
  // matching PropertyDefinition.
  // Holds the value of the Property parsed into a specific data type; the
  // specific type value(s) are obtained from Document AI's
  // Property.mention_text field.
  oneof values {
    // Values of integer type.
    IntegerArray integer_values = 2;

    // Values of float type.
    FloatArray float_values = 3;

    // Values of string/text type.
    TextArray text_values = 4;

    // Values of enum type.
    EnumArray enum_values = 5;

    // Values that are themselves nested, structured properties.
    PropertyArray property_values = 6;

    // Values of date time type.
    // It is not supported by CMEK compliant deployment.
    DateTimeArray date_time_values = 7;

    // Value of map type.
    MapProperty map_property = 8;

    // Values of timestamp type.
    // It is not supported by CMEK compliant deployment.
    TimestampArray timestamp_values = 9;
  }
}
216
// Container for the integer values of a property.
message IntegerArray {
  // The integer values, each a 32-bit signed integer.
  repeated int32 values = 1;
}
222
// Container for the float values of a property.
message FloatArray {
  // The float values, each a single-precision floating point number.
  repeated float values = 1;
}
228
// Container for the string/text values of a property.
message TextArray {
  // The text values.
  repeated string values = 1;
}
234
// Container for the enum values of a property.
message EnumArray {
  // The enum values, each carried as a string.
  repeated string values = 1;
}
240
// Container for the datetime values of a property.
message DateTimeArray {
  // The datetime values.
  // OffsetDateTime and ZonedDateTime are both supported.
  repeated google.type.DateTime values = 1;
}
247
// Container for the timestamp values of a property.
message TimestampArray {
  // The timestamp values.
  repeated TimestampValue values = 1;
}
253
// A single timestamp, given either as a Timestamp message or as text.
message TimestampValue {
  // The timestamp, in exactly one of the representations below.
  oneof value {
    // The timestamp as a google.protobuf.Timestamp.
    google.protobuf.Timestamp timestamp_value = 1;

    // The timestamp as text. The string must represent a valid instant in UTC
    // and is parsed using java.time.format.DateTimeFormatter.ISO_INSTANT.
    // e.g. "2013-09-29T18:46:19Z"
    string text_value = 2;
  }
}
266
// Container for nested property values.
message PropertyArray {
  // The nested properties.
  repeated Property properties = 1;
}
272
// A map-valued property.
// Represents structured entries of key-value pairs, in which field names map
// to dynamically typed values.
message MapProperty {
  // The entries, keyed by field name. The map is unordered and each value is
  // dynamically typed.
  map<string, Value> fields = 1;
}
280
// `Value` represents a dynamically typed value which can be a float, an
// integer, a string, an enum, a datetime, a timestamp, or a boolean value. A
// producer of value is expected to set one of these variants. Absence of any
// variant indicates an error.
message Value {
  // The kind of value.
  oneof kind {
    // Represents a float value.
    float float_value = 1;

    // Represents an integer value.
    int32 int_value = 2;

    // Represents a string value.
    string string_value = 3;

    // Represents an enum value.
    EnumValue enum_value = 4;

    // Represents a datetime value.
    google.type.DateTime datetime_value = 5;

    // Represents a timestamp value.
    TimestampValue timestamp_value = 6;

    // Represents a boolean value.
    bool boolean_value = 7;
  }
}
310
// The value of an enum field, carried as a string.
message EnumValue {
  // The enum field's value as a string. It must match the set of enums
  // defined in the document schema using EnumTypeOptions.
  string value = 1;
}
317
// File format of a supplied raw document.
enum RawDocumentFileType {
  // Either no raw document was specified or it is non-parsable.
  RAW_DOCUMENT_FILE_TYPE_UNSPECIFIED = 0;

  // Adobe PDF.
  RAW_DOCUMENT_FILE_TYPE_PDF = 1;

  // Microsoft Word.
  RAW_DOCUMENT_FILE_TYPE_DOCX = 2;

  // Microsoft Excel.
  RAW_DOCUMENT_FILE_TYPE_XLSX = 3;

  // Microsoft Powerpoint.
  RAW_DOCUMENT_FILE_TYPE_PPTX = 4;

  // Text, encoded as UTF-8.
  RAW_DOCUMENT_FILE_TYPE_TEXT = 5;

  // TIFF or TIF image file.
  RAW_DOCUMENT_FILE_TYPE_TIFF = 6;
}
341
// Category of the content, stored when a raw document or structured content
// is supplied.
enum ContentCategory {
  // Category not specified.
  CONTENT_CATEGORY_UNSPECIFIED = 0;

  // Image content.
  CONTENT_CATEGORY_IMAGE = 1;

  // Audio content.
  CONTENT_CATEGORY_AUDIO = 2;

  // Video content.
  CONTENT_CATEGORY_VIDEO = 3;
}
357