// xref: /aosp_15_r20/external/googleapis/google/cloud/documentai/v1beta3/document_io.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/protobuf/field_mask.proto";

// Per-language code generation options.
option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentIoProto";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Payload message of raw document content (bytes).
message RawDocument {
  // Inline document content.
  bytes content = 1;

  // An IANA MIME type (RFC6838) indicating the nature and format of the
  // [content][google.cloud.documentai.v1beta3.RawDocument.content].
  string mime_type = 2;

  // The display name of the document. It supports all Unicode characters
  // except the following:
  // `*`, `?`, `[`, `]`, `%`, `{`, `}`, `'`, `\"`, `,`
  // `~`, `=` and `:` are reserved.
  // If not specified, a default ID is generated.
  string display_name = 3;
}

// Specifies a document stored on Cloud Storage.
message GcsDocument {
  // The Cloud Storage object URI.
  string gcs_uri = 1;

  // An IANA MIME type (RFC6838) of the content.
  string mime_type = 2;
}

// Specifies a set of documents on Cloud Storage.
message GcsDocuments {
  // The list of documents.
  repeated GcsDocument documents = 1;
}

// Specifies all documents on Cloud Storage with a common prefix.
message GcsPrefix {
  // The URI prefix.
  string gcs_uri_prefix = 1;
}

// The common config to specify a set of documents used as input.
message BatchDocumentsInputConfig {
  // The source of the input documents. As a oneof, at most one of the
  // members below is set at a time.
  oneof source {
    // The set of documents that match the specified Cloud Storage `gcs_prefix`.
    GcsPrefix gcs_prefix = 1;

    // The set of documents individually specified on Cloud Storage.
    GcsDocuments gcs_documents = 2;
  }
}

// Config that controls the output of documents. All documents will be written
// as a JSON file.
message DocumentOutputConfig {
  // The configuration used when outputting documents.
  message GcsOutputConfig {
    // The sharding config for the output document.
    message ShardingConfig {
      // The number of pages per shard.
      int32 pages_per_shard = 1;

      // The number of overlapping pages between consecutive shards.
      int32 pages_overlap = 2;
    }

    // The Cloud Storage URI (a directory) of the output.
    string gcs_uri = 1;

    // Specifies which fields to include in the output documents.
    // Only top-level document fields and pages fields are supported, so each
    // path must be in the form of `{document_field_name}` or
    // `pages.{page_field_name}`.
    google.protobuf.FieldMask field_mask = 2;

    // Specifies the sharding config for the output document.
    ShardingConfig sharding_config = 3;
  }

  // The destination of the results.
  oneof destination {
    // Output config to write the results to Cloud Storage.
    GcsOutputConfig gcs_output_config = 1;
  }
}

// Config for Document OCR.
//
// NOTE(review): field numbers 1, 7 and 9 are not used by any field declared
// in this message, and `PremiumFeatures` starts at field number 3. Confirm
// whether those numbers belonged to removed fields and should be `reserved`.
message OcrConfig {
  // Hints for the OCR engine.
  message Hints {
    // List of BCP-47 language codes to use for OCR. In most cases, not
    // specifying it yields the best results since it enables automatic language
    // detection. For languages based on the Latin alphabet, setting hints is
    // not needed. In rare cases, when the language of the text in the
    // image is known, setting a hint will help get better results (although it
    // will be a significant hindrance if the hint is wrong).
    repeated string language_hints = 1;
  }

  // Configurations for premium OCR features.
  message PremiumFeatures {
    // Turn on selection mark detector in OCR engine. Only available in OCR 2.0
    // (and later) processors.
    bool enable_selection_mark_detection = 3;

    // Turn on font identification model and return font style information.
    bool compute_style_info = 4;

    // Turn on the model that can extract LaTeX math formulas.
    bool enable_math_ocr = 5;
  }

  // Hints for the OCR model.
  Hints hints = 2;

  // Enables special handling for PDFs with existing text information. Results
  // in better text extraction quality in such PDF inputs.
  bool enable_native_pdf_parsing = 3;

  // Enables intelligent document quality scores after OCR. Can help with
  // diagnosing why OCR responses are of poor quality for a given input.
  // Adds additional latency comparable to regular OCR to the process call.
  bool enable_image_quality_scores = 4;

  // A list of advanced OCR options to further fine-tune OCR behavior. Current
  // valid values are:
  //
  // - `legacy_layout`: a heuristics layout detection algorithm, which serves as
  //   an alternative to the current ML-based layout detection algorithm.
  //   Customers can choose the best suitable layout algorithm based on their
  //   situation.
  repeated string advanced_ocr_options = 5;

  // Includes symbol level OCR information if set to true.
  bool enable_symbol = 6;

  // Turn on font identification model and return font style information.
  // Deprecated, use
  // [PremiumFeatures.compute_style_info][google.cloud.documentai.v1beta3.OcrConfig.PremiumFeatures.compute_style_info]
  // instead.
  bool compute_style_info = 8 [deprecated = true];

  // Turn off character box detector in OCR engine. Character box detection is
  // enabled by default in OCR 2.0 (and later) processors.
  bool disable_character_boxes_detection = 10;

  // Configurations for premium OCR features.
  PremiumFeatures premium_features = 11;
}
175