// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/protobuf/field_mask.proto";

option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentIoProto";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Payload message of raw document content (bytes).
message RawDocument {
  // Inline document content.
  bytes content = 1;

  // An IANA MIME type (RFC6838) indicating the nature and format of the
  // [content][google.cloud.documentai.v1beta3.RawDocument.content].
  string mime_type = 2;

  // The display name of the document. It supports all Unicode characters
  // except the following, which are reserved:
  // `*`, `?`, `[`, `]`, `%`, `{`, `}`, `'`, `\"`, `,`, `~`, `=` and `:`.
  // If not specified, a default ID is generated.
  string display_name = 3;
}

// Specifies a document stored on Cloud Storage.
message GcsDocument {
  // The Cloud Storage object URI.
  string gcs_uri = 1;

  // An IANA MIME type (RFC6838) of the content.
  string mime_type = 2;
}

// Specifies a set of documents on Cloud Storage.
message GcsDocuments {
  // The list of documents.
  repeated GcsDocument documents = 1;
}

// Specifies all documents on Cloud Storage with a common prefix.
message GcsPrefix {
  // The URI prefix.
  string gcs_uri_prefix = 1;
}

// The common config to specify a set of documents used as input.
message BatchDocumentsInputConfig {
  // The source. Exactly one of the members below may be set.
  oneof source {
    // The set of documents that match the specified Cloud Storage `gcs_prefix`.
    GcsPrefix gcs_prefix = 1;

    // The set of documents individually specified on Cloud Storage.
    GcsDocuments gcs_documents = 2;
  }
}

// Config that controls the output of documents. All documents will be written
// as a JSON file.
message DocumentOutputConfig {
  // The configuration used when outputting documents.
  message GcsOutputConfig {
    // The sharding config for the output document.
    message ShardingConfig {
      // The number of pages per shard.
      int32 pages_per_shard = 1;

      // The number of overlapping pages between consecutive shards.
      int32 pages_overlap = 2;
    }

    // The Cloud Storage URI (a directory) of the output.
    string gcs_uri = 1;

    // Specifies which fields to include in the output documents.
    // Only supports top level document and pages field so it must be in the
    // form of `{document_field_name}` or `pages.{page_field_name}`.
    google.protobuf.FieldMask field_mask = 2;

    // Specifies the sharding config for the output document.
    ShardingConfig sharding_config = 3;
  }

  // The destination of the results.
  oneof destination {
    // Output config to write the results to Cloud Storage.
    GcsOutputConfig gcs_output_config = 1;
  }
}

// Config for Document OCR.
message OcrConfig {
  // Hints for the OCR engine.
  message Hints {
    // List of BCP-47 language codes to use for OCR. In most cases, not
    // specifying it yields the best results since it enables automatic language
    // detection. For languages based on the Latin alphabet, setting hints is
    // not needed. In rare cases, when the language of the text in the
    // image is known, setting a hint will help get better results (although it
    // will be a significant hindrance if the hint is wrong).
    repeated string language_hints = 1;
  }

  // Configurations for premium OCR features.
  message PremiumFeatures {
    // Turn on selection mark detector in OCR engine. Only available in OCR 2.0
    // (and later) processors.
    bool enable_selection_mark_detection = 3;

    // Turn on font identification model and return font style information.
    bool compute_style_info = 4;

    // Turn on the model that can extract LaTeX math formulas.
    bool enable_math_ocr = 5;
  }

  // Hints for the OCR model.
  Hints hints = 2;

  // Enables special handling for PDFs with existing text information. Results
  // in better text extraction quality in such PDF inputs.
  bool enable_native_pdf_parsing = 3;

  // Enables intelligent document quality scores after OCR. Can help with
  // diagnosing why OCR responses are of poor quality for a given input.
  // Adds additional latency comparable to regular OCR to the process call.
  bool enable_image_quality_scores = 4;

  // A list of advanced OCR options to further fine-tune OCR behavior. Current
  // valid values are:
  //
  // - `legacy_layout`: a heuristics layout detection algorithm, which serves as
  // an alternative to the current ML-based layout detection algorithm.
  // Customers can choose the best suitable layout algorithm based on their
  // situation.
  repeated string advanced_ocr_options = 5;

  // Includes symbol level OCR information if set to true.
  bool enable_symbol = 6;

  // Turn on font identification model and return font style information.
  // Deprecated, use
  // [PremiumFeatures.compute_style_info][google.cloud.documentai.v1beta3.OcrConfig.PremiumFeatures.compute_style_info]
  // instead.
  bool compute_style_info = 8 [deprecated = true];

  // Turn off character box detector in OCR engine. Character box detection is
  // enabled by default in OCR 2.0 (and later) processors.
  bool disable_character_boxes_detection = 10;

  // Configurations for premium OCR features.
  PremiumFeatures premium_features = 11;
}