// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.vision.v1;

// NOTE(review): presumably declares `BoundingPoly`, which this file uses for
// the `bounding_box` fields but does not define itself — confirm.
import "google/cloud/vision/v1/geometry.proto";

option cc_enable_arenas = true;
option go_package = "cloud.google.com/go/vision/v2/apiv1/visionpb;visionpb";
option java_multiple_files = true;
option java_outer_classname = "TextAnnotationProto";
option java_package = "com.google.cloud.vision.v1";
option objc_class_prefix = "GCVN";

// TextAnnotation contains a structured representation of OCR extracted text.
// The hierarchy of an OCR extracted text structure is like this:
//     TextAnnotation -> Page -> Block -> Paragraph -> Word -> Symbol
// Each structural component, starting from Page, may further have its own
// properties. Properties describe detected languages, breaks etc. Please refer
// to the
// [TextAnnotation.TextProperty][google.cloud.vision.v1.TextAnnotation.TextProperty]
// message definition below for more detail.
message TextAnnotation {
  // Detected language for a structural component.
  message DetectedLanguage {
    // The BCP-47 language code, such as "en-US" or "sr-Latn". For more
    // information, see
    // http://www.unicode.org/reports/tr35/#Unicode_locale_identifier.
    string language_code = 1;

    // Confidence of detected language. Range [0, 1].
    float confidence = 2;
  }

  // Detected start or end of a structural component.
  message DetectedBreak {
    // Enum to denote the type of break found. New line, space etc.
    enum BreakType {
      // Unknown break label type. This is the zero value, so it is also what
      // an unset `type` field reads as.
      UNKNOWN = 0;

      // Regular space.
      SPACE = 1;

      // Sure space (very wide).
      SURE_SPACE = 2;

      // Line-wrapping break.
      EOL_SURE_SPACE = 3;

      // End-line hyphen that is not present in text; does not co-occur with
      // `SPACE`, `LEADER_SPACE`, or `LINE_BREAK`.
      // NOTE(review): `LEADER_SPACE` is not a value declared in this enum;
      // `SURE_SPACE` may have been intended — confirm before relying on it.
      HYPHEN = 4;

      // Line break that ends a paragraph.
      LINE_BREAK = 5;
    }

    // Detected break type.
    BreakType type = 1;

    // True if break prepends the element.
    bool is_prefix = 2;
  }

  // Additional information detected on the structural component.
  message TextProperty {
    // A list of detected languages together with confidence.
    repeated DetectedLanguage detected_languages = 1;

    // Detected start or end of a text segment.
    DetectedBreak detected_break = 2;
  }

  // List of pages detected by OCR.
  repeated Page pages = 1;

  // UTF-8 text detected on the pages.
  string text = 2;
}

// Detected page from OCR.
message Page {
  // Additional information detected on the page.
  TextAnnotation.TextProperty property = 1;

  // Page width. For PDFs the unit is points. For images (including
  // TIFFs) the unit is pixels.
  int32 width = 2;

  // Page height. For PDFs the unit is points. For images (including
  // TIFFs) the unit is pixels.
  int32 height = 3;

  // List of blocks of text, images, etc. on this page.
  repeated Block blocks = 4;

  // Confidence of the OCR results on the page. Range [0, 1].
  float confidence = 5;
}

// Logical element on the page.
message Block {
  // Type of a block (text, image, etc.) as identified by OCR.
  enum BlockType {
    // Unknown block type. This is the zero value, so it is also what an
    // unset `block_type` field reads as.
    UNKNOWN = 0;

    // Regular text block.
    TEXT = 1;

    // Table block.
    TABLE = 2;

    // Image block.
    PICTURE = 3;

    // Horizontal/vertical line box.
    RULER = 4;

    // Barcode block.
    BARCODE = 5;
  }

  // Additional information detected for the block.
  TextAnnotation.TextProperty property = 1;

  // The bounding box for the block.
  // The vertices are in the order of top-left, top-right, bottom-right,
  // bottom-left. When a rotation of the bounding box is detected the rotation
  // is represented as around the top-left corner as defined when the text is
  // read in the 'natural' orientation.
  // For example:
  //
  // * when the text is horizontal it might look like:
  //
  //         0----1
  //         |    |
  //         3----2
  //
  // * when it's rotated 180 degrees around the top-left corner it becomes:
  //
  //         2----3
  //         |    |
  //         1----0
  //
  //   and the vertex order will still be (0, 1, 2, 3).
  BoundingPoly bounding_box = 2;

  // List of paragraphs in this block (if this block is of type text).
  repeated Paragraph paragraphs = 3;

  // Detected block type (text, image, etc.) for this block.
  BlockType block_type = 4;

  // Confidence of the OCR results on the block. Range [0, 1].
  float confidence = 5;
}

// Structural unit of text representing a number of words in certain order.
message Paragraph {
  // Additional information detected for the paragraph.
  TextAnnotation.TextProperty property = 1;

  // The bounding box for the paragraph.
  // The vertices are in the order of top-left, top-right, bottom-right,
  // bottom-left. When a rotation of the bounding box is detected the rotation
  // is represented as around the top-left corner as defined when the text is
  // read in the 'natural' orientation.
  // For example:
  //
  // * when the text is horizontal it might look like:
  //
  //         0----1
  //         |    |
  //         3----2
  //
  // * when it's rotated 180 degrees around the top-left corner it becomes:
  //
  //         2----3
  //         |    |
  //         1----0
  //
  //   and the vertex order will still be (0, 1, 2, 3).
  BoundingPoly bounding_box = 2;

  // List of all words in this paragraph.
  repeated Word words = 3;

  // Confidence of the OCR results for the paragraph. Range [0, 1].
  float confidence = 4;
}

// A word representation.
message Word {
  // Additional information detected for the word.
  TextAnnotation.TextProperty property = 1;

  // The bounding box for the word.
  // The vertices are in the order of top-left, top-right, bottom-right,
  // bottom-left. When a rotation of the bounding box is detected the rotation
  // is represented as around the top-left corner as defined when the text is
  // read in the 'natural' orientation.
  // For example:
  //
  // * when the text is horizontal it might look like:
  //
  //         0----1
  //         |    |
  //         3----2
  //
  // * when it's rotated 180 degrees around the top-left corner it becomes:
  //
  //         2----3
  //         |    |
  //         1----0
  //
  //   and the vertex order will still be (0, 1, 2, 3).
  BoundingPoly bounding_box = 2;

  // List of symbols in the word.
  // The order of the symbols follows the natural reading order.
  repeated Symbol symbols = 3;

  // Confidence of the OCR results for the word. Range [0, 1].
  float confidence = 4;
}

// A single symbol representation.
message Symbol {
  // Additional information detected for the symbol.
  TextAnnotation.TextProperty property = 1;

  // The bounding box for the symbol.
  // The vertices are in the order of top-left, top-right, bottom-right,
  // bottom-left. When a rotation of the bounding box is detected the rotation
  // is represented as around the top-left corner as defined when the text is
  // read in the 'natural' orientation.
  // For example:
  //
  // * when the text is horizontal it might look like:
  //
  //         0----1
  //         |    |
  //         3----2
  //
  // * when it's rotated 180 degrees around the top-left corner it becomes:
  //
  //         2----3
  //         |    |
  //         1----0
  //
  //   and the vertex order will still be (0, 1, 2, 3).
  BoundingPoly bounding_box = 2;

  // The actual UTF-8 representation of the symbol.
  string text = 3;

  // Confidence of the OCR results for the symbol. Range [0, 1].
  float confidence = 4;
}
