// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.bigquery.storage.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/bigquery/storage/v1/arrow.proto";
import "google/cloud/bigquery/storage/v1/avro.proto";
import "google/cloud/bigquery/storage/v1/table.proto";
import "google/protobuf/timestamp.proto";

option csharp_namespace = "Google.Cloud.BigQuery.Storage.V1";
option go_package = "cloud.google.com/go/bigquery/storage/apiv1/storagepb;storagepb";
option java_multiple_files = true;
option java_outer_classname = "StreamProto";
option java_package = "com.google.cloud.bigquery.storage.v1";
option php_namespace = "Google\\Cloud\\BigQuery\\Storage\\V1";

// Data format for input or output data.
enum DataFormat {
  // Data format is unspecified.
  DATA_FORMAT_UNSPECIFIED = 0;

  // Avro is a standard open source row-based file format.
  // See https://avro.apache.org/ for more details.
  AVRO = 1;

  // Arrow is a standard open source column-based message format.
  // See https://arrow.apache.org/ for more details.
  ARROW = 2;
}

// Information about the ReadSession.
message ReadSession {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/ReadSession"
    pattern: "projects/{project}/locations/{location}/sessions/{session}"
  };

  // Additional attributes when reading a table.
  message TableModifiers {
    // The snapshot time of the table. If not set, interpreted as now.
    google.protobuf.Timestamp snapshot_time = 1;
  }

  // Options dictating how we read a table.
  message TableReadOptions {
    // Specifies which compression codec to attempt on the entire serialized
    // response payload (either Arrow record batch or Avro rows). This is
    // not to be confused with the Apache Arrow native compression codecs
    // specified in ArrowSerializationOptions. For performance reasons, when
    // creating a read session requesting Arrow responses, setting both native
    // Arrow compression and application-level response compression is not
    // allowed; choose at most one kind of compression.
    enum ResponseCompressionCodec {
      // Default is no compression.
      RESPONSE_COMPRESSION_CODEC_UNSPECIFIED = 0;

      // Use raw LZ4 compression.
      RESPONSE_COMPRESSION_CODEC_LZ4 = 2;
    }

    // Optional. The names of the fields in the table to be returned. If no
    // field names are specified, then all fields in the table are returned.
    //
    // Nested fields -- the child elements of a STRUCT field -- can be selected
    // individually using their fully-qualified names, and will be returned as
    // record fields containing only the selected nested fields. If a STRUCT
    // field is specified in the selected fields list, all of its child
    // elements will be returned.
    //
    // As an example, consider a table with the following schema:
    //
    //   {
    //       "name": "struct_field",
    //       "type": "RECORD",
    //       "mode": "NULLABLE",
    //       "fields": [
    //           {
    //               "name": "string_field1",
    //               "type": "STRING",
    //               "mode": "NULLABLE"
    //           },
    //           {
    //               "name": "string_field2",
    //               "type": "STRING",
    //               "mode": "NULLABLE"
    //           }
    //       ]
    //   }
    //
    // Specifying "struct_field" in the selected fields list will result in a
    // read session schema with the following logical structure:
    //
    //   struct_field {
    //       string_field1
    //       string_field2
    //   }
    //
    // Specifying "struct_field.string_field1" in the selected fields list will
    // result in a read session schema with the following logical structure:
    //
    //   struct_field {
    //       string_field1
    //   }
    //
    // The order of the fields in the read session schema is derived from the
    // table schema and does not correspond to the order in which the fields are
    // specified in this list.
    repeated string selected_fields = 1;
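
    // Illustrative only: in text format, selecting both nested leaves of the
    // example schema above individually would look like this:
    //
    //   selected_fields: "struct_field.string_field1"
    //   selected_fields: "struct_field.string_field2"
    //
    // which yields the same logical structure as selecting "struct_field".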

    // SQL text filtering statement, similar to a WHERE clause in a query.
    // Aggregates are not supported.
    //
    // Examples: "int_field > 5"
    //           "date_field = CAST('2014-9-27' as DATE)"
    //           "nullable_field is not NULL"
    //           "st_equals(geo_field, st_geogfromtext('POINT(2 2)'))"
    //           "numeric_field BETWEEN 1.0 AND 5.0"
    //
    // Restricted to a maximum length of 1 MB.
    string row_restriction = 2;

    oneof output_format_serialization_options {
      // Optional. Options specific to the Apache Arrow output format.
      ArrowSerializationOptions arrow_serialization_options = 3
          [(google.api.field_behavior) = OPTIONAL];

      // Optional. Options specific to the Apache Avro output format.
      AvroSerializationOptions avro_serialization_options = 4
          [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. Specifies a table sampling percentage. Specifically, the query
    // planner will use TABLESAMPLE SYSTEM (sample_percentage PERCENT). The
    // sampling percentage is applied at the data block granularity: for each
    // data block, it randomly chooses whether to read the rows in that block.
    // For more details, see
    // https://cloud.google.com/bigquery/docs/table-sampling.
    optional double sample_percentage = 5
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Set response_compression_codec when creating a read session to
    // enable application-level compression of ReadRows responses.
    optional ResponseCompressionCodec response_compression_codec = 6
        [(google.api.field_behavior) = OPTIONAL];
  }
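
  // Illustrative only: a minimal text-format sketch of TableReadOptions,
  // reusing the hypothetical `int_field` column from the row_restriction
  // examples above. Per the codec comment above, response compression may
  // only be combined with Arrow output if native Arrow compression is left
  // unset:
  //
  //   selected_fields: "int_field"
  //   row_restriction: "int_field > 5"
  //   sample_percentage: 10.0
  //   response_compression_codec: RESPONSE_COMPRESSION_CODEC_LZ4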

  // Output only. Unique identifier for the session, in the form
  // `projects/{project_id}/locations/{location}/sessions/{session_id}`.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time at which the session becomes invalid. After this time,
  // subsequent requests to read this Session will return errors. The
  // expire_time is automatically assigned and currently cannot be specified or
  // updated.
  google.protobuf.Timestamp expire_time = 2
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Immutable. Data format of the output data. DATA_FORMAT_UNSPECIFIED is not
  // supported.
  DataFormat data_format = 3 [(google.api.field_behavior) = IMMUTABLE];

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields.
  oneof schema {
    // Output only. Avro schema.
    AvroSchema avro_schema = 4 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. Arrow schema.
    ArrowSchema arrow_schema = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Immutable. Table that this ReadSession is reading from, in the form
  // `projects/{project_id}/datasets/{dataset_id}/tables/{table_id}`.
  string table = 6 [
    (google.api.field_behavior) = IMMUTABLE,
    (google.api.resource_reference) = { type: "bigquery.googleapis.com/Table" }
  ];

  // Optional. Any modifiers which are applied when reading from the specified
  // table.
  TableModifiers table_modifiers = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Read options for this session (e.g. column selection, filters).
  TableReadOptions read_options = 8 [(google.api.field_behavior) = OPTIONAL];

  // Output only. A list of streams created with the session.
  //
  // At least one stream is created with the session. In the future, larger
  // request_stream_count values *may* result in this list being unpopulated;
  // in that case, the user will need to use a List method to get the streams
  // instead, which is not yet available.
  repeated ReadStream streams = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. An estimate of the number of bytes this session will scan
  // when all streams are completely consumed. This estimate is based on
  // metadata from the table, which might be incomplete or stale.
  int64 estimated_total_bytes_scanned = 12
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A pre-projected estimate of the total physical size of files
  // (in bytes) that this session will scan when all streams are consumed. This
  // estimate is independent of the selected columns and can be based on
  // incomplete or stale metadata from the table. This field is only set for
  // BigLake tables.
  int64 estimated_total_physical_file_size = 15
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. An estimate of the number of rows present in this session's
  // streams. This estimate is based on metadata from the table, which might be
  // incomplete or stale.
  int64 estimated_row_count = 14 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Optional. ID set by the client to annotate a session identity. This does
  // not need to be strictly unique, but the same ID should be used to group
  // logically connected sessions (e.g. using the same ID for all sessions
  // needed to complete a Spark SQL query is reasonable).
  //
  // Maximum length is 256 bytes.
  string trace_id = 13 [(google.api.field_behavior) = OPTIONAL];
}
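
// Illustrative only: a minimal text-format sketch of the ReadSession a client
// might supply to CreateReadSession (the request and response messages live in
// storage.proto). The resource names are placeholders, and the output-only
// fields (name, expire_time, schema, streams, estimates) are populated by the
// server:
//
//   table: "projects/my-project/datasets/my_dataset/tables/my_table"
//   data_format: ARROW
//   read_options {
//     selected_fields: "int_field"
//     row_restriction: "int_field > 5"
//   }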

// Information about a single stream that gets data out of the storage system.
// Most of the information about `ReadStream` instances is aggregated, making
// `ReadStream` lightweight.
message ReadStream {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/ReadStream"
    pattern: "projects/{project}/locations/{location}/sessions/{session}/streams/{stream}"
  };

  // Output only. Name of the stream, in the form
  // `projects/{project_id}/locations/{location}/sessions/{session_id}/streams/{stream_id}`.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// WriteStreamView is a view enum that controls what details about a write
// stream should be returned.
enum WriteStreamView {
  // The default / unset value.
  WRITE_STREAM_VIEW_UNSPECIFIED = 0;

  // The BASIC projection returns basic metadata about a write stream. The
  // basic view does not include schema information. This is the default view
  // returned by GetWriteStream.
  BASIC = 1;

  // The FULL projection returns all available write stream metadata, including
  // the schema. CreateWriteStream returns the full projection of write stream
  // metadata.
  FULL = 2;
}
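
// Illustrative only, assuming the `view` field on GetWriteStreamRequest in
// storage.proto: a caller that needs the destination schema would request the
// FULL projection, for example (text format, with a placeholder stream name):
//
//   name: "projects/my-project/datasets/my_dataset/tables/my_table/streams/my_stream"
//   view: FULL
//
// Leaving `view` unset behaves like BASIC, which omits the schema.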

// Information about a single stream that gets data inside the storage system.
message WriteStream {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/WriteStream"
    pattern: "projects/{project}/datasets/{dataset}/tables/{table}/streams/{stream}"
  };

  // Type enum of the stream.
  enum Type {
    // Unknown type.
    TYPE_UNSPECIFIED = 0;

    // Data will commit automatically and appear as soon as the write is
    // acknowledged.
    COMMITTED = 1;

    // Data is invisible until the stream is committed.
    PENDING = 2;

    // Data is only visible up to the offset to which it was flushed.
    BUFFERED = 3;
  }

  // Mode enum of the stream.
  enum WriteMode {
    // Unknown mode.
    WRITE_MODE_UNSPECIFIED = 0;

    // Insert new records into the table.
    // This is the default value if customers do not specify it.
    INSERT = 1;
  }

  // Output only. Name of the stream, in the form
  // `projects/{project}/datasets/{dataset}/tables/{table}/streams/{stream}`.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Immutable. Type of the stream.
  Type type = 2 [(google.api.field_behavior) = IMMUTABLE];

  // Output only. Create time of the stream. For the _default stream, this is
  // the creation_time of the table.
  google.protobuf.Timestamp create_time = 3
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Commit time of the stream.
  // If a stream is of `COMMITTED` type, then it will have a commit_time equal
  // to `create_time`. If the stream is of `PENDING` type, an empty commit_time
  // means it is not committed.
  google.protobuf.Timestamp commit_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The schema of the destination table. It is only returned in
  // the `CreateWriteStream` response. The caller should generate data that is
  // compatible with this schema to send in the initial `AppendRowsRequest`.
  // The table schema could go out of date during the lifetime of the stream.
  TableSchema table_schema = 5 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Immutable. Mode of the stream.
  WriteMode write_mode = 7 [(google.api.field_behavior) = IMMUTABLE];

  // Immutable. The geographic location where the stream's dataset resides. See
  // https://cloud.google.com/bigquery/docs/locations for supported
  // locations.
  string location = 8 [(google.api.field_behavior) = IMMUTABLE];
}
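
// Illustrative only: a text-format sketch of the WriteStream resource a
// CreateWriteStream call (defined in storage.proto) might return for a pending
// stream; the name and timestamp are placeholders, and table_schema is omitted
// for brevity:
//
//   name: "projects/my-project/datasets/my_dataset/tables/my_table/streams/my_stream"
//   type: PENDING
//   create_time { seconds: 1700000000 }
//   write_mode: INSERT
//
// Because the stream is `PENDING`, its data stays invisible until the stream
// is committed, and commit_time remains unset until then.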