// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.bigquery.storage.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/bigquery/storage/v1/arrow.proto";
import "google/cloud/bigquery/storage/v1/avro.proto";
import "google/cloud/bigquery/storage/v1/table.proto";
import "google/protobuf/timestamp.proto";

option csharp_namespace = "Google.Cloud.BigQuery.Storage.V1";
option go_package = "cloud.google.com/go/bigquery/storage/apiv1/storagepb;storagepb";
option java_multiple_files = true;
option java_outer_classname = "StreamProto";
option java_package = "com.google.cloud.bigquery.storage.v1";
option php_namespace = "Google\\Cloud\\BigQuery\\Storage\\V1";

// Data format for input or output data.
//
// NOTE(review): the values below are unprefixed (style guides recommend
// `DATA_FORMAT_AVRO` etc.), but this enum is part of a released API, so the
// names cannot be changed without breaking generated code.
enum DataFormat {
  // Data format is unspecified.
  DATA_FORMAT_UNSPECIFIED = 0;

  // Avro is a standard open source row based file format.
  // See https://avro.apache.org/ for more details.
  AVRO = 1;

  // Arrow is a standard open source column-based message format.
  // See https://arrow.apache.org/ for more details.
  ARROW = 2;
}

// Information about the ReadSession.
message ReadSession {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/ReadSession"
    pattern: "projects/{project}/locations/{location}/sessions/{session}"
  };

  // Additional attributes when reading a table.
  message TableModifiers {
    // The snapshot time of the table. If not set, interpreted as now.
    google.protobuf.Timestamp snapshot_time = 1;
  }

  // Options dictating how we read a table.
  message TableReadOptions {
    // Specifies which compression codec to attempt on the entire serialized
    // response payload (either Arrow record batch or Avro rows). This is
    // not to be confused with the Apache Arrow native compression codecs
    // specified in ArrowSerializationOptions. For performance reasons, when
    // creating a read session requesting Arrow responses, setting both native
    // Arrow compression and application-level response compression will not be
    // allowed - choose, at most, one kind of compression.
    enum ResponseCompressionCodec {
      // Default is no compression.
      RESPONSE_COMPRESSION_CODEC_UNSPECIFIED = 0;

      // Use raw LZ4 compression.
      // NOTE(review): value 1 is skipped here, presumably for a codec that was
      // never released or was removed; do not reuse it without confirming, and
      // consider listing it in a `reserved` statement.
      RESPONSE_COMPRESSION_CODEC_LZ4 = 2;
    }

    // Optional. The names of the fields in the table to be returned. If no
    // field names are specified, then all fields in the table are returned.
    //
    // Nested fields -- the child elements of a STRUCT field -- can be selected
    // individually using their fully-qualified names, and will be returned as
    // record fields containing only the selected nested fields. If a STRUCT
    // field is specified in the selected fields list, all of the child elements
    // will be returned.
    //
    // As an example, consider a table with the following schema:
    //
    //   {
    //     "name": "struct_field",
    //     "type": "RECORD",
    //     "mode": "NULLABLE",
    //     "fields": [
    //       {
    //         "name": "string_field1",
    //         "type": "STRING",
    //         "mode": "NULLABLE"
    //       },
    //       {
    //         "name": "string_field2",
    //         "type": "STRING",
    //         "mode": "NULLABLE"
    //       }
    //     ]
    //   }
    //
    // Specifying "struct_field" in the selected fields list will result in a
    // read session schema with the following logical structure:
    //
    //   struct_field {
    //     string_field1
    //     string_field2
    //   }
    //
    // Specifying "struct_field.string_field1" in the selected fields list will
    // result in a read session schema with the following logical structure:
    //
    //   struct_field {
    //     string_field1
    //   }
    //
    // The order of the fields in the read session schema is derived from the
    // table schema and does not correspond to the order in which the fields are
    // specified in this list.
    repeated string selected_fields = 1;

    // SQL text filtering statement, similar to a WHERE clause in a query.
    // Aggregates are not supported.
    //
    // Examples: "int_field > 5"
    //           "date_field = CAST('2014-9-27' as DATE)"
    //           "nullable_field is not NULL"
    //           "st_equals(geo_field, st_geofromtext("POINT(2, 2)"))"
    //           "numeric_field BETWEEN 1.0 AND 5.0"
    //
    // Restricted to a maximum length of 1 MB.
    string row_restriction = 2;

    // At most one serialization-options field may be set; it must match the
    // session's chosen `data_format`.
    oneof output_format_serialization_options {
      // Optional. Options specific to the Apache Arrow output format.
      ArrowSerializationOptions arrow_serialization_options = 3
          [(google.api.field_behavior) = OPTIONAL];

      // Optional. Options specific to the Apache Avro output format.
      AvroSerializationOptions avro_serialization_options = 4
          [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. Specifies a table sampling percentage. Specifically, the query
    // planner will use TABLESAMPLE SYSTEM (sample_percentage PERCENT). The
    // sampling percentage is applied at the data block granularity. It will
    // randomly choose for each data block whether to read the rows in that data
    // block. For more details, see
    // https://cloud.google.com/bigquery/docs/table-sampling
    optional double sample_percentage = 5
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Set response_compression_codec when creating a read session to
    // enable application-level compression of ReadRows responses.
    optional ResponseCompressionCodec response_compression_codec = 6
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Output only. Unique identifier for the session, in the form
  // `projects/{project_id}/locations/{location}/sessions/{session_id}`.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time at which the session becomes invalid. After this time,
  // subsequent requests to read this Session will return errors. The
  // expire_time is automatically assigned and currently cannot be specified or
  // updated.
  google.protobuf.Timestamp expire_time = 2
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Immutable. Data format of the output data. DATA_FORMAT_UNSPECIFIED not
  // supported.
  DataFormat data_format = 3 [(google.api.field_behavior) = IMMUTABLE];

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields. Which member is populated corresponds to
  // `data_format`.
  oneof schema {
    // Output only. Avro schema.
    AvroSchema avro_schema = 4 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. Arrow schema.
    ArrowSchema arrow_schema = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Immutable. Table that this ReadSession is reading from, in the form
  // `projects/{project_id}/datasets/{dataset_id}/tables/{table_id}`
  string table = 6 [
    (google.api.field_behavior) = IMMUTABLE,
    (google.api.resource_reference) = { type: "bigquery.googleapis.com/Table" }
  ];

  // Optional. Any modifiers which are applied when reading from the specified
  // table.
  TableModifiers table_modifiers = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Read options for this session (e.g. column selection, filters).
  TableReadOptions read_options = 8 [(google.api.field_behavior) = OPTIONAL];

  // Output only. A list of streams created with the session.
  //
  // At least one stream is created with the session. In the future, larger
  // request_stream_count values *may* result in this list being unpopulated,
  // in that case, the user will need to use a List method to get the streams
  // instead, which is not yet available.
  //
  // NOTE(review): field numbers 9 and 11 are unused in this message; if they
  // were ever released, they should be listed in a `reserved` statement so
  // they cannot be accidentally reused.
  repeated ReadStream streams = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. An estimate on the number of bytes this session will scan when
  // all streams are completely consumed. This estimate is based on
  // metadata from the table which might be incomplete or stale.
  int64 estimated_total_bytes_scanned = 12
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A pre-projected estimate of the total physical size of files
  // (in bytes) that this session will scan when all streams are consumed. This
  // estimate is independent of the selected columns and can be based on
  // incomplete or stale metadata from the table. This field is only set for
  // BigLake tables.
  int64 estimated_total_physical_file_size = 15
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. An estimate on the number of rows present in this session's
  // streams. This estimate is based on metadata from the table which might be
  // incomplete or stale.
  int64 estimated_row_count = 14 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Optional. ID set by client to annotate a session identity. This does not
  // need to be strictly unique, but instead the same ID should be used to group
  // logically connected sessions (e.g. All using the same ID for all sessions
  // needed to complete a Spark SQL query is reasonable).
  //
  // Maximum length is 256 bytes.
  string trace_id = 13 [(google.api.field_behavior) = OPTIONAL];
}

// Information about a single stream that gets data out of the storage system.
// Most of the information about `ReadStream` instances is aggregated, making
// `ReadStream` lightweight.
message ReadStream {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/ReadStream"
    pattern: "projects/{project}/locations/{location}/sessions/{session}/streams/{stream}"
  };

  // Output only. Name of the stream, in the form
  // `projects/{project_id}/locations/{location}/sessions/{session_id}/streams/{stream_id}`.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// WriteStreamView is a view enum that controls what details about a write
// stream should be returned.
enum WriteStreamView {
  // The default / unset value.
  WRITE_STREAM_VIEW_UNSPECIFIED = 0;

  // The BASIC projection returns basic metadata about a write stream. The
  // basic view does not include schema information. This is the default view
  // returned by GetWriteStream.
  BASIC = 1;

  // The FULL projection returns all available write stream metadata, including
  // the schema. CreateWriteStream returns the full projection of write stream
  // metadata.
  FULL = 2;
}

// Information about a single stream that gets data inside the storage system.
message WriteStream {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/WriteStream"
    pattern: "projects/{project}/datasets/{dataset}/tables/{table}/streams/{stream}"
  };

  // Type enum of the stream.
  enum Type {
    // Unknown type.
    TYPE_UNSPECIFIED = 0;

    // Data will commit automatically and appear as soon as the write is
    // acknowledged.
    COMMITTED = 1;

    // Data is invisible until the stream is committed.
    PENDING = 2;

    // Data is only visible up to the offset to which it was flushed.
    BUFFERED = 3;
  }

  // Mode enum of the stream.
  enum WriteMode {
    // Unknown type.
    WRITE_MODE_UNSPECIFIED = 0;

    // Insert new records into the table.
    // It is the default value if customers do not specify it.
    INSERT = 1;
  }

  // Output only. Name of the stream, in the form
  // `projects/{project}/datasets/{dataset}/tables/{table}/streams/{stream}`.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Immutable. Type of the stream.
  Type type = 2 [(google.api.field_behavior) = IMMUTABLE];

  // Output only. Create time of the stream. For the _default stream, this is
  // the creation_time of the table.
  google.protobuf.Timestamp create_time = 3
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Commit time of the stream.
  // If a stream is of `COMMITTED` type, then it will have a commit_time same as
  // `create_time`. If the stream is of `PENDING` type, empty commit_time
  // means it is not committed.
  google.protobuf.Timestamp commit_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The schema of the destination table. It is only returned in
  // `CreateWriteStream` response. Caller should generate data that's
  // compatible with this schema to send in initial `AppendRowsRequest`.
  // The table schema could go out of date during the life time of the stream.
  TableSchema table_schema = 5 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Immutable. Mode of the stream.
  // NOTE(review): field number 6 is unused in this message; if it was ever
  // released, it should be listed in a `reserved` statement so it cannot be
  // accidentally reused.
  WriteMode write_mode = 7 [(google.api.field_behavior) = IMMUTABLE];

  // Immutable. The geographic location where the stream's dataset resides. See
  // https://cloud.google.com/bigquery/docs/locations for supported
  // locations.
  string location = 8 [(google.api.field_behavior) = IMMUTABLE];
}