// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.bigquery.storage.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/bigquery/storage/v1/arrow.proto";
import "google/cloud/bigquery/storage/v1/avro.proto";
import "google/cloud/bigquery/storage/v1/table.proto";
import "google/protobuf/timestamp.proto";

option csharp_namespace = "Google.Cloud.BigQuery.Storage.V1";
option go_package = "cloud.google.com/go/bigquery/storage/apiv1/storagepb;storagepb";
option java_multiple_files = true;
option java_outer_classname = "StreamProto";
option java_package = "com.google.cloud.bigquery.storage.v1";
option php_namespace = "Google\\Cloud\\BigQuery\\Storage\\V1";

// Data format for input or output data.
enum DataFormat {
  // Data format is unspecified.
  DATA_FORMAT_UNSPECIFIED = 0;

  // Avro is a standard open source row-based file format.
  // See https://avro.apache.org/ for more details.
  AVRO = 1;

  // Arrow is a standard open source column-based message format.
  // See https://arrow.apache.org/ for more details.
  ARROW = 2;
}

// Information about the ReadSession.
message ReadSession {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/ReadSession"
    pattern: "projects/{project}/locations/{location}/sessions/{session}"
  };

  // Additional attributes when reading a table.
  message TableModifiers {
    // The snapshot time of the table. If not set, interpreted as now.
    google.protobuf.Timestamp snapshot_time = 1;
  }

  // Options dictating how we read a table.
  message TableReadOptions {
    // Specifies which compression codec to attempt on the entire serialized
    // response payload (either Arrow record batch or Avro rows). This is
    // not to be confused with the Apache Arrow native compression codecs
    // specified in ArrowSerializationOptions. For performance reasons, when
    // creating a read session requesting Arrow responses, setting both native
    // Arrow compression and application-level response compression is not
    // allowed; choose at most one kind of compression.
    enum ResponseCompressionCodec {
      // Default is no compression.
      RESPONSE_COMPRESSION_CODEC_UNSPECIFIED = 0;

      // Use raw LZ4 compression.
      RESPONSE_COMPRESSION_CODEC_LZ4 = 2;
    }

    // Optional. The names of the fields in the table to be returned. If no
    // field names are specified, then all fields in the table are returned.
    //
    // Nested fields -- the child elements of a STRUCT field -- can be selected
    // individually using their fully-qualified names, and will be returned as
    // record fields containing only the selected nested fields. If a STRUCT
    // field is specified in the selected fields list, all of its child
    // elements will be returned.
    //
    // As an example, consider a table with the following schema:
    //
    //   {
    //       "name": "struct_field",
    //       "type": "RECORD",
    //       "mode": "NULLABLE",
    //       "fields": [
    //           {
    //               "name": "string_field1",
    //               "type": "STRING",
    //               "mode": "NULLABLE"
    //           },
    //           {
    //               "name": "string_field2",
    //               "type": "STRING",
    //               "mode": "NULLABLE"
    //           }
    //       ]
    //   }
    //
    // Specifying "struct_field" in the selected fields list will result in a
    // read session schema with the following logical structure:
    //
    //   struct_field {
    //       string_field1
    //       string_field2
    //   }
    //
    // Specifying "struct_field.string_field1" in the selected fields list will
    // result in a read session schema with the following logical structure:
    //
    //   struct_field {
    //       string_field1
    //   }
    //
    // The order of the fields in the read session schema is derived from the
    // table schema and does not correspond to the order in which the fields are
    // specified in this list.
    repeated string selected_fields = 1;
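
    // Illustrative only: in text format, selecting both nested leaves of the
    // example schema above individually would look like this:
    //
    //   selected_fields: "struct_field.string_field1"
    //   selected_fields: "struct_field.string_field2"
    //
    // which yields the same logical structure as selecting "struct_field".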

    // SQL text filtering statement, similar to a WHERE clause in a query.
    // Aggregates are not supported.
    //
    // Examples: "int_field > 5"
    //           "date_field = CAST('2014-9-27' as DATE)"
    //           "nullable_field is not NULL"
    //           "st_equals(geo_field, st_geogfromtext('POINT(2 2)'))"
    //           "numeric_field BETWEEN 1.0 AND 5.0"
    //
    // Restricted to a maximum length of 1 MB.
    string row_restriction = 2;

    oneof output_format_serialization_options {
      // Optional. Options specific to the Apache Arrow output format.
      ArrowSerializationOptions arrow_serialization_options = 3
          [(google.api.field_behavior) = OPTIONAL];

      // Optional. Options specific to the Apache Avro output format.
      AvroSerializationOptions avro_serialization_options = 4
          [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. Specifies a table sampling percentage. Specifically, the query
    // planner will use TABLESAMPLE SYSTEM (sample_percentage PERCENT). The
    // sampling percentage is applied at the data block granularity: for each
    // data block, it randomly chooses whether to read the rows in that block.
    // For more details, see
    // https://cloud.google.com/bigquery/docs/table-sampling.
    optional double sample_percentage = 5
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Set response_compression_codec when creating a read session to
    // enable application-level compression of ReadRows responses.
    optional ResponseCompressionCodec response_compression_codec = 6
        [(google.api.field_behavior) = OPTIONAL];
  }
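
  // Illustrative only: a minimal text-format sketch of TableReadOptions,
  // reusing the hypothetical `int_field` column from the row_restriction
  // examples above. Per the codec comment above, response compression may
  // only be combined with Arrow output if native Arrow compression is left
  // unset:
  //
  //   selected_fields: "int_field"
  //   row_restriction: "int_field > 5"
  //   sample_percentage: 10.0
  //   response_compression_codec: RESPONSE_COMPRESSION_CODEC_LZ4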

  // Output only. Unique identifier for the session, in the form
  // `projects/{project_id}/locations/{location}/sessions/{session_id}`.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time at which the session becomes invalid. After this time,
  // subsequent requests to read this Session will return errors. The
  // expire_time is automatically assigned and currently cannot be specified or
  // updated.
  google.protobuf.Timestamp expire_time = 2
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Immutable. Data format of the output data. DATA_FORMAT_UNSPECIFIED is not
  // supported.
  DataFormat data_format = 3 [(google.api.field_behavior) = IMMUTABLE];

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields.
  oneof schema {
    // Output only. Avro schema.
    AvroSchema avro_schema = 4 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. Arrow schema.
    ArrowSchema arrow_schema = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Immutable. Table that this ReadSession is reading from, in the form
  // `projects/{project_id}/datasets/{dataset_id}/tables/{table_id}`.
  string table = 6 [
    (google.api.field_behavior) = IMMUTABLE,
    (google.api.resource_reference) = { type: "bigquery.googleapis.com/Table" }
  ];

  // Optional. Any modifiers which are applied when reading from the specified
  // table.
  TableModifiers table_modifiers = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Read options for this session (e.g. column selection, filters).
  TableReadOptions read_options = 8 [(google.api.field_behavior) = OPTIONAL];

  // Output only. A list of streams created with the session.
  //
  // At least one stream is created with the session. In the future, larger
  // request_stream_count values *may* result in this list being unpopulated;
  // in that case, the user will need to use a List method to get the streams
  // instead, which is not yet available.
  repeated ReadStream streams = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. An estimate of the number of bytes this session will scan
  // when all streams are completely consumed. This estimate is based on
  // metadata from the table, which might be incomplete or stale.
  int64 estimated_total_bytes_scanned = 12
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A pre-projected estimate of the total physical size of files
  // (in bytes) that this session will scan when all streams are consumed. This
  // estimate is independent of the selected columns and can be based on
  // incomplete or stale metadata from the table. This field is only set for
  // BigLake tables.
  int64 estimated_total_physical_file_size = 15
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. An estimate of the number of rows present in this session's
  // streams. This estimate is based on metadata from the table, which might be
  // incomplete or stale.
  int64 estimated_row_count = 14 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Optional. ID set by the client to annotate a session identity. This does
  // not need to be strictly unique, but the same ID should be used to group
  // logically connected sessions (e.g. using the same ID for all sessions
  // needed to complete a Spark SQL query is reasonable).
  //
  // Maximum length is 256 bytes.
  string trace_id = 13 [(google.api.field_behavior) = OPTIONAL];
}
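
// Illustrative only: a minimal text-format sketch of the ReadSession a client
// might supply to CreateReadSession (the request and response messages live in
// storage.proto). The resource names are placeholders, and the output-only
// fields (name, expire_time, schema, streams, estimates) are populated by the
// server:
//
//   table: "projects/my-project/datasets/my_dataset/tables/my_table"
//   data_format: ARROW
//   read_options {
//     selected_fields: "int_field"
//     row_restriction: "int_field > 5"
//   }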

// Information about a single stream that gets data out of the storage system.
// Most of the information about `ReadStream` instances is aggregated, making
// `ReadStream` lightweight.
message ReadStream {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/ReadStream"
    pattern: "projects/{project}/locations/{location}/sessions/{session}/streams/{stream}"
  };

  // Output only. Name of the stream, in the form
  // `projects/{project_id}/locations/{location}/sessions/{session_id}/streams/{stream_id}`.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// WriteStreamView is a view enum that controls what details about a write
// stream should be returned.
enum WriteStreamView {
  // The default / unset value.
  WRITE_STREAM_VIEW_UNSPECIFIED = 0;

  // The BASIC projection returns basic metadata about a write stream. The
  // basic view does not include schema information. This is the default view
  // returned by GetWriteStream.
  BASIC = 1;

  // The FULL projection returns all available write stream metadata, including
  // the schema. CreateWriteStream returns the full projection of write stream
  // metadata.
  FULL = 2;
}
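
// Illustrative only, assuming the `view` field on GetWriteStreamRequest in
// storage.proto: a caller that needs the destination schema would request the
// FULL projection, for example (text format, with a placeholder stream name):
//
//   name: "projects/my-project/datasets/my_dataset/tables/my_table/streams/my_stream"
//   view: FULL
//
// Leaving `view` unset behaves like BASIC, which omits the schema.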

// Information about a single stream that gets data inside the storage system.
message WriteStream {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/WriteStream"
    pattern: "projects/{project}/datasets/{dataset}/tables/{table}/streams/{stream}"
  };

  // Type enum of the stream.
  enum Type {
    // Unknown type.
    TYPE_UNSPECIFIED = 0;

    // Data will commit automatically and appear as soon as the write is
    // acknowledged.
    COMMITTED = 1;

    // Data is invisible until the stream is committed.
    PENDING = 2;

    // Data is only visible up to the offset to which it was flushed.
    BUFFERED = 3;
  }

  // Mode enum of the stream.
  enum WriteMode {
    // Unknown mode.
    WRITE_MODE_UNSPECIFIED = 0;

    // Insert new records into the table.
    // This is the default value if customers do not specify it.
    INSERT = 1;
  }

  // Output only. Name of the stream, in the form
  // `projects/{project}/datasets/{dataset}/tables/{table}/streams/{stream}`.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Immutable. Type of the stream.
  Type type = 2 [(google.api.field_behavior) = IMMUTABLE];

  // Output only. Create time of the stream. For the _default stream, this is
  // the creation_time of the table.
  google.protobuf.Timestamp create_time = 3
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Commit time of the stream.
  // If a stream is of `COMMITTED` type, then it will have a commit_time equal
  // to `create_time`. If the stream is of `PENDING` type, an empty commit_time
  // means it is not committed.
  google.protobuf.Timestamp commit_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The schema of the destination table. It is only returned in
  // the `CreateWriteStream` response. The caller should generate data that is
  // compatible with this schema to send in the initial `AppendRowsRequest`.
  // The table schema could go out of date during the lifetime of the stream.
  TableSchema table_schema = 5 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Immutable. Mode of the stream.
  WriteMode write_mode = 7 [(google.api.field_behavior) = IMMUTABLE];

  // Immutable. The geographic location where the stream's dataset resides. See
  // https://cloud.google.com/bigquery/docs/locations for supported
  // locations.
  string location = 8 [(google.api.field_behavior) = IMMUTABLE];
}
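
// Illustrative only: a text-format sketch of the WriteStream resource a
// CreateWriteStream call (defined in storage.proto) might return for a pending
// stream; the name and timestamp are placeholders, and table_schema is omitted
// for brevity:
//
//   name: "projects/my-project/datasets/my_dataset/tables/my_table/streams/my_stream"
//   type: PENDING
//   create_time { seconds: 1700000000 }
//   write_mode: INSERT
//
// Because the stream is `PENDING`, its data stays invisible until the stream
// is committed, and commit_time remains unset until then.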