// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.dataflow.v1beta3;

import "google/api/field_behavior.proto";
import "google/protobuf/any.proto";
import "google/protobuf/struct.proto";

option csharp_namespace = "Google.Cloud.Dataflow.V1Beta3";
option go_package = "cloud.google.com/go/dataflow/apiv1beta3/dataflowpb;dataflowpb";
option java_multiple_files = true;
option java_outer_classname = "EnvironmentProto";
option java_package = "com.google.dataflow.v1beta3";
option php_namespace = "Google\\Cloud\\Dataflow\\V1beta3";
option ruby_package = "Google::Cloud::Dataflow::V1beta3";

// Describes the environment in which a Dataflow Job runs.
message Environment {
  // The prefix of the resources the system should use for temporary
  // storage. The system will append the suffix "/temp-{JOBNAME}" to
  // this resource prefix, where {JOBNAME} is the value of the
  // job_name field. The resulting bucket and object prefix is used
  // as the prefix of the resources used to store temporary data
  // needed during the job execution. NOTE: This will override the
  // value in taskrunner_settings.
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 1;

  // The type of cluster manager API to use. If unknown or
  // unspecified, the service will attempt to choose a reasonable
  // default. This should be in the form of the API service name,
  // e.g. "compute.googleapis.com".
  string cluster_manager_api_service = 2;

  // The list of experiments to enable. This field should be used for
  // SDK-related experiments and not for service-related experiments. The
  // proper field for service-related experiments is service_options.
  repeated string experiments = 3;

  // The list of service options to enable. This field should be used for
  // service-related experiments only. These experiments, when graduating to
  // GA, should be replaced by dedicated fields or become default (i.e.
  // always on).
  repeated string service_options = 16;

  // If set, contains the Cloud KMS key identifier used to encrypt data
  // at rest, also known as a Customer Managed Encryption Key (CMEK).
  //
  // Format:
  //   projects/PROJECT_ID/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY
  string service_kms_key_name = 12;

  // The worker pools. At least one "harness" worker pool must be
  // specified in order for the job to have workers.
  repeated WorkerPool worker_pools = 4;

  // A description of the process that generated the request.
  google.protobuf.Struct user_agent = 5;

  // A structure describing which components of the service, and which
  // versions of those components, are required in order to run the job.
  google.protobuf.Struct version = 6;

  // The dataset for the current project where various workflow
  // related tables are stored.
  //
  // The supported resource type is:
  //
  // Google BigQuery:
  //   bigquery.googleapis.com/{dataset}
  string dataset = 7;

  // The Cloud Dataflow SDK pipeline options specified by the user. These
  // options are passed through the service and are used to recreate the
  // SDK pipeline options on the worker in a language agnostic and platform
  // independent way.
  google.protobuf.Struct sdk_pipeline_options = 8;

  // Experimental settings.
  google.protobuf.Any internal_experiments = 9;

  // Identity to run virtual machines as. Defaults to the default account.
  string service_account_email = 10;

  // Which Flexible Resource Scheduling mode to run in.
  FlexResourceSchedulingGoal flex_resource_scheduling_goal = 11;

  // The Compute Engine region
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1". Mutually exclusive
  // with worker_zone. If neither worker_region nor worker_zone is specified,
  // defaults to the control plane's region.
  string worker_region = 13;

  // The Compute Engine zone
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1-a". Mutually exclusive
  // with worker_region. If neither worker_region nor worker_zone is specified,
  // a zone in the control plane's region is chosen based on available capacity.
  string worker_zone = 14;

  // Output only. The shuffle mode used for the job.
  ShuffleMode shuffle_mode = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Any debugging options to be supplied to the job.
  DebugOptions debug_options = 17;
}
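
// Illustrative sketch only: the bucket, project, key ring, and service
// account names below are placeholders chosen for this example, not values
// defined by this API. A minimally populated Environment might look like
// this in textproto form (note that shuffle_mode is output only and
// worker_region is mutually exclusive with worker_zone):
//
//   temp_storage_prefix: "storage.googleapis.com/example-bucket/tmp"
//   cluster_manager_api_service: "compute.googleapis.com"
//   service_kms_key_name: "projects/example-project/locations/us-central1/keyRings/example-ring/cryptoKeys/example-key"
//   service_account_email: "example-worker@example-project.iam.gserviceaccount.com"
//   flex_resource_scheduling_goal: FLEXRS_COST_OPTIMIZED
//   worker_region: "us-west1"
//   debug_options { enable_hot_key_logging: true }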

// The packages that must be installed in order for a worker to run the
// steps of the Cloud Dataflow job that will be assigned to its worker
// pool.
//
// This is the mechanism by which the Cloud Dataflow SDK causes code to
// be loaded onto the workers. For example, the Cloud Dataflow Java SDK
// might use this to install jars containing the user's code and all of the
// various dependencies (libraries, data files, etc.) required in order
// for that code to run.
message Package {
  // The name of the package.
  string name = 1;

  // The resource to read the package from. The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}
  //   bucket.storage.googleapis.com/
  string location = 2;
}

// Specifies the processing model used by a
// [google.dataflow.v1beta3.Job], which determines the way the Job is
// managed by the Cloud Dataflow service (how workers are scheduled, how
// inputs are sharded, etc).
enum JobType {
  // The type of the job is unspecified, or unknown.
  JOB_TYPE_UNKNOWN = 0;

  // A batch job with a well-defined end point: data is read, data is
  // processed, data is written, and the job is done.
  JOB_TYPE_BATCH = 1;

  // A continuously streaming job with no end: data is read,
  // processed, and written continuously.
  JOB_TYPE_STREAMING = 2;
}

// Specifies the resource to optimize for in Flexible Resource Scheduling.
enum FlexResourceSchedulingGoal {
  // Run in the default mode.
  FLEXRS_UNSPECIFIED = 0;

  // Optimize for lower execution time.
  FLEXRS_SPEED_OPTIMIZED = 1;

  // Optimize for lower cost.
  FLEXRS_COST_OPTIMIZED = 2;
}

// Describes the data disk used by a workflow job.
message Disk {
  // Size of disk in GB. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 size_gb = 1;

  // Disk storage type, as defined by Google Compute Engine. This
  // must be a disk type appropriate to the project and zone in which
  // the workers will run. If unknown or unspecified, the service
  // will attempt to choose a reasonable default.
  //
  // For example, the standard persistent disk type is a resource name
  // typically ending in "pd-standard". If SSD persistent disks are
  // available, the resource name typically ends with "pd-ssd". The
  // actual valid values are defined by the Google Compute Engine API,
  // not by the Cloud Dataflow API; consult the Google Compute Engine
  // documentation for more information about determining the set of
  // available disk types for a particular project and zone.
  //
  // Google Compute Engine Disk types are local to a particular
  // project in a particular zone, and so the resource name will
  // typically look something like this:
  //
  //   compute.googleapis.com/projects/project-id/zones/zone/diskTypes/pd-standard
  string disk_type = 2;

  // Directory in a VM where the disk is mounted.
  string mount_point = 3;
}

// Provides data to pass through to the worker harness.
message WorkerSettings {
  // The base URL for accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs. If this field is specified, it supplies the base
  // URL to use for resolving these relative URLs. The normative
  // algorithm used is defined by RFC 1808, "Relative Uniform Resource
  // Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/".
  string base_url = 1;

  // Whether to send work progress updates to the service.
  bool reporting_enabled = 2;

  // The Cloud Dataflow service path relative to the root URL, for example,
  // "dataflow/v1b3/projects".
  string service_path = 3;

  // The Shuffle service path relative to the root URL, for example,
  // "shuffle/v1beta1".
  string shuffle_service_path = 4;

  // The ID of the worker running this pipeline.
  string worker_id = 5;

  // The prefix of the resources the system should use for temporary
  // storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 6;
}
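
// Illustrative sketch only: using the documented default base_url and the
// example service paths above, a worker would resolve the service endpoints
// roughly as follows under RFC 1808 relative-URL resolution. The worker_id
// shown is a placeholder, not a value defined by this API.
//
//   base_url: "http://www.googleapis.com/"
//   service_path: "dataflow/v1b3/projects"
//     -> resolves to "http://www.googleapis.com/dataflow/v1b3/projects"
//   shuffle_service_path: "shuffle/v1beta1"
//     -> resolves to "http://www.googleapis.com/shuffle/v1beta1"
//   worker_id: "example-worker-0"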

// Taskrunner configuration settings.
message TaskRunnerSettings {
  // The UNIX user ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "root".
  string task_user = 1;

  // The UNIX group ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "wheel".
  string task_group = 2;

  // The OAuth2 scopes to be requested by the taskrunner in order to
  // access the Cloud Dataflow API.
  repeated string oauth_scopes = 3;

  // The base URL for the taskrunner to use when accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs. If this field is specified, it supplies the base
  // URL to use for resolving these relative URLs. The normative
  // algorithm used is defined by RFC 1808, "Relative Uniform Resource
  // Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/".
  string base_url = 4;

  // The API version of the endpoint, e.g. "v1b3".
  string dataflow_api_version = 5;

  // The settings to pass to the parallel worker harness.
  WorkerSettings parallel_worker_settings = 6;

  // The location on the worker for task-specific subdirectories.
  string base_task_dir = 7;

  // Whether to continue running the taskrunner if an exception is hit.
  bool continue_on_exception = 8;

  // Whether to send taskrunner log info to the Google Compute Engine VM
  // serial console.
  bool log_to_serialconsole = 9;

  // Whether to also send taskrunner log info to stderr.
  bool alsologtostderr = 10;

  // Indicates where to put logs. If this is not specified, the logs
  // will not be uploaded.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string log_upload_location = 11;

  // The directory on the VM to store logs.
  string log_dir = 12;

  // The prefix of the resources the taskrunner should use for
  // temporary storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 13;

  // The command to launch the worker harness.
  string harness_command = 14;

  // The file to store the workflow in.
  string workflow_file_name = 15;

  // The file to store preprocessing commands in.
  string commandlines_file_name = 16;

  // The ID string of the VM.
  string vm_id = 17;

  // The suggested backend language.
  string language_hint = 18;

  // The streaming worker main class name.
  string streaming_worker_main_class = 19;
}

// Specifies what happens to a resource when a Cloud Dataflow
// [google.dataflow.v1beta3.Job][google.dataflow.v1beta3.Job] has completed.
enum TeardownPolicy {
  // The teardown policy isn't specified, or is unknown.
  TEARDOWN_POLICY_UNKNOWN = 0;

  // Always tear down the resource.
  TEARDOWN_ALWAYS = 1;

  // Tear down the resource on success. This is useful for debugging
  // failures.
  TEARDOWN_ON_SUCCESS = 2;

  // Never tear down the resource. This is useful for debugging and
  // development.
  TEARDOWN_NEVER = 3;
}

// The default set of packages to be staged on a pool of workers.
enum DefaultPackageSet {
  // The default set of packages to stage is unknown, or unspecified.
  DEFAULT_PACKAGE_SET_UNKNOWN = 0;

  // Indicates that no packages should be staged at the worker unless
  // explicitly specified by the job.
  DEFAULT_PACKAGE_SET_NONE = 1;

  // Stage packages typically useful to workers written in Java.
  DEFAULT_PACKAGE_SET_JAVA = 2;

  // Stage packages typically useful to workers written in Python.
  DEFAULT_PACKAGE_SET_PYTHON = 3;
}

// Specifies the algorithm used to determine the number of worker
// processes to run at any given point in time, based on the amount of
// data left to process, the number of workers, and how quickly
// existing workers are processing data.
enum AutoscalingAlgorithm {
  // The algorithm is unknown, or unspecified.
  AUTOSCALING_ALGORITHM_UNKNOWN = 0;

  // Disable autoscaling.
  AUTOSCALING_ALGORITHM_NONE = 1;

  // Increase worker count over time to reduce job execution time.
  AUTOSCALING_ALGORITHM_BASIC = 2;
}

// Settings for WorkerPool autoscaling.
message AutoscalingSettings {
  // The algorithm to use for autoscaling.
  AutoscalingAlgorithm algorithm = 1;

  // The maximum number of workers to cap scaling at.
  int32 max_num_workers = 2;
}

// Specifies how IP addresses should be allocated to the worker machines.
enum WorkerIPAddressConfiguration {
  // The configuration is unknown, or unspecified.
  WORKER_IP_UNSPECIFIED = 0;

  // Workers should have public IP addresses.
  WORKER_IP_PUBLIC = 1;

  // Workers should have private IP addresses.
  WORKER_IP_PRIVATE = 2;
}

// Defines an SDK harness container for executing Dataflow pipelines.
message SdkHarnessContainerImage {
  // A docker container image that resides in Google Container Registry.
  string container_image = 1;

  // If true, recommends that the Dataflow service use only one core per SDK
  // container instance with this image. If false (or unset), recommends using
  // more than one core per SDK container instance with this image for
  // efficiency. Note that the Dataflow service may choose to override this
  // property if needed.
  bool use_single_core_per_container = 2;

  // Environment ID for the Beam runner API proto Environment that corresponds
  // to the current SDK Harness.
  string environment_id = 3;

  // The set of capabilities enumerated in the above Environment proto. See also
  // https://github.com/apache/beam/blob/master/model/pipeline/src/main/proto/beam_runner_api.proto
  repeated string capabilities = 4;
}

// Describes one particular pool of Cloud Dataflow workers to be
// instantiated by the Cloud Dataflow service in order to perform the
// computations required by a job. Note that a workflow job may use
// multiple pools, in order to match the various computational
// requirements of the various stages of the job.
message WorkerPool {
  // The kind of the worker pool; currently only `harness` and `shuffle`
  // are supported.
  string kind = 1;

  // Number of Google Compute Engine workers in this pool needed to
  // execute the job. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 num_workers = 2;

  // Packages to be installed on workers.
  repeated Package packages = 3;

  // The default package set to install. This allows the service to
  // select a default set of packages which are useful to worker
  // harnesses written in a particular language.
  DefaultPackageSet default_package_set = 4;

  // Machine type (e.g. "n1-standard-1"). If empty or unspecified, the
  // service will attempt to choose a reasonable default.
  string machine_type = 5;

  // Sets the policy for determining when to turn down the worker pool.
  // Allowed values are: `TEARDOWN_ALWAYS`, `TEARDOWN_ON_SUCCESS`, and
  // `TEARDOWN_NEVER`.
  // `TEARDOWN_ALWAYS` means workers are always torn down regardless of whether
  // the job succeeds. `TEARDOWN_ON_SUCCESS` means workers are torn down
  // only if the job succeeds. `TEARDOWN_NEVER` means the workers are never
  // torn down.
  //
  // If the workers are not torn down by the service, they will
  // continue to run and use Google Compute Engine VM resources in the
  // user's project until they are explicitly terminated by the user.
  // Because of this, Google recommends using the `TEARDOWN_ALWAYS`
  // policy except for small, manually supervised test jobs.
  //
  // If unknown or unspecified, the service will attempt to choose a reasonable
  // default.
  TeardownPolicy teardown_policy = 6;

  // Size of root disk for VMs, in GB. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 disk_size_gb = 7;

  // Type of root disk for VMs. If empty or unspecified, the service will
  // attempt to choose a reasonable default.
  string disk_type = 16;

  // Fully qualified source image for disks.
  string disk_source_image = 8;

  // Zone to run the worker pools in. If empty or unspecified, the service
  // will attempt to choose a reasonable default.
  string zone = 9;

  // Settings passed through to Google Compute Engine workers when
  // using the standard Dataflow task runner. Users should ignore
  // this field.
  TaskRunnerSettings taskrunner_settings = 10;

  // The action to take on host maintenance, as defined by the Google
  // Compute Engine API.
  string on_host_maintenance = 11;

  // Data disks that are used by a VM in this workflow.
  repeated Disk data_disks = 12;

  // Metadata to set on the Google Compute Engine VMs.
  map<string, string> metadata = 13;

  // Settings for autoscaling of this WorkerPool.
  AutoscalingSettings autoscaling_settings = 14;

  // Extra arguments for this worker pool.
  google.protobuf.Any pool_args = 15;

  // Network to which VMs will be assigned. If empty or unspecified,
  // the service will use the network "default".
  string network = 17;

  // Subnetwork to which VMs will be assigned, if desired. Expected to be of
  // the form "regions/REGION/subnetworks/SUBNETWORK".
  string subnetwork = 19;

  // Required. Docker container image that executes the Cloud Dataflow worker
  // harness, residing in Google Container Registry.
  //
  // Deprecated for the Fn API path. Use sdk_harness_container_images instead.
  string worker_harness_container_image = 18;

  // The number of threads per worker harness. If empty or unspecified, the
  // service will choose a number of threads (according to the number of cores
  // on the selected machine type for batch, or 1 by convention for streaming).
  int32 num_threads_per_worker = 20;

  // Configuration for VM IPs.
  WorkerIPAddressConfiguration ip_configuration = 21;

  // Set of SDK harness containers needed to execute this pipeline. This will
  // only be set in the Fn API path. For non-cross-language pipelines this
  // should have only one entry. Cross-language pipelines will have two or more
  // entries.
  repeated SdkHarnessContainerImage sdk_harness_container_images = 22;
}
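
// Illustrative sketch only: the machine type, zone, worker counts, and
// container image below are placeholder values chosen for the example, not
// defaults defined by this API. A "harness" WorkerPool with basic
// autoscaling might look like this in textproto form:
//
//   kind: "harness"
//   num_workers: 3
//   machine_type: "n1-standard-1"
//   zone: "us-west1-a"
//   teardown_policy: TEARDOWN_ALWAYS
//   ip_configuration: WORKER_IP_PRIVATE
//   autoscaling_settings {
//     algorithm: AUTOSCALING_ALGORITHM_BASIC
//     max_num_workers: 10
//   }
//   sdk_harness_container_images {
//     container_image: "gcr.io/example-project/example-sdk-harness:latest"
//   }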

// Specifies the shuffle mode used by a
// [google.dataflow.v1beta3.Job], which determines how data is shuffled
// during processing. More details at:
// https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-shuffle
enum ShuffleMode {
  // Shuffle mode information is not available.
  SHUFFLE_MODE_UNSPECIFIED = 0;

  // Shuffle is done on the worker VMs.
  VM_BASED = 1;

  // Shuffle is done on the service side.
  SERVICE_BASED = 2;
}

// Describes any options that have an effect on the debugging of pipelines.
message DebugOptions {
  // When true, enables the logging of the literal hot key to the user's Cloud
  // Logging.
  bool enable_hot_key_logging = 1;
}