// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.dataflow.v1beta3;

import "google/api/field_behavior.proto";
import "google/protobuf/any.proto";
import "google/protobuf/struct.proto";

option csharp_namespace = "Google.Cloud.Dataflow.V1Beta3";
option go_package = "cloud.google.com/go/dataflow/apiv1beta3/dataflowpb;dataflowpb";
option java_multiple_files = true;
option java_outer_classname = "EnvironmentProto";
option java_package = "com.google.dataflow.v1beta3";
option php_namespace = "Google\\Cloud\\Dataflow\\V1beta3";
option ruby_package = "Google::Cloud::Dataflow::V1beta3";

// Describes the environment in which a Dataflow Job runs.
message Environment {
  // The prefix of the resources the system should use for temporary
  // storage. The system will append the suffix "/temp-{JOBNAME}" to
  // this resource prefix, where {JOBNAME} is the value of the
  // job_name field. The resulting bucket and object prefix is used
  // as the prefix of the resources used to store temporary data
  // needed during the job execution. NOTE: This will override the
  // value in taskrunner_settings.
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 1;

  // The type of cluster manager API to use. If unknown or
  // unspecified, the service will attempt to choose a reasonable
  // default. This should be in the form of the API service name,
  // e.g. "compute.googleapis.com".
  string cluster_manager_api_service = 2;

  // The list of experiments to enable. This field should be used for
  // SDK-related experiments and not for service-related experiments. The
  // proper field for service-related experiments is service_options.
  repeated string experiments = 3;

  // The list of service options to enable. This field should be used for
  // service-related experiments only. These experiments, when graduating to
  // GA, should be replaced by dedicated fields or become default (i.e.
  // always on).
  repeated string service_options = 16;

  // If set, contains the Cloud KMS key identifier used to encrypt data
  // at rest, also known as a Customer Managed Encryption Key (CMEK).
  //
  // Format:
  //   projects/PROJECT_ID/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY
  string service_kms_key_name = 12;

  // The worker pools. At least one "harness" worker pool must be
  // specified in order for the job to have workers.
  repeated WorkerPool worker_pools = 4;

  // A description of the process that generated the request.
  google.protobuf.Struct user_agent = 5;

  // A structure describing which components of the service, and which
  // versions of those components, are required in order to run the job.
  google.protobuf.Struct version = 6;

  // The dataset for the current project where various workflow
  // related tables are stored.
  //
  // The supported resource type is:
  //
  // Google BigQuery:
  //   bigquery.googleapis.com/{dataset}
  string dataset = 7;

  // The Cloud Dataflow SDK pipeline options specified by the user. These
  // options are passed through the service and are used to recreate the
  // SDK pipeline options on the worker in a language agnostic and platform
  // independent way.
  google.protobuf.Struct sdk_pipeline_options = 8;

  // Experimental settings.
  google.protobuf.Any internal_experiments = 9;

  // Identity to run virtual machines as. Defaults to the default account.
  string service_account_email = 10;

  // Which Flexible Resource Scheduling mode to run in.
  FlexResourceSchedulingGoal flex_resource_scheduling_goal = 11;

  // The Compute Engine region
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1". Mutually exclusive
  // with worker_zone. If neither worker_region nor worker_zone is specified,
  // defaults to the control plane's region.
  string worker_region = 13;

  // The Compute Engine zone
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1-a". Mutually exclusive
  // with worker_region. If neither worker_region nor worker_zone is specified,
  // a zone in the control plane's region is chosen based on available capacity.
  string worker_zone = 14;

  // Output only. The shuffle mode used for the job.
  ShuffleMode shuffle_mode = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Any debugging options to be supplied to the job.
  DebugOptions debug_options = 17;
}
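
// Illustrative sketch only: the bucket, project, key ring, and service
// account names below are placeholders chosen for this example, not values
// defined by this API. A minimally populated Environment might look like
// this in textproto form (note that shuffle_mode is output only and
// worker_region is mutually exclusive with worker_zone):
//
//   temp_storage_prefix: "storage.googleapis.com/example-bucket/tmp"
//   cluster_manager_api_service: "compute.googleapis.com"
//   service_kms_key_name: "projects/example-project/locations/us-central1/keyRings/example-ring/cryptoKeys/example-key"
//   service_account_email: "example-worker@example-project.iam.gserviceaccount.com"
//   flex_resource_scheduling_goal: FLEXRS_COST_OPTIMIZED
//   worker_region: "us-west1"
//   debug_options { enable_hot_key_logging: true }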

// The packages that must be installed in order for a worker to run the
// steps of the Cloud Dataflow job that will be assigned to its worker
// pool.
//
// This is the mechanism by which the Cloud Dataflow SDK causes code to
// be loaded onto the workers. For example, the Cloud Dataflow Java SDK
// might use this to install jars containing the user's code and all of the
// various dependencies (libraries, data files, etc.) required in order
// for that code to run.
message Package {
  // The name of the package.
  string name = 1;

  // The resource to read the package from. The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}
  //   bucket.storage.googleapis.com/
  string location = 2;
}

// Specifies the processing model used by a
// [google.dataflow.v1beta3.Job], which determines the way the Job is
// managed by the Cloud Dataflow service (how workers are scheduled, how
// inputs are sharded, etc).
enum JobType {
  // The type of the job is unspecified, or unknown.
  JOB_TYPE_UNKNOWN = 0;

  // A batch job with a well-defined end point: data is read, data is
  // processed, data is written, and the job is done.
  JOB_TYPE_BATCH = 1;

  // A continuously streaming job with no end: data is read,
  // processed, and written continuously.
  JOB_TYPE_STREAMING = 2;
}

// Specifies the resource to optimize for in Flexible Resource Scheduling.
enum FlexResourceSchedulingGoal {
  // Run in the default mode.
  FLEXRS_UNSPECIFIED = 0;

  // Optimize for lower execution time.
  FLEXRS_SPEED_OPTIMIZED = 1;

  // Optimize for lower cost.
  FLEXRS_COST_OPTIMIZED = 2;
}

// Describes the data disk used by a workflow job.
message Disk {
  // Size of disk in GB. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 size_gb = 1;

  // Disk storage type, as defined by Google Compute Engine. This
  // must be a disk type appropriate to the project and zone in which
  // the workers will run. If unknown or unspecified, the service
  // will attempt to choose a reasonable default.
  //
  // For example, the standard persistent disk type is a resource name
  // typically ending in "pd-standard". If SSD persistent disks are
  // available, the resource name typically ends with "pd-ssd". The
  // actual valid values are defined by the Google Compute Engine API,
  // not by the Cloud Dataflow API; consult the Google Compute Engine
  // documentation for more information about determining the set of
  // available disk types for a particular project and zone.
  //
  // Google Compute Engine Disk types are local to a particular
  // project in a particular zone, and so the resource name will
  // typically look something like this:
  //
  //   compute.googleapis.com/projects/project-id/zones/zone/diskTypes/pd-standard
  string disk_type = 2;

  // Directory in a VM where the disk is mounted.
  string mount_point = 3;
}

// Provides data to pass through to the worker harness.
message WorkerSettings {
  // The base URL for accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs. If this field is specified, it supplies the base
  // URL to use for resolving these relative URLs. The normative
  // algorithm used is defined by RFC 1808, "Relative Uniform Resource
  // Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/".
  string base_url = 1;

  // Whether to send work progress updates to the service.
  bool reporting_enabled = 2;

  // The Cloud Dataflow service path relative to the root URL, for example,
  // "dataflow/v1b3/projects".
  string service_path = 3;

  // The Shuffle service path relative to the root URL, for example,
  // "shuffle/v1beta1".
  string shuffle_service_path = 4;

  // The ID of the worker running this pipeline.
  string worker_id = 5;

  // The prefix of the resources the system should use for temporary
  // storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 6;
}
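
// Illustrative sketch only: using the documented default base_url and the
// example service paths above, a worker would resolve the service endpoints
// roughly as follows under RFC 1808 relative-URL resolution. The worker_id
// shown is a placeholder, not a value defined by this API.
//
//   base_url: "http://www.googleapis.com/"
//   service_path: "dataflow/v1b3/projects"
//     -> resolves to "http://www.googleapis.com/dataflow/v1b3/projects"
//   shuffle_service_path: "shuffle/v1beta1"
//     -> resolves to "http://www.googleapis.com/shuffle/v1beta1"
//   worker_id: "example-worker-0"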

// Taskrunner configuration settings.
message TaskRunnerSettings {
  // The UNIX user ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "root".
  string task_user = 1;

  // The UNIX group ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "wheel".
  string task_group = 2;

  // The OAuth2 scopes to be requested by the taskrunner in order to
  // access the Cloud Dataflow API.
  repeated string oauth_scopes = 3;

  // The base URL for the taskrunner to use when accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs. If this field is specified, it supplies the base
  // URL to use for resolving these relative URLs. The normative
  // algorithm used is defined by RFC 1808, "Relative Uniform Resource
  // Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/".
  string base_url = 4;

  // The API version of the endpoint, e.g. "v1b3".
  string dataflow_api_version = 5;

  // The settings to pass to the parallel worker harness.
  WorkerSettings parallel_worker_settings = 6;

  // The location on the worker for task-specific subdirectories.
  string base_task_dir = 7;

  // Whether to continue running the taskrunner if an exception is hit.
  bool continue_on_exception = 8;

  // Whether to send taskrunner log info to the Google Compute Engine VM
  // serial console.
  bool log_to_serialconsole = 9;

  // Whether to also send taskrunner log info to stderr.
  bool alsologtostderr = 10;

  // Indicates where to put logs. If this is not specified, the logs
  // will not be uploaded.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string log_upload_location = 11;

  // The directory on the VM to store logs.
  string log_dir = 12;

  // The prefix of the resources the taskrunner should use for
  // temporary storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 13;

  // The command to launch the worker harness.
  string harness_command = 14;

  // The file to store the workflow in.
  string workflow_file_name = 15;

  // The file to store preprocessing commands in.
  string commandlines_file_name = 16;

  // The ID string of the VM.
  string vm_id = 17;

  // The suggested backend language.
  string language_hint = 18;

  // The streaming worker main class name.
  string streaming_worker_main_class = 19;
}

// Specifies what happens to a resource when a Cloud Dataflow
// [google.dataflow.v1beta3.Job][google.dataflow.v1beta3.Job] has completed.
enum TeardownPolicy {
  // The teardown policy isn't specified, or is unknown.
  TEARDOWN_POLICY_UNKNOWN = 0;

  // Always tear down the resource.
  TEARDOWN_ALWAYS = 1;

  // Tear down the resource on success. This is useful for debugging
  // failures.
  TEARDOWN_ON_SUCCESS = 2;

  // Never tear down the resource. This is useful for debugging and
  // development.
  TEARDOWN_NEVER = 3;
}

// The default set of packages to be staged on a pool of workers.
enum DefaultPackageSet {
  // The default set of packages to stage is unknown, or unspecified.
  DEFAULT_PACKAGE_SET_UNKNOWN = 0;

  // Indicates that no packages should be staged at the worker unless
  // explicitly specified by the job.
  DEFAULT_PACKAGE_SET_NONE = 1;

  // Stage packages typically useful to workers written in Java.
  DEFAULT_PACKAGE_SET_JAVA = 2;

  // Stage packages typically useful to workers written in Python.
  DEFAULT_PACKAGE_SET_PYTHON = 3;
}

// Specifies the algorithm used to determine the number of worker
// processes to run at any given point in time, based on the amount of
// data left to process, the number of workers, and how quickly
// existing workers are processing data.
enum AutoscalingAlgorithm {
  // The algorithm is unknown, or unspecified.
  AUTOSCALING_ALGORITHM_UNKNOWN = 0;

  // Disable autoscaling.
  AUTOSCALING_ALGORITHM_NONE = 1;

  // Increase worker count over time to reduce job execution time.
  AUTOSCALING_ALGORITHM_BASIC = 2;
}

// Settings for WorkerPool autoscaling.
message AutoscalingSettings {
  // The algorithm to use for autoscaling.
  AutoscalingAlgorithm algorithm = 1;

  // The maximum number of workers to cap scaling at.
  int32 max_num_workers = 2;
}

// Specifies how IP addresses should be allocated to the worker machines.
enum WorkerIPAddressConfiguration {
  // The configuration is unknown, or unspecified.
  WORKER_IP_UNSPECIFIED = 0;

  // Workers should have public IP addresses.
  WORKER_IP_PUBLIC = 1;

  // Workers should have private IP addresses.
  WORKER_IP_PRIVATE = 2;
}

// Defines an SDK harness container for executing Dataflow pipelines.
message SdkHarnessContainerImage {
  // A docker container image that resides in Google Container Registry.
  string container_image = 1;

  // If true, recommends that the Dataflow service use only one core per SDK
  // container instance with this image. If false (or unset), recommends using
  // more than one core per SDK container instance with this image for
  // efficiency. Note that the Dataflow service may choose to override this
  // property if needed.
  bool use_single_core_per_container = 2;

  // Environment ID for the Beam runner API proto Environment that corresponds
  // to the current SDK Harness.
  string environment_id = 3;

  // The set of capabilities enumerated in the above Environment proto. See also
  // https://github.com/apache/beam/blob/master/model/pipeline/src/main/proto/beam_runner_api.proto
  repeated string capabilities = 4;
}

// Describes one particular pool of Cloud Dataflow workers to be
// instantiated by the Cloud Dataflow service in order to perform the
// computations required by a job. Note that a workflow job may use
// multiple pools, in order to match the various computational
// requirements of the various stages of the job.
message WorkerPool {
  // The kind of the worker pool; currently only `harness` and `shuffle`
  // are supported.
  string kind = 1;

  // Number of Google Compute Engine workers in this pool needed to
  // execute the job. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 num_workers = 2;

  // Packages to be installed on workers.
  repeated Package packages = 3;

  // The default package set to install. This allows the service to
  // select a default set of packages which are useful to worker
  // harnesses written in a particular language.
  DefaultPackageSet default_package_set = 4;

  // Machine type (e.g. "n1-standard-1"). If empty or unspecified, the
  // service will attempt to choose a reasonable default.
  string machine_type = 5;

  // Sets the policy for determining when to turn down the worker pool.
  // Allowed values are: `TEARDOWN_ALWAYS`, `TEARDOWN_ON_SUCCESS`, and
  // `TEARDOWN_NEVER`.
  // `TEARDOWN_ALWAYS` means workers are always torn down regardless of whether
  // the job succeeds. `TEARDOWN_ON_SUCCESS` means workers are torn down
  // only if the job succeeds. `TEARDOWN_NEVER` means the workers are never
  // torn down.
  //
  // If the workers are not torn down by the service, they will
  // continue to run and use Google Compute Engine VM resources in the
  // user's project until they are explicitly terminated by the user.
  // Because of this, Google recommends using the `TEARDOWN_ALWAYS`
  // policy except for small, manually supervised test jobs.
  //
  // If unknown or unspecified, the service will attempt to choose a reasonable
  // default.
  TeardownPolicy teardown_policy = 6;

  // Size of root disk for VMs, in GB. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 disk_size_gb = 7;

  // Type of root disk for VMs. If empty or unspecified, the service will
  // attempt to choose a reasonable default.
  string disk_type = 16;

  // Fully qualified source image for disks.
  string disk_source_image = 8;

  // Zone to run the worker pools in. If empty or unspecified, the service
  // will attempt to choose a reasonable default.
  string zone = 9;

  // Settings passed through to Google Compute Engine workers when
  // using the standard Dataflow task runner. Users should ignore
  // this field.
  TaskRunnerSettings taskrunner_settings = 10;

  // The action to take on host maintenance, as defined by the Google
  // Compute Engine API.
  string on_host_maintenance = 11;

  // Data disks that are used by a VM in this workflow.
  repeated Disk data_disks = 12;

  // Metadata to set on the Google Compute Engine VMs.
  map<string, string> metadata = 13;

  // Settings for autoscaling of this WorkerPool.
  AutoscalingSettings autoscaling_settings = 14;

  // Extra arguments for this worker pool.
  google.protobuf.Any pool_args = 15;

  // Network to which VMs will be assigned. If empty or unspecified,
  // the service will use the network "default".
  string network = 17;

  // Subnetwork to which VMs will be assigned, if desired. Expected to be of
  // the form "regions/REGION/subnetworks/SUBNETWORK".
  string subnetwork = 19;

  // Required. Docker container image that executes the Cloud Dataflow worker
  // harness, residing in Google Container Registry.
  //
  // Deprecated for the Fn API path. Use sdk_harness_container_images instead.
  string worker_harness_container_image = 18;

  // The number of threads per worker harness. If empty or unspecified, the
  // service will choose a number of threads (according to the number of cores
  // on the selected machine type for batch, or 1 by convention for streaming).
  int32 num_threads_per_worker = 20;

  // Configuration for VM IPs.
  WorkerIPAddressConfiguration ip_configuration = 21;

  // Set of SDK harness containers needed to execute this pipeline. This will
  // only be set in the Fn API path. For non-cross-language pipelines this
  // should have only one entry. Cross-language pipelines will have two or more
  // entries.
  repeated SdkHarnessContainerImage sdk_harness_container_images = 22;
}
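
// Illustrative sketch only: the machine type, zone, worker counts, and
// container image below are placeholder values chosen for the example, not
// defaults defined by this API. A "harness" WorkerPool with basic
// autoscaling might look like this in textproto form:
//
//   kind: "harness"
//   num_workers: 3
//   machine_type: "n1-standard-1"
//   zone: "us-west1-a"
//   teardown_policy: TEARDOWN_ALWAYS
//   ip_configuration: WORKER_IP_PRIVATE
//   autoscaling_settings {
//     algorithm: AUTOSCALING_ALGORITHM_BASIC
//     max_num_workers: 10
//   }
//   sdk_harness_container_images {
//     container_image: "gcr.io/example-project/example-sdk-harness:latest"
//   }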

// Specifies the shuffle mode used by a
// [google.dataflow.v1beta3.Job], which determines how data is shuffled
// during processing. More details at:
// https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-shuffle
enum ShuffleMode {
  // Shuffle mode information is not available.
  SHUFFLE_MODE_UNSPECIFIED = 0;

  // Shuffle is done on the worker VMs.
  VM_BASED = 1;

  // Shuffle is done on the service side.
  SERVICE_BASED = 2;
}

// Describes any options that have an effect on the debugging of pipelines.
message DebugOptions {
  // When true, enables the logging of the literal hot key to the user's Cloud
  // Logging.
  bool enable_hot_key_logging = 1;
}