// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.dataflow.v1beta3;

import "google/api/field_behavior.proto";
import "google/protobuf/any.proto";
import "google/protobuf/struct.proto";

option csharp_namespace = "Google.Cloud.Dataflow.V1Beta3";
option go_package = "cloud.google.com/go/dataflow/apiv1beta3/dataflowpb;dataflowpb";
option java_multiple_files = true;
option java_outer_classname = "EnvironmentProto";
option java_package = "com.google.dataflow.v1beta3";
option php_namespace = "Google\\Cloud\\Dataflow\\V1beta3";
option ruby_package = "Google::Cloud::Dataflow::V1beta3";

// Describes the environment in which a Dataflow Job runs.
message Environment {
  // The prefix of the resources the system should use for temporary
  // storage.  The system will append the suffix "/temp-{JOBNAME}" to
  // this resource prefix, where {JOBNAME} is the value of the
  // job_name field.  The resulting bucket and object prefix is used
  // as the prefix of the resources used to store temporary data
  // needed during the job execution.  NOTE: This will override the
  // value in taskrunner_settings.
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 1;

  // The type of cluster manager API to use.  If unknown or
  // unspecified, the service will attempt to choose a reasonable
  // default.  This should be in the form of the API service name,
  // e.g. "compute.googleapis.com".
  string cluster_manager_api_service = 2;

  // The list of experiments to enable. This field should be used for SDK
  // related experiments and not for service related experiments. The proper
  // field for service related experiments is service_options.
  repeated string experiments = 3;

  // The list of service options to enable. This field should be used for
  // service related experiments only. These experiments, when graduating to GA,
  // should be replaced by dedicated fields or become default (i.e. always on).
  repeated string service_options = 16;

  // If set, contains the Cloud KMS key identifier used to encrypt data
  // at rest, AKA a Customer Managed Encryption Key (CMEK).
  //
  // Format:
  //   projects/PROJECT_ID/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY
  string service_kms_key_name = 12;

  // The worker pools. At least one "harness" worker pool must be
  // specified in order for the job to have workers.
  repeated WorkerPool worker_pools = 4;

  // A description of the process that generated the request.
  google.protobuf.Struct user_agent = 5;

  // A structure describing which components and their versions of the service
  // are required in order to run the job.
  google.protobuf.Struct version = 6;

  // The dataset for the current project where various workflow
  // related tables are stored.
  //
  // The supported resource type is:
  //
  // Google BigQuery:
  //   bigquery.googleapis.com/{dataset}
  string dataset = 7;

  // The Cloud Dataflow SDK pipeline options specified by the user. These
  // options are passed through the service and are used to recreate the
  // SDK pipeline options on the worker in a language agnostic and platform
  // independent way.
  google.protobuf.Struct sdk_pipeline_options = 8;

  // Experimental settings.
  google.protobuf.Any internal_experiments = 9;

  // Identity to run virtual machines as. Defaults to the default account.
  string service_account_email = 10;

  // Which Flexible Resource Scheduling mode to run in.
  FlexResourceSchedulingGoal flex_resource_scheduling_goal = 11;

  // The Compute Engine region
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1". Mutually exclusive
  // with worker_zone. If neither worker_region nor worker_zone is specified,
  // the default is the control plane's region.
  string worker_region = 13;

  // The Compute Engine zone
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1-a". Mutually exclusive
  // with worker_region. If neither worker_region nor worker_zone is specified,
  // a zone in the control plane's region is chosen based on available capacity.
  string worker_zone = 14;

  // Output only. The shuffle mode used for the job.
  ShuffleMode shuffle_mode = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Any debugging options to be supplied to the job.
  DebugOptions debug_options = 17;
}
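// For illustration only (not part of the API surface): a hypothetical
// Environment, shown in protobuf text format. The bucket, project, key, and
// service account names below are invented placeholders; the values follow
// the formats documented on the fields above.
//
//   temp_storage_prefix: "storage.googleapis.com/my-bucket/temp"
//   cluster_manager_api_service: "compute.googleapis.com"
//   service_kms_key_name: "projects/my-project/locations/us-central1/keyRings/my-ring/cryptoKeys/my-key"
//   service_account_email: "my-worker-sa@my-project.iam.gserviceaccount.com"
//   worker_region: "us-central1"
//   flex_resource_scheduling_goal: FLEXRS_COST_OPTIMIZED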

// The packages that must be installed in order for a worker to run the
// steps of the Cloud Dataflow job that will be assigned to its worker
// pool.
//
// This is the mechanism by which the Cloud Dataflow SDK causes code to
// be loaded onto the workers. For example, the Cloud Dataflow Java SDK
// might use this to install jars containing the user's code and all of the
// various dependencies (libraries, data files, etc.) required in order
// for that code to run.
message Package {
  // The name of the package.
  string name = 1;

  // The resource to read the package from. The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}
  //   bucket.storage.googleapis.com/
  string location = 2;
}

// Specifies the processing model used by a
// [google.dataflow.v1beta3.Job], which determines the way the Job is
// managed by the Cloud Dataflow service (how workers are scheduled, how
// inputs are sharded, etc).
enum JobType {
  // The type of the job is unspecified, or unknown.
  JOB_TYPE_UNKNOWN = 0;

  // A batch job with a well-defined end point: data is read, data is
  // processed, data is written, and the job is done.
  JOB_TYPE_BATCH = 1;

  // A continuously streaming job with no end: data is read,
  // processed, and written continuously.
  JOB_TYPE_STREAMING = 2;
}

// Specifies the resource to optimize for in Flexible Resource Scheduling.
enum FlexResourceSchedulingGoal {
  // Run in the default mode.
  FLEXRS_UNSPECIFIED = 0;

  // Optimize for lower execution time.
  FLEXRS_SPEED_OPTIMIZED = 1;

  // Optimize for lower cost.
  FLEXRS_COST_OPTIMIZED = 2;
}

// Describes the data disk used by a workflow job.
message Disk {
  // Size of disk in GB.  If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 size_gb = 1;

  // Disk storage type, as defined by Google Compute Engine.  This
  // must be a disk type appropriate to the project and zone in which
  // the workers will run.  If unknown or unspecified, the service
  // will attempt to choose a reasonable default.
  //
  // For example, the standard persistent disk type is a resource name
  // typically ending in "pd-standard".  If SSD persistent disks are
  // available, the resource name typically ends with "pd-ssd".  The
  // actual valid values are defined by the Google Compute Engine API,
  // not by the Cloud Dataflow API; consult the Google Compute Engine
  // documentation for more information about determining the set of
  // available disk types for a particular project and zone.
  //
  // Google Compute Engine Disk types are local to a particular
  // project in a particular zone, and so the resource name will
  // typically look something like this:
  //
  // compute.googleapis.com/projects/project-id/zones/zone/diskTypes/pd-standard
  string disk_type = 2;

  // Directory in a VM where disk is mounted.
  string mount_point = 3;
}
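// For illustration only (not part of the API surface): a hypothetical Disk,
// shown in protobuf text format. The project, zone, and mount point are
// invented placeholders; the disk_type follows the resource-name form
// documented above.
//
//   size_gb: 50
//   disk_type: "compute.googleapis.com/projects/my-project/zones/us-central1-a/diskTypes/pd-ssd"
//   mount_point: "/mnt/dataflow"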

// Provides data to pass through to the worker harness.
message WorkerSettings {
  // The base URL for accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs.  If this field is specified, it supplies the base
  // URL to use for resolving these relative URLs.  The normative
  // algorithm used is defined by RFC 1808, "Relative Uniform Resource
  // Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/"
  string base_url = 1;

  // Whether to send work progress updates to the service.
  bool reporting_enabled = 2;

  // The Cloud Dataflow service path relative to the root URL, for example,
  // "dataflow/v1b3/projects".
  string service_path = 3;

  // The Shuffle service path relative to the root URL, for example,
  // "shuffle/v1beta1".
  string shuffle_service_path = 4;

  // The ID of the worker running this pipeline.
  string worker_id = 5;

  // The prefix of the resources the system should use for temporary
  // storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 6;
}
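// For illustration only (not part of the API surface): hypothetical
// WorkerSettings in protobuf text format, combining the example values
// mentioned in the field comments above; the worker_id and bucket are
// invented placeholders.
//
//   base_url: "http://www.googleapis.com/"
//   reporting_enabled: true
//   service_path: "dataflow/v1b3/projects"
//   shuffle_service_path: "shuffle/v1beta1"
//   worker_id: "wk-0"
//   temp_storage_prefix: "storage.googleapis.com/my-bucket/temp"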

// Taskrunner configuration settings.
message TaskRunnerSettings {
  // The UNIX user ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "root".
  string task_user = 1;

  // The UNIX group ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "wheel".
  string task_group = 2;

  // The OAuth2 scopes to be requested by the taskrunner in order to
  // access the Cloud Dataflow API.
  repeated string oauth_scopes = 3;

  // The base URL for the taskrunner to use when accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs.  If this field is specified, it supplies the base
  // URL to use for resolving these relative URLs.  The normative
  // algorithm used is defined by RFC 1808, "Relative Uniform Resource
  // Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/"
  string base_url = 4;

  // The API version of the endpoint, e.g. "v1b3".
  string dataflow_api_version = 5;

  // The settings to pass to the parallel worker harness.
  WorkerSettings parallel_worker_settings = 6;

  // The location on the worker for task-specific subdirectories.
  string base_task_dir = 7;

  // Whether to continue taskrunner if an exception is hit.
  bool continue_on_exception = 8;

  // Whether to send taskrunner log info to Google Compute Engine VM serial
  // console.
  bool log_to_serialconsole = 9;

  // Whether to also send taskrunner log info to stderr.
  bool alsologtostderr = 10;

  // Indicates where to put logs.  If this is not specified, the logs
  // will not be uploaded.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string log_upload_location = 11;

  // The directory on the VM to store logs.
  string log_dir = 12;

  // The prefix of the resources the taskrunner should use for
  // temporary storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 13;

  // The command to launch the worker harness.
  string harness_command = 14;

  // The file to store the workflow in.
  string workflow_file_name = 15;

  // The file to store preprocessing commands in.
  string commandlines_file_name = 16;

  // The ID string of the VM.
  string vm_id = 17;

  // The suggested backend language.
  string language_hint = 18;

  // The streaming worker main class name.
  string streaming_worker_main_class = 19;
}

// Specifies what happens to a resource when a Cloud Dataflow
// [google.dataflow.v1beta3.Job][google.dataflow.v1beta3.Job] has completed.
enum TeardownPolicy {
  // The teardown policy isn't specified, or is unknown.
  TEARDOWN_POLICY_UNKNOWN = 0;

  // Always tear down the resource.
  TEARDOWN_ALWAYS = 1;

  // Tear down the resource on success. This is useful for debugging
  // failures.
  TEARDOWN_ON_SUCCESS = 2;

  // Never tear down the resource. This is useful for debugging and
  // development.
  TEARDOWN_NEVER = 3;
}

// The default set of packages to be staged on a pool of workers.
enum DefaultPackageSet {
  // The default set of packages to stage is unknown, or unspecified.
  DEFAULT_PACKAGE_SET_UNKNOWN = 0;

  // Indicates that no packages should be staged at the worker unless
  // explicitly specified by the job.
  DEFAULT_PACKAGE_SET_NONE = 1;

  // Stage packages typically useful to workers written in Java.
  DEFAULT_PACKAGE_SET_JAVA = 2;

  // Stage packages typically useful to workers written in Python.
  DEFAULT_PACKAGE_SET_PYTHON = 3;
}

// Specifies the algorithm used to determine the number of worker
// processes to run at any given point in time, based on the amount of
// data left to process, the number of workers, and how quickly
// existing workers are processing data.
enum AutoscalingAlgorithm {
  // The algorithm is unknown, or unspecified.
  AUTOSCALING_ALGORITHM_UNKNOWN = 0;

  // Disable autoscaling.
  AUTOSCALING_ALGORITHM_NONE = 1;

  // Increase worker count over time to reduce job execution time.
  AUTOSCALING_ALGORITHM_BASIC = 2;
}

// Settings for WorkerPool autoscaling.
message AutoscalingSettings {
  // The algorithm to use for autoscaling.
  AutoscalingAlgorithm algorithm = 1;

  // The maximum number of workers to cap scaling at.
  int32 max_num_workers = 2;
}
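// For illustration only (not part of the API surface): hypothetical
// AutoscalingSettings in protobuf text format, capping basic autoscaling at
// an arbitrarily chosen 50 workers.
//
//   algorithm: AUTOSCALING_ALGORITHM_BASIC
//   max_num_workers: 50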

// Specifies how IP addresses should be allocated to the worker machines.
enum WorkerIPAddressConfiguration {
  // The configuration is unknown, or unspecified.
  WORKER_IP_UNSPECIFIED = 0;

  // Workers should have public IP addresses.
  WORKER_IP_PUBLIC = 1;

  // Workers should have private IP addresses.
  WORKER_IP_PRIVATE = 2;
}

// Defines an SDK harness container for executing Dataflow pipelines.
message SdkHarnessContainerImage {
  // A docker container image that resides in Google Container Registry.
  string container_image = 1;

  // If true, recommends that the Dataflow service use only one core per SDK
  // container instance with this image. If false (or unset), recommends using
  // more than one core per SDK container instance with this image for
  // efficiency. Note that the Dataflow service may choose to override this
  // property if needed.
  bool use_single_core_per_container = 2;

  // Environment ID for the Beam runner API proto Environment that corresponds
  // to the current SDK Harness.
  string environment_id = 3;

  // The set of capabilities enumerated in the above Environment proto. See also
  // https://github.com/apache/beam/blob/master/model/pipeline/src/main/proto/beam_runner_api.proto
  repeated string capabilities = 4;
}
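// For illustration only (not part of the API surface): a hypothetical
// SdkHarnessContainerImage in protobuf text format. The image path and
// environment ID are invented placeholders, and the capability URN shown is
// just one example of the URNs defined in the Beam runner API proto linked
// above.
//
//   container_image: "gcr.io/my-project/beam-sdk-harness:latest"
//   use_single_core_per_container: false
//   environment_id: "environment_1"
//   capabilities: "beam:coder:bytes:v1"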

// Describes one particular pool of Cloud Dataflow workers to be
// instantiated by the Cloud Dataflow service in order to perform the
// computations required by a job.  Note that a workflow job may use
// multiple pools, in order to match the various computational
// requirements of the various stages of the job.
message WorkerPool {
  // The kind of the worker pool; currently only `harness` and `shuffle`
  // are supported.
  string kind = 1;

  // Number of Google Compute Engine workers in this pool needed to
  // execute the job.  If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 num_workers = 2;

  // Packages to be installed on workers.
  repeated Package packages = 3;

  // The default package set to install.  This allows the service to
  // select a default set of packages which are useful to worker
  // harnesses written in a particular language.
  DefaultPackageSet default_package_set = 4;

  // Machine type (e.g. "n1-standard-1").  If empty or unspecified, the
  // service will attempt to choose a reasonable default.
  string machine_type = 5;

  // Sets the policy for determining when to turn down the worker pool.
  // Allowed values are: `TEARDOWN_ALWAYS`, `TEARDOWN_ON_SUCCESS`, and
  // `TEARDOWN_NEVER`.
  // `TEARDOWN_ALWAYS` means workers are always torn down regardless of whether
  // the job succeeds. `TEARDOWN_ON_SUCCESS` means workers are torn down
  // if the job succeeds. `TEARDOWN_NEVER` means the workers are never torn
  // down.
  //
  // If the workers are not torn down by the service, they will
  // continue to run and use Google Compute Engine VM resources in the
  // user's project until they are explicitly terminated by the user.
  // Because of this, Google recommends using the `TEARDOWN_ALWAYS`
  // policy except for small, manually supervised test jobs.
  //
  // If unknown or unspecified, the service will attempt to choose a reasonable
  // default.
  TeardownPolicy teardown_policy = 6;

  // Size of root disk for VMs, in GB.  If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 disk_size_gb = 7;

  // Type of root disk for VMs.  If empty or unspecified, the service will
  // attempt to choose a reasonable default.
  string disk_type = 16;

  // Fully qualified source image for disks.
  string disk_source_image = 8;

  // Zone to run the worker pools in.  If empty or unspecified, the service
  // will attempt to choose a reasonable default.
  string zone = 9;

  // Settings passed through to Google Compute Engine workers when
  // using the standard Dataflow task runner.  Users should ignore
  // this field.
  TaskRunnerSettings taskrunner_settings = 10;

  // The action to take on host maintenance, as defined by the Google
  // Compute Engine API.
  string on_host_maintenance = 11;

  // Data disks that are used by a VM in this workflow.
  repeated Disk data_disks = 12;

  // Metadata to set on the Google Compute Engine VMs.
  map<string, string> metadata = 13;

  // Settings for autoscaling of this WorkerPool.
  AutoscalingSettings autoscaling_settings = 14;

  // Extra arguments for this worker pool.
  google.protobuf.Any pool_args = 15;

  // Network to which VMs will be assigned.  If empty or unspecified,
  // the service will use the network "default".
  string network = 17;

  // Subnetwork to which VMs will be assigned, if desired.  Expected to be of
  // the form "regions/REGION/subnetworks/SUBNETWORK".
  string subnetwork = 19;

  // Required. Docker container image that executes the Cloud Dataflow worker
  // harness, residing in Google Container Registry.
  //
  // Deprecated for the Fn API path. Use sdk_harness_container_images instead.
  string worker_harness_container_image = 18;

  // The number of threads per worker harness. If empty or unspecified, the
  // service will choose a number of threads (according to the number of cores
  // on the selected machine type for batch, or 1 by convention for streaming).
  int32 num_threads_per_worker = 20;

  // Configuration for VM IPs.
  WorkerIPAddressConfiguration ip_configuration = 21;

  // Set of SDK harness containers needed to execute this pipeline. This will
  // only be set in the Fn API path. For non-cross-language pipelines this
  // should have only one entry. Cross-language pipelines will have two or more
  // entries.
  repeated SdkHarnessContainerImage sdk_harness_container_images = 22;
}
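// For illustration only (not part of the API surface): a hypothetical
// "harness" WorkerPool in protobuf text format. The machine type, zone,
// subnetwork, and worker counts are placeholders that follow the formats
// documented on the fields above.
//
//   kind: "harness"
//   num_workers: 3
//   machine_type: "n1-standard-1"
//   zone: "us-central1-a"
//   teardown_policy: TEARDOWN_ALWAYS
//   subnetwork: "regions/us-central1/subnetworks/my-subnetwork"
//   ip_configuration: WORKER_IP_PRIVATE
//   autoscaling_settings {
//     algorithm: AUTOSCALING_ALGORITHM_BASIC
//     max_num_workers: 10
//   }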

// Specifies the shuffle mode used by a
// [google.dataflow.v1beta3.Job], which determines how data is shuffled
// during processing. For more details, see:
// https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-shuffle
enum ShuffleMode {
  // Shuffle mode information is not available.
  SHUFFLE_MODE_UNSPECIFIED = 0;

  // Shuffle is done on the worker VMs.
  VM_BASED = 1;

  // Shuffle is done on the service side.
  SERVICE_BASED = 2;
}

// Describes any options that have an effect on the debugging of pipelines.
message DebugOptions {
  // When true, enables the logging of the literal hot key to the user's Cloud
  // Logging.
  bool enable_hot_key_logging = 1;
}