// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataproc.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/dataproc/v2/apiv1/dataprocpb;dataprocpb";
option java_multiple_files = true;
option java_outer_classname = "SharedProto";
option java_package = "com.google.cloud.dataproc.v1";
option (google.api.resource_definition) = {
  type: "container.googleapis.com/Cluster"
  pattern: "projects/{project}/locations/{location}/clusters/{cluster}"
};
option (google.api.resource_definition) = {
  type: "metastore.googleapis.com/Service"
  pattern: "projects/{project}/locations/{location}/services/{service}"
};

// Runtime configuration for a workload.
message RuntimeConfig {
  // Optional. Version of the batch runtime.
  string version = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Custom container image for the job runtime environment.
  // If not specified, a default container image will be used.
  string container_image = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A mapping of property names to values, which are used to
  // configure workload execution.
  map<string, string> properties = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Dependency repository configuration.
  RepositoryConfig repository_config = 5
      [(google.api.field_behavior) = OPTIONAL];
}
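
// Illustrative sketch (not part of the upstream schema): a RuntimeConfig in
// textproto form. The runtime version, container image path, property
// key/value, and repository URL below are hypothetical placeholders.
//
//   version: "2.1"
//   container_image: "us-docker.pkg.dev/my-project/my-repo/custom-spark:latest"
//   properties { key: "spark.executor.memory" value: "4g" }
//   repository_config {
//     pypi_repository_config { pypi_repository: "https://pypi.example.org/simple" }
//   }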

// Environment configuration for a workload.
message EnvironmentConfig {
  // Optional. Execution configuration for a workload.
  ExecutionConfig execution_config = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Peripherals configuration that the workload has access to.
  PeripheralsConfig peripherals_config = 2
      [(google.api.field_behavior) = OPTIONAL];
}

// Execution configuration for a workload.
message ExecutionConfig {
  // Optional. Service account used to execute the workload.
  string service_account = 2 [(google.api.field_behavior) = OPTIONAL];

  // Network configuration for workload execution.
  oneof network {
    // Optional. Network URI to connect workload to.
    string network_uri = 4 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Subnetwork URI to connect workload to.
    string subnetwork_uri = 5 [(google.api.field_behavior) = OPTIONAL];
  }

  // Optional. Tags used for network traffic control.
  repeated string network_tags = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The Cloud KMS key to use for encryption.
  string kms_key = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Applies to sessions only. The duration to keep the session alive
  // while it's idling. Exceeding this threshold causes the session to
  // terminate. This field cannot be set on a batch workload. Minimum value is
  // 10 minutes; maximum value is 14 days (see JSON representation of
  // [Duration](https://developers.google.com/protocol-buffers/docs/proto3#json)).
  // Defaults to 1 hour if not set.
  // If both `ttl` and `idle_ttl` are specified for an interactive session,
  // the conditions are treated as `OR` conditions: the workload will be
  // terminated when it has been idle for `idle_ttl` or when `ttl` has been
  // exceeded, whichever occurs first.
  google.protobuf.Duration idle_ttl = 8
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The duration after which the workload will be terminated,
  // specified as the JSON representation for
  // [Duration](https://protobuf.dev/programming-guides/proto3/#json).
  // When the workload exceeds this duration, it will be unconditionally
  // terminated without waiting for ongoing work to finish. If `ttl` is not
  // specified for a batch workload, the workload will be allowed to run until
  // it exits naturally (or run forever without exiting). If `ttl` is not
  // specified for an interactive session, it defaults to 24 hours. If `ttl` is
  // not specified for a batch that uses 2.1+ runtime version, it defaults to 4
  // hours. Minimum value is 10 minutes; maximum value is 14 days. If both `ttl`
  // and `idle_ttl` are specified (for an interactive session), the conditions
  // are treated as `OR` conditions: the workload will be terminated when it has
  // been idle for `idle_ttl` or when `ttl` has been exceeded, whichever occurs
  // first.
  google.protobuf.Duration ttl = 9 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A Cloud Storage bucket used to stage workload dependencies,
  // config files, and store workload output and other ephemeral data, such as
  // Spark history files. If you do not specify a staging bucket, Cloud Dataproc
  // will determine a Cloud Storage location according to the region where your
  // workload is running, and then create and manage project-level, per-location
  // staging and temporary buckets.
  // **This field requires a Cloud Storage bucket name, not a `gs://...` URI to
  // a Cloud Storage bucket.**
  string staging_bucket = 10 [(google.api.field_behavior) = OPTIONAL];
}
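
// Illustrative sketch (not part of the upstream schema): an ExecutionConfig in
// textproto form for an interactive session. The resource names are
// hypothetical; note that `staging_bucket` takes a bare bucket name rather
// than a `gs://` URI, and `idle_ttl`/`ttl` are protobuf Durations.
//
//   service_account: "workload-sa@my-project.iam.gserviceaccount.com"
//   subnetwork_uri: "projects/my-project/regions/us-central1/subnetworks/default"
//   network_tags: "dataproc-serverless"
//   idle_ttl { seconds: 3600 }   # terminate after 1 hour of idle time
//   ttl { seconds: 86400 }       # hard cap of 24 hours
//   staging_bucket: "my-staging-bucket"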

// Spark History Server configuration for the workload.
message SparkHistoryServerConfig {
  // Optional. Resource name of an existing Dataproc Cluster to act as a Spark
  // History Server for the workload.
  //
  // Example:
  //
  // * `projects/[project_id]/regions/[region]/clusters/[cluster_name]`
  string dataproc_cluster = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Auxiliary services configuration for a workload.
message PeripheralsConfig {
  // Optional. Resource name of an existing Dataproc Metastore service.
  //
  // Example:
  //
  // * `projects/[project_id]/locations/[region]/services/[service_id]`
  string metastore_service = 1 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "metastore.googleapis.com/Service"
    }
  ];

  // Optional. The Spark History Server configuration for the workload.
  SparkHistoryServerConfig spark_history_server_config = 2
      [(google.api.field_behavior) = OPTIONAL];
}
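
// Illustrative sketch (not part of the upstream schema): a PeripheralsConfig
// in textproto form, following the resource-name formats documented above.
// The project, region, and service/cluster IDs are hypothetical.
//
//   metastore_service: "projects/my-project/locations/us-central1/services/my-metastore"
//   spark_history_server_config {
//     dataproc_cluster: "projects/my-project/regions/us-central1/clusters/my-phs-cluster"
//   }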

// Runtime information about workload execution.
message RuntimeInfo {
  // Output only. Map of remote access endpoints (such as web interfaces and
  // APIs) to their URIs.
  map<string, string> endpoints = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A URI pointing to the location of the stdout and stderr of the
  // workload.
  string output_uri = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A URI pointing to the location of the diagnostics tarball.
  string diagnostic_output_uri = 3 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Approximate workload resource usage, calculated when
  // the workload completes (see [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  //
  // **Note:** This metric calculation may change in the future, for
  // example, to capture cumulative workload resource
  // consumption during workload execution (see the
  // [Dataproc Serverless release notes]
  // (https://cloud.google.com/dataproc-serverless/docs/release-notes)
  // for announcements, changes, fixes
  // and other Dataproc developments).
  UsageMetrics approximate_usage = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Snapshot of current workload resource usage.
  UsageSnapshot current_usage = 7 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Usage metrics represent approximate total resources consumed by a workload.
message UsageMetrics {
  // Optional. DCU (Dataproc Compute Units) usage in (`milliDCU` x `seconds`)
  // (see [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu_seconds = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle storage usage in (`GB` x `seconds`) (see
  // [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 shuffle_storage_gb_seconds = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Accelerator usage in (`milliAccelerator` x `seconds`) (see
  // [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_accelerator_seconds = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Accelerator type being used, if any.
  string accelerator_type = 4 [(google.api.field_behavior) = OPTIONAL];
}
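
// Illustrative sketch (not part of the upstream schema): hypothetical
// UsageMetrics figures in textproto form. A workload that held 4 DCUs
// (4000 milliDCU) and 400 GB of shuffle storage for one hour (3600 seconds)
// would report:
//
//   milli_dcu_seconds: 14400000          # 4000 milliDCU * 3600 s
//   shuffle_storage_gb_seconds: 1440000  # 400 GB * 3600 s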

// The usage snapshot represents the resources consumed by a workload at a
// specified time.
message UsageSnapshot {
  // Optional. Milli (one-thousandth) Dataproc Compute Units (DCUs) (see
  // [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle Storage in gigabytes (GB). (see [Dataproc Serverless
  // pricing] (https://cloud.google.com/dataproc-serverless/pricing))
  int64 shuffle_storage_gb = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Milli (one-thousandth) Dataproc Compute Units (DCUs) charged at
  // premium tier (see [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu_premium = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle Storage in gigabytes (GB) charged at premium tier. (see
  // [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing))
  int64 shuffle_storage_gb_premium = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Milli (one-thousandth) accelerator. (see [Dataproc
  // Serverless pricing] (https://cloud.google.com/dataproc-serverless/pricing))
  int64 milli_accelerator = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Accelerator type being used, if any.
  string accelerator_type = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The timestamp of the usage snapshot.
  google.protobuf.Timestamp snapshot_time = 3
      [(google.api.field_behavior) = OPTIONAL];
}
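
// Illustrative sketch (not part of the upstream schema): a hypothetical
// UsageSnapshot in textproto form. Note the milli units: `milli_dcu: 4000`
// means 4 DCUs were in use at the snapshot time.
//
//   milli_dcu: 4000
//   shuffle_storage_gb: 400
//   snapshot_time { seconds: 1700000000 }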

// The cluster's GKE config.
message GkeClusterConfig {
  // Optional. A target GKE cluster to deploy to. It must be in the same project
  // and region as the Dataproc cluster (the GKE cluster can be zonal or
  // regional). Format:
  // 'projects/{project}/locations/{location}/clusters/{cluster_id}'
  string gke_cluster_target = 2 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "container.googleapis.com/Cluster"
    }
  ];

  // Optional. GKE node pools where workloads will be scheduled. At least one
  // node pool must be assigned the `DEFAULT`
  // [GkeNodePoolTarget.Role][google.cloud.dataproc.v1.GkeNodePoolTarget.Role].
  // If a `GkeNodePoolTarget` is not specified, Dataproc constructs a `DEFAULT`
  // `GkeNodePoolTarget`. Each role can be given to only one
  // `GkeNodePoolTarget`. All node pools must have the same location settings.
  repeated GkeNodePoolTarget node_pool_target = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// The configuration for running the Dataproc cluster on Kubernetes.
message KubernetesClusterConfig {
  // Optional. A namespace within the Kubernetes cluster to deploy into. If this
  // namespace does not exist, it is created. If it exists, Dataproc verifies
  // that another Dataproc VirtualCluster is not installed into it. If not
  // specified, the name of the Dataproc Cluster is used.
  string kubernetes_namespace = 1 [(google.api.field_behavior) = OPTIONAL];

  oneof config {
    // Required. The configuration for running the Dataproc cluster on GKE.
    GkeClusterConfig gke_cluster_config = 2
        [(google.api.field_behavior) = REQUIRED];
  }

  // Optional. The software configuration for this Dataproc cluster running on
  // Kubernetes.
  KubernetesSoftwareConfig kubernetes_software_config = 3
      [(google.api.field_behavior) = OPTIONAL];
}
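
// Illustrative sketch (not part of the upstream schema): a
// KubernetesClusterConfig in textproto form that points the virtual cluster
// at an existing GKE cluster. The namespace, project, and cluster names are
// hypothetical.
//
//   kubernetes_namespace: "dataproc-workloads"
//   gke_cluster_config {
//     gke_cluster_target: "projects/my-project/locations/us-central1/clusters/my-gke-cluster"
//   }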

// The software configuration for this Dataproc cluster running on Kubernetes.
message KubernetesSoftwareConfig {
  // The components that should be installed in this Dataproc cluster. The key
  // must be a string from the KubernetesComponent enumeration. The value is
  // the version of the software to be installed.
  // At least one entry must be specified.
  map<string, string> component_version = 1;

  // The properties to set on daemon config files.
  //
  // Property keys are specified in `prefix:property` format, for example
  // `spark:spark.kubernetes.container.image`. The following are supported
  // prefixes and their mappings:
  //
  // * spark:  `spark-defaults.conf`
  //
  // For more information, see [Cluster
  // properties](https://cloud.google.com/dataproc/docs/concepts/cluster-properties).
  map<string, string> properties = 2;
}
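
// Illustrative sketch (not part of the upstream schema): a
// KubernetesSoftwareConfig in textproto form. The component key follows the
// KubernetesComponent enumeration; the version string and container image are
// hypothetical placeholders.
//
//   component_version { key: "SPARK" value: "3.1-dataproc-14" }
//   properties {
//     key: "spark:spark.kubernetes.container.image"
//     value: "us-docker.pkg.dev/my-project/my-repo/spark:latest"
//   }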

// GKE node pools that Dataproc workloads run on.
message GkeNodePoolTarget {
  // `Role` specifies the tasks that will run on the node pool. Roles can be
  // specific to workloads. Exactly one
  // [GkeNodePoolTarget][google.cloud.dataproc.v1.GkeNodePoolTarget] within the
  // virtual cluster must have the `DEFAULT` role, which is used to run all
  // workloads that are not associated with a node pool.
  enum Role {
    // Role is unspecified.
    ROLE_UNSPECIFIED = 0;

    // At least one node pool must have the `DEFAULT` role.
    // Work assigned to a role that is not associated with a node pool
    // is assigned to the node pool with the `DEFAULT` role. For example,
    // work assigned to the `CONTROLLER` role will be assigned to the node pool
    // with the `DEFAULT` role if no node pool has the `CONTROLLER` role.
    DEFAULT = 1;

    // Run work associated with the Dataproc control plane (for example,
    // controllers and webhooks). Very low resource requirements.
    CONTROLLER = 2;

    // Run work associated with a Spark driver of a job.
    SPARK_DRIVER = 3;

    // Run work associated with a Spark executor of a job.
    SPARK_EXECUTOR = 4;
  }

  // Required. The target GKE node pool.
  // Format:
  // 'projects/{project}/locations/{location}/clusters/{cluster}/nodePools/{node_pool}'
  string node_pool = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The roles associated with the GKE node pool.
  repeated Role roles = 2 [(google.api.field_behavior) = REQUIRED];

  // Input only. The configuration for the GKE node pool.
  //
  // If specified, Dataproc attempts to create a node pool with the
  // specified shape. If one with the same name already exists, it is
  // verified against all specified fields. If a field differs, the
  // virtual cluster creation will fail.
  //
  // If omitted, any node pool with the specified name is used. If a
  // node pool with the specified name does not exist, Dataproc creates a
  // node pool with default values.
  //
  // This is an input only field. It will not be returned by the API.
  GkeNodePoolConfig node_pool_config = 3
      [(google.api.field_behavior) = INPUT_ONLY];
}
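
// Illustrative sketch (not part of the upstream schema): two GkeNodePoolTarget
// entries in textproto form that split Spark driver and executor work across
// separate node pools. Resource names are hypothetical.
//
//   node_pool_target {
//     node_pool: "projects/my-project/locations/us-central1/clusters/my-gke-cluster/nodePools/driver-pool"
//     roles: DEFAULT
//     roles: SPARK_DRIVER
//   }
//   node_pool_target {
//     node_pool: "projects/my-project/locations/us-central1/clusters/my-gke-cluster/nodePools/executor-pool"
//     roles: SPARK_EXECUTOR
//   }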

// The configuration of a GKE node pool used by a [Dataproc-on-GKE
// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
message GkeNodePoolConfig {
  // Parameters that describe cluster nodes.
  message GkeNodeConfig {
    // Optional. The name of a Compute Engine [machine
    // type](https://cloud.google.com/compute/docs/machine-types).
    string machine_type = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The number of local SSD disks to attach to the node, which is
    // limited by the maximum number of disks allowable per zone (see [Adding
    // Local SSDs](https://cloud.google.com/compute/docs/disks/local-ssd)).
    int32 local_ssd_count = 7 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether the nodes are created as legacy [preemptible VM
    // instances] (https://cloud.google.com/compute/docs/instances/preemptible).
    // Also see
    // [Spot][google.cloud.dataproc.v1.GkeNodePoolConfig.GkeNodeConfig.spot]
    // VMs, preemptible VM instances without a maximum lifetime. Legacy and Spot
    // preemptible nodes cannot be used in a node pool with the `CONTROLLER`
    // [role]
    // (/dataproc/docs/reference/rest/v1/projects.regions.clusters#role)
    // or in the DEFAULT node pool if the CONTROLLER role is not assigned (the
    // DEFAULT node pool will assume the CONTROLLER role).
    bool preemptible = 10 [(google.api.field_behavior) = OPTIONAL];

    // Optional. A list of [hardware
    // accelerators](https://cloud.google.com/compute/docs/gpus) to attach to
    // each node.
    repeated GkeNodePoolAcceleratorConfig accelerators = 11
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. [Minimum CPU
    // platform](https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform)
    // to be used by this instance. The instance may be scheduled on the
    // specified or a newer CPU platform. Specify the friendly names of CPU
    // platforms, such as "Intel Haswell" or "Intel Sandy Bridge".
    string min_cpu_platform = 13 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The [Customer Managed Encryption Key (CMEK)]
    // (https://cloud.google.com/kubernetes-engine/docs/how-to/using-cmek)
    // used to encrypt the boot disk attached to each node in the node pool.
    // Specify the key using the following format:
    // <code>projects/<var>KEY_PROJECT_ID</var>/locations/<var>LOCATION</var>/keyRings/<var>RING_NAME</var>/cryptoKeys/<var>KEY_NAME</var></code>.
    string boot_disk_kms_key = 23 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether the nodes are created as [Spot VM instances]
    // (https://cloud.google.com/compute/docs/instances/spot).
    // Spot VMs are the latest update to legacy
    // [preemptible
    // VMs][google.cloud.dataproc.v1.GkeNodePoolConfig.GkeNodeConfig.preemptible].
    // Spot VMs do not have a maximum lifetime. Legacy and Spot preemptible
    // nodes cannot be used in a node pool with the `CONTROLLER`
    // [role](/dataproc/docs/reference/rest/v1/projects.regions.clusters#role)
    // or in the DEFAULT node pool if the CONTROLLER role is not assigned (the
    // DEFAULT node pool will assume the CONTROLLER role).
    bool spot = 32 [(google.api.field_behavior) = OPTIONAL];
  }

  // A GkeNodePoolAcceleratorConfig represents a Hardware Accelerator request
  // for a node pool.
  message GkeNodePoolAcceleratorConfig {
    // The number of accelerator cards exposed to an instance.
    int64 accelerator_count = 1;

    // The accelerator type resource name (see GPUs on Compute Engine).
    string accelerator_type = 2;

    // Size of partitions to create on the GPU. Valid values are described in
    // the NVIDIA [mig user
    // guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning).
    string gpu_partition_size = 3;
  }

  // GkeNodePoolAutoscaling contains information the cluster autoscaler needs to
  // adjust the size of the node pool to the current cluster usage.
  message GkeNodePoolAutoscalingConfig {
    // The minimum number of nodes in the node pool. Must be >= 0 and <=
    // max_node_count.
    int32 min_node_count = 2;

    // The maximum number of nodes in the node pool. Must be >= min_node_count,
    // and must be > 0.
    // **Note:** Quota must be sufficient to scale up the cluster.
    int32 max_node_count = 3;
  }

  // Optional. The node pool configuration.
  GkeNodeConfig config = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The list of Compute Engine
  // [zones](https://cloud.google.com/compute/docs/zones#available) where
  // node pool nodes associated with a Dataproc on GKE virtual cluster
  // will be located.
  //
  // **Note:** All node pools associated with a virtual cluster
  // must be located in the same region as the virtual cluster, and they must
  // be located in the same zone within that region.
  //
  // If a location is not specified during node pool creation, Dataproc on GKE
  // will choose the zone.
  repeated string locations = 13 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The autoscaler configuration for this node pool. The autoscaler
  // is enabled only when a valid configuration is present.
  GkeNodePoolAutoscalingConfig autoscaling = 4
      [(google.api.field_behavior) = OPTIONAL];
}
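
// Illustrative sketch (not part of the upstream schema): a GkeNodePoolConfig
// in textproto form. Machine type, zone, and node counts are hypothetical
// placeholders.
//
//   config {
//     machine_type: "n1-standard-8"
//     local_ssd_count: 1
//     spot: true
//   }
//   locations: "us-central1-a"
//   autoscaling {
//     min_node_count: 0
//     max_node_count: 10
//   }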

// Configuration for dependency repositories
message RepositoryConfig {
  // Optional. Configuration for PyPi repository.
  PyPiRepositoryConfig pypi_repository_config = 1
      [(google.api.field_behavior) = OPTIONAL];
}

// Configuration for PyPi repository
message PyPiRepositoryConfig {
  // Optional. PyPi repository address
  string pypi_repository = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Cluster components that can be activated.
enum Component {
  // Unspecified component. Specifying this will cause Cluster creation to fail.
  COMPONENT_UNSPECIFIED = 0;

  // The Anaconda python distribution. The Anaconda component is not supported
  // in the Dataproc [2.0 image]
  // (https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-release-2.0).
  // The 2.0 image is pre-installed with Miniconda.
  ANACONDA = 5;

  // Docker
  DOCKER = 13;

  // The Druid query engine. (alpha)
  DRUID = 9;

  // Flink
  FLINK = 14;

  // HBase. (beta)
  HBASE = 11;

  // The Hive Web HCatalog (the REST service for accessing HCatalog).
  HIVE_WEBHCAT = 3;

  // Hudi.
  HUDI = 18;

  // The Jupyter Notebook.
  JUPYTER = 1;

  // The Presto query engine.
  PRESTO = 6;

  // The Trino query engine.
  TRINO = 17;

  // The Ranger service.
  RANGER = 12;

  // The Solr service.
  SOLR = 10;

  // The Zeppelin notebook.
  ZEPPELIN = 4;

  // The Zookeeper service.
  ZOOKEEPER = 8;
}

// Actions in response to failure of a resource associated with a cluster.
enum FailureAction {
  // When FailureAction is unspecified, failure action defaults to NO_ACTION.
  FAILURE_ACTION_UNSPECIFIED = 0;

  // Take no action on failure to create a cluster resource. NO_ACTION is the
  // default.
  NO_ACTION = 1;

  // Delete the failed cluster resource.
  DELETE = 2;
}
543