// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataproc.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/dataproc/v2/apiv1/dataprocpb;dataprocpb";
option java_multiple_files = true;
option java_outer_classname = "SharedProto";
option java_package = "com.google.cloud.dataproc.v1";
option (google.api.resource_definition) = {
  type: "container.googleapis.com/Cluster"
  pattern: "projects/{project}/locations/{location}/clusters/{cluster}"
};
option (google.api.resource_definition) = {
  type: "metastore.googleapis.com/Service"
  pattern: "projects/{project}/locations/{location}/services/{service}"
};

// Runtime configuration for a workload.
message RuntimeConfig {
  // Optional. Version of the batch runtime.
  string version = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Custom container image for the job runtime environment.
  // If not specified, a default container image will be used.
  string container_image = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A mapping of property names to values, which are used to
  // configure workload execution.
  map<string, string> properties = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Dependency repository configuration.
  RepositoryConfig repository_config = 5
      [(google.api.field_behavior) = OPTIONAL];
}

// Environment configuration for a workload.
message EnvironmentConfig {
  // Optional. Execution configuration for a workload.
  ExecutionConfig execution_config = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Peripherals configuration that the workload has access to.
  PeripheralsConfig peripherals_config = 2
      [(google.api.field_behavior) = OPTIONAL];
}

// Execution configuration for a workload.
message ExecutionConfig {
  // Optional. Service account used to execute the workload.
  string service_account = 2 [(google.api.field_behavior) = OPTIONAL];

  // Network configuration for workload execution.
  oneof network {
    // Optional. Network URI to connect the workload to.
    string network_uri = 4 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Subnetwork URI to connect the workload to.
    string subnetwork_uri = 5 [(google.api.field_behavior) = OPTIONAL];
  }

  // Optional. Tags used for network traffic control.
  repeated string network_tags = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The Cloud KMS key to use for encryption.
  string kms_key = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Applies to sessions only. The duration to keep the session alive
  // while it's idling. Exceeding this threshold causes the session to
  // terminate. This field cannot be set on a batch workload.
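  // For example, a 1-hour idle timeout is represented as `"3600s"` in the
  // JSON mapping of `Duration`.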
  // Minimum value is 10 minutes; maximum value is 14 days (see JSON
  // representation of
  // [Duration](https://developers.google.com/protocol-buffers/docs/proto3#json)).
  // Defaults to 1 hour if not set.
  // If both `ttl` and `idle_ttl` are specified for an interactive session,
  // the conditions are treated as `OR` conditions: the workload will be
  // terminated when it has been idle for `idle_ttl` or when `ttl` has been
  // exceeded, whichever occurs first.
  google.protobuf.Duration idle_ttl = 8
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The duration after which the workload will be terminated,
  // specified as the JSON representation for
  // [Duration](https://protobuf.dev/programming-guides/proto3/#json).
  // When the workload exceeds this duration, it will be unconditionally
  // terminated without waiting for ongoing work to finish. If `ttl` is not
  // specified for a batch workload, the workload will be allowed to run until
  // it exits naturally (or run forever without exiting). If `ttl` is not
  // specified for an interactive session, it defaults to 24 hours. If `ttl`
  // is not specified for a batch that uses a 2.1+ runtime version, it
  // defaults to 4 hours. Minimum value is 10 minutes; maximum value is 14
  // days. If both `ttl` and `idle_ttl` are specified (for an interactive
  // session), the conditions are treated as `OR` conditions: the workload
  // will be terminated when it has been idle for `idle_ttl` or when `ttl` has
  // been exceeded, whichever occurs first.
  google.protobuf.Duration ttl = 9 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A Cloud Storage bucket used to stage workload dependencies,
  // config files, and store workload output and other ephemeral data, such as
  // Spark history files. If you do not specify a staging bucket, Cloud
  // Dataproc will determine a Cloud Storage location according to the region
  // where your workload is running, and then create and manage project-level,
  // per-location staging and temporary buckets.
  // **This field requires a Cloud Storage bucket name, not a `gs://...` URI
  // to a Cloud Storage bucket.**
  string staging_bucket = 10 [(google.api.field_behavior) = OPTIONAL];
}

// Spark History Server configuration for the workload.
message SparkHistoryServerConfig {
  // Optional. Resource name of an existing Dataproc Cluster to act as a Spark
  // History Server for the workload.
  //
  // Example:
  //
  // * `projects/[project_id]/regions/[region]/clusters/[cluster_name]`
  string dataproc_cluster = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Auxiliary services configuration for a workload.
message PeripheralsConfig {
  // Optional. Resource name of an existing Dataproc Metastore service.
  //
  // Example:
  //
  // * `projects/[project_id]/locations/[region]/services/[service_id]`
  string metastore_service = 1 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "metastore.googleapis.com/Service"
    }
  ];

  // Optional. The Spark History Server configuration for the workload.
  SparkHistoryServerConfig spark_history_server_config = 2
      [(google.api.field_behavior) = OPTIONAL];
}

// Runtime information about workload execution.
message RuntimeInfo {
  // Output only. Map of remote access endpoints (such as web interfaces and
  // APIs) to their URIs.
  map<string, string> endpoints = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A URI pointing to the location of the stdout and stderr of
  // the workload.
  string output_uri = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A URI pointing to the location of the diagnostics tarball.
  string diagnostic_output_uri = 3 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Approximate workload resource usage, calculated when
  // the workload completes (see [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  //
  // **Note:** This metric calculation may change in the future, for
  // example, to capture cumulative workload resource
  // consumption during workload execution (see the
  // [Dataproc Serverless release
  // notes](https://cloud.google.com/dataproc-serverless/docs/release-notes)
  // for announcements, changes, fixes,
  // and other Dataproc developments).
  UsageMetrics approximate_usage = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Snapshot of current workload resource usage.
  UsageSnapshot current_usage = 7 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Usage metrics represent approximate total resources consumed by a workload.
message UsageMetrics {
  // Optional. DCU (Dataproc Compute Units) usage in (`milliDCU` x `seconds`)
  // (see [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu_seconds = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle storage usage in (`GB` x `seconds`) (see
  // [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 shuffle_storage_gb_seconds = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Accelerator usage in (`milliAccelerator` x `seconds`) (see
  // [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_accelerator_seconds = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Accelerator type being used, if any.
  string accelerator_type = 4 [(google.api.field_behavior) = OPTIONAL];
}

// The usage snapshot represents the resources consumed by a workload at a
// specified time.
message UsageSnapshot {
  // Optional. Milli (one-thousandth) Dataproc Compute Units (DCUs) (see
  // [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle storage in gigabytes (GB) (see [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 shuffle_storage_gb = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Milli (one-thousandth) Dataproc Compute Units (DCUs) charged at
  // premium tier (see [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu_premium = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle storage in gigabytes (GB) charged at premium tier (see
  // [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 shuffle_storage_gb_premium = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Milli (one-thousandth) accelerator (see [Dataproc
  // Serverless pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_accelerator = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Accelerator type being used, if any.
  string accelerator_type = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The timestamp of the usage snapshot.
  google.protobuf.Timestamp snapshot_time = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// The cluster's GKE config.
message GkeClusterConfig {
  // Optional. A target GKE cluster to deploy to. It must be in the same
  // project and region as the Dataproc cluster (the GKE cluster can be zonal
  // or regional). Format:
  // 'projects/{project}/locations/{location}/clusters/{cluster_id}'
  string gke_cluster_target = 2 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "container.googleapis.com/Cluster"
    }
  ];

  // Optional. GKE node pools where workloads will be scheduled. At least one
  // node pool must be assigned the `DEFAULT`
  // [GkeNodePoolTarget.Role][google.cloud.dataproc.v1.GkeNodePoolTarget.Role].
  // If a `GkeNodePoolTarget` is not specified, Dataproc constructs a `DEFAULT`
  // `GkeNodePoolTarget`. Each role can be given to only one
  // `GkeNodePoolTarget`. All node pools must have the same location settings.
  repeated GkeNodePoolTarget node_pool_target = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// The configuration for running the Dataproc cluster on Kubernetes.
message KubernetesClusterConfig {
  // Optional. A namespace within the Kubernetes cluster to deploy into. If
  // this namespace does not exist, it is created. If it exists, Dataproc
  // verifies that another Dataproc VirtualCluster is not installed into it.
  // If not specified, the name of the Dataproc Cluster is used.
  string kubernetes_namespace = 1 [(google.api.field_behavior) = OPTIONAL];

  oneof config {
    // Required. The configuration for running the Dataproc cluster on GKE.
    GkeClusterConfig gke_cluster_config = 2
        [(google.api.field_behavior) = REQUIRED];
  }

  // Optional. The software configuration for this Dataproc cluster running on
  // Kubernetes.
  KubernetesSoftwareConfig kubernetes_software_config = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// The software configuration for this Dataproc cluster running on Kubernetes.
message KubernetesSoftwareConfig {
  // The components that should be installed in this Dataproc cluster. The key
  // must be a string from the KubernetesComponent enumeration. The value is
  // the version of the software to be installed.
  // At least one entry must be specified.
  map<string, string> component_version = 1;

  // The properties to set on daemon config files.
  //
  // Property keys are specified in `prefix:property` format, for example
  // `spark:spark.kubernetes.container.image`. The following are supported
  // prefixes and their mappings:
  //
  // * spark: `spark-defaults.conf`
  //
  // For more information, see [Cluster
  // properties](https://cloud.google.com/dataproc/docs/concepts/cluster-properties).
  map<string, string> properties = 2;
}

// GKE node pools that Dataproc workloads run on.
message GkeNodePoolTarget {
  // `Role` specifies the tasks that will run on the node pool. Roles can be
  // specific to workloads. Exactly one
  // [GkeNodePoolTarget][google.cloud.dataproc.v1.GkeNodePoolTarget] within
  // the virtual cluster must have the `DEFAULT` role, which is used to run
  // all workloads that are not associated with a node pool.
  enum Role {
    // Role is unspecified.
    ROLE_UNSPECIFIED = 0;

    // At least one node pool must have the `DEFAULT` role.
    // Work assigned to a role that is not associated with a node pool
    // is assigned to the node pool with the `DEFAULT` role. For example,
    // work assigned to the `CONTROLLER` role will be assigned to the node
    // pool with the `DEFAULT` role if no node pool has the `CONTROLLER` role.
    DEFAULT = 1;

    // Run work associated with the Dataproc control plane (for example,
    // controllers and webhooks). Very low resource requirements.
    CONTROLLER = 2;

    // Run work associated with a Spark driver of a job.
    SPARK_DRIVER = 3;

    // Run work associated with a Spark executor of a job.
    SPARK_EXECUTOR = 4;
  }

  // Required. The target GKE node pool.
  // Format:
  // 'projects/{project}/locations/{location}/clusters/{cluster}/nodePools/{node_pool}'
  string node_pool = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The roles associated with the GKE node pool.
  repeated Role roles = 2 [(google.api.field_behavior) = REQUIRED];

  // Input only. The configuration for the GKE node pool.
  //
  // If specified, Dataproc attempts to create a node pool with the
  // specified shape. If one with the same name already exists, it is
  // verified against all specified fields. If a field differs, the
  // virtual cluster creation will fail.
  //
  // If omitted, any node pool with the specified name is used. If a
  // node pool with the specified name does not exist, Dataproc creates a
  // node pool with default values.
  //
  // This is an input only field. It will not be returned by the API.
  GkeNodePoolConfig node_pool_config = 3
      [(google.api.field_behavior) = INPUT_ONLY];
}

// The configuration of a GKE node pool used by a [Dataproc-on-GKE
// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
message GkeNodePoolConfig {
  // Parameters that describe cluster nodes.
  message GkeNodeConfig {
    // Optional. The name of a Compute Engine [machine
    // type](https://cloud.google.com/compute/docs/machine-types).
    string machine_type = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The number of local SSD disks to attach to the node, which is
    // limited by the maximum number of disks allowable per zone (see [Adding
    // Local SSDs](https://cloud.google.com/compute/docs/disks/local-ssd)).
    int32 local_ssd_count = 7 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether the nodes are created as legacy [preemptible VM
    // instances](https://cloud.google.com/compute/docs/instances/preemptible).
    // Also see
    // [Spot][google.cloud.dataproc.v1.GkeNodePoolConfig.GkeNodeConfig.spot]
    // VMs, preemptible VM instances without a maximum lifetime. Legacy and
    // Spot preemptible nodes cannot be used in a node pool with the
    // `CONTROLLER`
    // [role](/dataproc/docs/reference/rest/v1/projects.regions.clusters#role)
    // or in the DEFAULT node pool if the CONTROLLER role is not assigned (the
    // DEFAULT node pool will assume the CONTROLLER role).
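    // (Unlike Spot VMs, legacy preemptible VMs are limited to a maximum
    // lifetime of 24 hours.)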
    bool preemptible = 10 [(google.api.field_behavior) = OPTIONAL];

    // Optional. A list of [hardware
    // accelerators](https://cloud.google.com/compute/docs/gpus) to attach to
    // each node.
    repeated GkeNodePoolAcceleratorConfig accelerators = 11
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. [Minimum CPU
    // platform](https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform)
    // to be used by this instance. The instance may be scheduled on the
    // specified or a newer CPU platform. Specify the friendly names of CPU
    // platforms, such as "Intel Haswell" or "Intel Sandy Bridge".
    string min_cpu_platform = 13 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The [Customer Managed Encryption Key
    // (CMEK)](https://cloud.google.com/kubernetes-engine/docs/how-to/using-cmek)
    // used to encrypt the boot disk attached to each node in the node pool.
    // Specify the key using the following format:
    // <code>projects/<var>KEY_PROJECT_ID</var>/locations/<var>LOCATION</var>/keyRings/<var>RING_NAME</var>/cryptoKeys/<var>KEY_NAME</var></code>.
    string boot_disk_kms_key = 23 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether the nodes are created as [Spot VM
    // instances](https://cloud.google.com/compute/docs/instances/spot).
    // Spot VMs are the latest update to legacy
    // [preemptible
    // VMs][google.cloud.dataproc.v1.GkeNodePoolConfig.GkeNodeConfig.preemptible].
    // Spot VMs do not have a maximum lifetime. Legacy and Spot preemptible
    // nodes cannot be used in a node pool with the `CONTROLLER`
    // [role](/dataproc/docs/reference/rest/v1/projects.regions.clusters#role)
    // or in the DEFAULT node pool if the CONTROLLER role is not assigned (the
    // DEFAULT node pool will assume the CONTROLLER role).
    bool spot = 32 [(google.api.field_behavior) = OPTIONAL];
  }

  // A GkeNodePoolAcceleratorConfig represents a hardware accelerator request
  // for a node pool.
  message GkeNodePoolAcceleratorConfig {
    // The number of accelerator cards exposed to an instance.
    int64 accelerator_count = 1;

    // The accelerator type resource name (see [GPUs on Compute
    // Engine](https://cloud.google.com/compute/docs/gpus)).
    string accelerator_type = 2;

    // Size of partitions to create on the GPU. Valid values are described in
    // the NVIDIA [MIG user
    // guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning).
    string gpu_partition_size = 3;
  }

  // GkeNodePoolAutoscalingConfig contains information the cluster autoscaler
  // needs to adjust the size of the node pool to the current cluster usage.
  message GkeNodePoolAutoscalingConfig {
    // The minimum number of nodes in the node pool. Must be >= 0 and <=
    // max_node_count.
    int32 min_node_count = 2;

    // The maximum number of nodes in the node pool. Must be >= min_node_count,
    // and must be > 0.
    // **Note:** Quota must be sufficient to scale up the cluster.
    int32 max_node_count = 3;
  }

  // Optional. The node pool configuration.
  GkeNodeConfig config = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The list of Compute Engine
  // [zones](https://cloud.google.com/compute/docs/zones#available) where
  // node pool nodes associated with a Dataproc on GKE virtual cluster
  // will be located.
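  // For example, `us-central1-a`.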
  //
  // **Note:** All node pools associated with a virtual cluster
  // must be located in the same region as the virtual cluster, and they must
  // be located in the same zone within that region.
  //
  // If a location is not specified during node pool creation, Dataproc on GKE
  // will choose the zone.
  repeated string locations = 13 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The autoscaler configuration for this node pool. The autoscaler
  // is enabled only when a valid configuration is present.
  GkeNodePoolAutoscalingConfig autoscaling = 4
      [(google.api.field_behavior) = OPTIONAL];
}

// Configuration for dependency repositories.
message RepositoryConfig {
  // Optional. Configuration for the PyPi repository.
  PyPiRepositoryConfig pypi_repository_config = 1
      [(google.api.field_behavior) = OPTIONAL];
}

// Configuration for the PyPi repository.
message PyPiRepositoryConfig {
  // Optional. PyPi repository address.
  string pypi_repository = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Cluster components that can be activated.
enum Component {
  // Unspecified component. Specifying this will cause cluster creation to
  // fail.
  COMPONENT_UNSPECIFIED = 0;

  // The Anaconda Python distribution. The Anaconda component is not supported
  // in the Dataproc [2.0
  // image](https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-release-2.0).
  // The 2.0 image is pre-installed with Miniconda.
  ANACONDA = 5;

  // Docker.
  DOCKER = 13;

  // The Druid query engine. (alpha)
  DRUID = 9;

  // Flink.
  FLINK = 14;

  // HBase. (beta)
  HBASE = 11;

  // The Hive Web HCatalog (the REST service for accessing HCatalog).
  HIVE_WEBHCAT = 3;

  // Hudi.
  HUDI = 18;

  // The Jupyter Notebook.
  JUPYTER = 1;

  // The Presto query engine.
  PRESTO = 6;

  // The Trino query engine.
  TRINO = 17;

  // The Ranger service.
  RANGER = 12;

  // The Solr service.
  SOLR = 10;

  // The Zeppelin notebook.
  ZEPPELIN = 4;

  // The ZooKeeper service.
  ZOOKEEPER = 8;
}

// Actions in response to failure of a resource associated with a cluster.
enum FailureAction {
  // When FailureAction is unspecified, failure action defaults to NO_ACTION.
  FAILURE_ACTION_UNSPECIFIED = 0;

  // Take no action on failure to create a cluster resource. NO_ACTION is the
  // default.
  NO_ACTION = 1;

  // Delete the failed cluster resource.
  DELETE = 2;
}