// batch/v1/job.proto

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// This file uses the proto3 syntax.
syntax = "proto3";

// Every message and enum declared below belongs to this package.
package google.cloud.batch.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/batch/v1/task.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";

// Per-language code generation settings (namespaces, package paths, outer
// class name and class prefix) for code generated from this file.
option csharp_namespace = "Google.Cloud.Batch.V1";
option go_package = "cloud.google.com/go/batch/apiv1/batchpb;batchpb";
option java_multiple_files = true;
option java_outer_classname = "JobProto";
option java_package = "com.google.cloud.batch.v1";
option objc_class_prefix = "GCB";
option php_namespace = "Google\\Cloud\\Batch\\V1";
option ruby_package = "Google::Cloud::Batch::V1";

// The Cloud Batch Job description.
//
// A Job is exposed as the `batch.googleapis.com/Job` resource and is named
// with the pattern `projects/{project}/locations/{location}/jobs/{job}`.
message Job {
  option (google.api.resource) = {
    type: "batch.googleapis.com/Job"
    pattern: "projects/{project}/locations/{location}/jobs/{job}"
  };

  // NOTE(review): field numbers 5, 6 and 10 are not used by this message. If
  // they belonged to removed fields they should be declared `reserved` so
  // that they are never reused; TODO confirm against the schema history.

  // Output only. Job name.
  // For example: "projects/123456/locations/us-central1/jobs/job01".
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A system generated unique ID for the Job.
  string uid = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Priority of the Job.
  // The valid value range is [0, 100), that is, 0 through 99 inclusive.
  // Default value is 0.
  // Higher value indicates higher priority.
  // A job with higher priority value is more likely to run earlier if all other
  // requirements are satisfied.
  int64 priority = 3;

  // Required. TaskGroups in the Job. Only one TaskGroup is supported now.
  repeated TaskGroup task_groups = 4 [(google.api.field_behavior) = REQUIRED];

  // Compute resource allocation for all TaskGroups in the Job.
  AllocationPolicy allocation_policy = 7;

  // Labels for the Job. Labels could be user provided or system generated.
  // For example,
  // "labels": {
  //    "department": "finance",
  //    "environment": "test"
  //  }
  // You can assign up to 64 labels. [Google Compute Engine label
  // restrictions](https://cloud.google.com/compute/docs/labeling-resources#restrictions)
  // apply.
  // Label names that start with "goog-" or "google-" are reserved.
  map<string, string> labels = 8;

  // Output only. Job status. It is read only for users.
  JobStatus status = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. When the Job was created.
  google.protobuf.Timestamp create_time = 11
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The last time the Job was updated.
  google.protobuf.Timestamp update_time = 12
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Log preservation policy for the Job.
  LogsPolicy logs_policy = 13;

  // Notification configurations.
  repeated JobNotification notifications = 14;
}

// LogsPolicy describes how outputs from a Job's Tasks (stdout/stderr) will be
// preserved.
message LogsPolicy {
  // `CloudLoggingOption` contains additional settings for Cloud Logging logs
  // generated by a Batch job.
  message CloudLoggingOption {
    // Optional. Set this flag to true to change the [monitored resource
    // type](https://cloud.google.com/monitoring/api/resources) for
    // Cloud Logging logs generated by this Batch job from
    // the
    // [`batch.googleapis.com/Job`](https://cloud.google.com/monitoring/api/resources#tag_batch.googleapis.com/Job)
    // type to the formerly used
    // [`generic_task`](https://cloud.google.com/monitoring/api/resources#tag_generic_task)
    // type.
    bool use_generic_task_monitored_resource = 1
        [(google.api.field_behavior) = OPTIONAL];
  }

  // The destination (if any) for logs.
  enum Destination {
    // Logs are not preserved. As the zero value, this is also what an unset
    // `destination` field reads as.
    DESTINATION_UNSPECIFIED = 0;

    // Logs are streamed to Cloud Logging.
    CLOUD_LOGGING = 1;

    // Logs are saved to a file path.
    PATH = 2;
  }

  // Where logs should be saved.
  Destination destination = 1;

  // The path to which logs are saved when `destination` is set to `PATH`.
  // This can be a local file path on the VM, or under the mount point of a
  // Persistent Disk or Filestore, or a Cloud Storage path.
  string logs_path = 2;

  // Optional. Additional settings for Cloud Logging. It will only take effect
  // when the destination of `LogsPolicy` is set to `CLOUD_LOGGING`.
  CloudLoggingOption cloud_logging_option = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// Job status.
message JobStatus {
  // VM instance status.
  message InstanceStatus {
    // The Compute Engine machine type.
    string machine_type = 1;

    // The VM instance provisioning model.
    AllocationPolicy.ProvisioningModel provisioning_model = 2;

    // The max number of tasks that can be assigned to this instance type.
    int64 task_pack = 3;

    // The VM boot disk.
    AllocationPolicy.Disk boot_disk = 4;
  }

  // Aggregated task status for a TaskGroup.
  message TaskGroupStatus {
    // Count of tasks in each state in the TaskGroup.
    // The map key is the task state name.
    map<string, int64> counts = 1;

    // Status of instances allocated for the TaskGroup.
    repeated InstanceStatus instances = 2;
  }

  // Valid Job states.
  enum State {
    // Job state unspecified.
    STATE_UNSPECIFIED = 0;

    // Job is admitted (validated and persisted) and waiting for resources.
    QUEUED = 1;

    // Job is scheduled to run as soon as resource allocation is ready.
    // The resource allocation may happen at a later time but with a high
    // chance to succeed.
    SCHEDULED = 2;

    // Resource allocation has been successful. At least one Task in the Job is
    // RUNNING.
    RUNNING = 3;

    // All Tasks in the Job have finished successfully.
    SUCCEEDED = 4;

    // At least one Task in the Job has failed.
    FAILED = 5;

    // The Job will be deleted, but has not been deleted yet. Typically this is
    // because resources used by the Job are still being cleaned up.
    DELETION_IN_PROGRESS = 6;
  }

  // NOTE(review): field number 3 is not used by this message. If it belonged
  // to a removed field it should be declared `reserved`; TODO confirm against
  // the schema history.

  // Job state.
  State state = 1;

  // Job status events.
  repeated StatusEvent status_events = 2;

  // Aggregated task status for each TaskGroup in the Job.
  // The map key is the TaskGroup ID.
  map<string, TaskGroupStatus> task_groups = 4;

  // The duration of time that the Job spent in status RUNNING.
  google.protobuf.Duration run_duration = 5;
}

// Notification configurations.
//
// Pairs a Pub/Sub topic with the conditions under which messages are
// published to it.
message JobNotification {
  // Message details.
  // Describe the conditions under which messages will be sent.
  // If no attribute is defined, no message will be sent by default.
  // One message should specify either the job or the task level attributes,
  // but not both. For example,
  // job level: JOB_STATE_CHANGED and/or a specified new_job_state;
  // task level: TASK_STATE_CHANGED and/or a specified new_task_state.
  message Message {
    // The message type.
    Type type = 1;

    // The new job state.
    JobStatus.State new_job_state = 2;

    // The new task state.
    TaskStatus.State new_task_state = 3;
  }

  // The kinds of state change that a notification message can report.
  enum Type {
    // Unspecified.
    TYPE_UNSPECIFIED = 0;

    // Notify users that the job state has changed.
    JOB_STATE_CHANGED = 1;

    // Notify users that the task state has changed.
    TASK_STATE_CHANGED = 2;
  }

  // The Pub/Sub topic where notifications like the job state changes
  // will be published. The topic must exist in the same project as
  // the job, and billing is charged to this project.
  // If not specified, no Pub/Sub messages will be sent.
  // Topic format: `projects/{project}/topics/{topic}`.
  string pubsub_topic = 1;

  // The attribute requirements of messages to be sent to this Pub/Sub topic.
  // Without this field, no message will be sent.
  Message message = 2;
}

// A Job's resource allocation policy describes when, where, and how compute
// resources should be allocated for the Job.
message AllocationPolicy {
  // LocationPolicy lists the locations in which compute resources for the Job
  // are allowed to be allocated.
  message LocationPolicy {
    // A list of allowed location names represented by internal URLs.
    //
    // Each location can be a region or a zone.
    // Only one region or multiple zones in one region is supported now.
    // For example,
    // ["regions/us-central1"] allow VMs in any zones in region us-central1.
    // ["zones/us-central1-a", "zones/us-central1-c"] only allow VMs
    // in zones us-central1-a and us-central1-c.
    //
    // Listing locations that belong to different regions causes an error.
    // For example,
    // ["regions/us-central1", "zones/us-central1-a", "zones/us-central1-b",
    // "zones/us-west1-a"] contains 2 regions "us-central1" and
    // "us-west1". An error is expected in this case.
    repeated string allowed_locations = 1;
  }

  // A new persistent disk or a local SSD.
  // A VM can only have one local SSD setting but multiple local SSD partitions.
  // See https://cloud.google.com/compute/docs/disks#pdspecs and
  // https://cloud.google.com/compute/docs/disks#localssds.
  message Disk {
    // NOTE(review): field number 3 is not used by this message. If it belonged
    // to a removed field it should be declared `reserved`; TODO confirm.

    // A data source from which a PD will be created.
    oneof data_source {
      // URL for a VM image to use as the data source for this disk.
      // For example, the following are all valid URLs:
      //
      // * Specify the image by its family name:
      // projects/{project}/global/images/family/{image_family}
      // * Specify the image version:
      // projects/{project}/global/images/{image_version}
      //
      // You can also use Batch customized image in short names.
      // The following image values are supported for a boot disk:
      //
      // * `batch-debian`: use Batch Debian images.
      // * `batch-centos`: use Batch CentOS images.
      // * `batch-cos`: use Batch Container-Optimized images.
      // * `batch-hpc-centos`: use Batch HPC CentOS images.
      // * `batch-hpc-rocky`: use Batch HPC Rocky Linux images.
      string image = 4;

      // Name of a snapshot used as the data source.
      // Snapshot is not supported as boot disk now.
      string snapshot = 5;
    }

    // Disk type as shown in `gcloud compute disk-types list`.
    // For example, local SSD uses type "local-ssd".
    // Persistent disks and boot disks use "pd-balanced", "pd-extreme", "pd-ssd"
    // or "pd-standard".
    string type = 1;

    // Disk size in GB.
    //
    // **Non-Boot Disk**:
    // If the `type` specifies a persistent disk, this field
    // is ignored if `data_source` is set as `image` or `snapshot`.
    // If the `type` specifies a local SSD, this field should be a multiple of
    // 375 GB, otherwise, the final size will be the next greater multiple of
    // 375 GB.
    //
    // **Boot Disk**:
    // Batch will calculate the boot disk size based on source
    // image and task requirements if you do not specify the size.
    // If both this field and the `boot_disk_mib` field in task spec's
    // `compute_resource` are defined, Batch will only honor this field.
    // Also, this field should be no smaller than the source disk's
    // size when the `data_source` is set as `snapshot` or `image`.
    // For example, if you set an image as the `data_source` field and the
    // image's default disk size is 30 GB, you can only use this field to make
    // the disk larger or equal to 30 GB.
    int64 size_gb = 2;

    // Local SSDs are available through both "SCSI" and "NVMe" interfaces.
    // If not indicated, "NVMe" will be the default one for local SSDs.
    // This field is ignored for persistent disks as the interface is chosen
    // automatically. See
    // https://cloud.google.com/compute/docs/disks/persistent-disks#choose_an_interface.
    string disk_interface = 6;
  }

  // A new or an existing persistent disk (PD) or a local SSD attached to a VM
  // instance.
  message AttachedDisk {
    // The disk to attach: either a new disk or an existing PD.
    oneof attached {
      // A new disk, described by a `Disk` message.
      Disk new_disk = 1;

      // Name of an existing PD.
      string existing_disk = 2;
    }

    // Device name that the guest operating system will see.
    // It is used by Runnable.volumes field to mount disks. So please specify
    // the device_name if you want Batch to help mount the disk, and it should
    // match the device_name field in volumes.
    string device_name = 3;
  }

  // Accelerator describes Compute Engine accelerators to be attached to the VM.
  message Accelerator {
    // The accelerator type. For example, "nvidia-tesla-t4".
    // See `gcloud compute accelerator-types list`.
    string type = 1;

    // The number of accelerators of this type.
    int64 count = 2;

    // Deprecated: please use instances[0].install_gpu_drivers instead.
    bool install_gpu_drivers = 3 [deprecated = true];

    // Optional. The NVIDIA GPU driver version that should be installed for this
    // type.
    //
    // You can define the specific driver version such as "470.103.01",
    // following the driver version requirements in
    // https://cloud.google.com/compute/docs/gpus/install-drivers-gpu#minimum-driver.
    // Batch will install the specific accelerator driver if qualified.
    string driver_version = 4 [(google.api.field_behavior) = OPTIONAL];
  }

  // InstancePolicy describes an instance type and resources attached to each VM
  // created by this InstancePolicy.
  message InstancePolicy {
    // NOTE(review): field number 1 is not used by this message. If it belonged
    // to a removed field it should be declared `reserved`; TODO confirm.

    // The Compute Engine machine type.
    string machine_type = 2;

    // The minimum CPU platform.
    // See
    // https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform.
    string min_cpu_platform = 3;

    // The provisioning model.
    ProvisioningModel provisioning_model = 4;

    // The accelerators attached to each VM instance.
    repeated Accelerator accelerators = 5;

    // Boot disk to be created and attached to each VM by this InstancePolicy.
    // Boot disk will be deleted when the VM is deleted.
    // Batch API now only supports booting from image.
    Disk boot_disk = 8;

    // Non-boot disks to be attached for each VM created by this InstancePolicy.
    // New disks will be deleted when the VM is deleted.
    // A non-boot disk is a disk that can be of a device with a
    // file system or a raw storage drive that is not ready for data
    // storage and accessing.
    repeated AttachedDisk disks = 6;

    // Optional. If specified, VMs will consume only the specified reservation.
    // If not specified (default), VMs will consume any applicable reservation.
    string reservation = 7 [(google.api.field_behavior) = OPTIONAL];
  }

  // InstancePolicyOrTemplate lets you define the type of resources to use for
  // this job either with an InstancePolicy or an instance template.
  // If undefined, Batch picks the type of VM to use and doesn't include
  // optional VM resources such as GPUs and extra disks.
  message InstancePolicyOrTemplate {
    // How the VMs are described: either an inline InstancePolicy or the name
    // of an instance template.
    oneof policy_template {
      // InstancePolicy.
      InstancePolicy policy = 1;

      // Name of an instance template used to create VMs.
      // Named the field as 'instance_template' instead of 'template' to avoid
      // c++ keyword conflict.
      string instance_template = 2;
    }

    // Set this field true if users want Batch to help fetch drivers from a
    // third party location and install them for GPUs specified in
    // policy.accelerators or instance_template on their behalf. Default is
    // false.
    //
    // For Container-Optimized Image cases, Batch will install the
    // accelerator driver following milestones of
    // https://cloud.google.com/container-optimized-os/docs/release-notes. For
    // non Container-Optimized Image cases, following
    // https://github.com/GoogleCloudPlatform/compute-gpu-installation/blob/main/linux/install_gpu_driver.py.
    bool install_gpu_drivers = 3;
  }

  // A network interface.
  message NetworkInterface {
    // The URL of an existing network resource.
    // You can specify the network as a full or partial URL.
    //
    // For example, the following are all valid URLs:
    //
    // * https://www.googleapis.com/compute/v1/projects/{project}/global/networks/{network}
    // * projects/{project}/global/networks/{network}
    // * global/networks/{network}
    string network = 1;

    // The URL of an existing subnetwork resource in the network.
    // You can specify the subnetwork as a full or partial URL.
    //
    // For example, the following are all valid URLs:
    //
    // * https://www.googleapis.com/compute/v1/projects/{project}/regions/{region}/subnetworks/{subnetwork}
    // * projects/{project}/regions/{region}/subnetworks/{subnetwork}
    // * regions/{region}/subnetworks/{subnetwork}
    string subnetwork = 2;

    // Default is false (with an external IP address). Required if
    // no external public IP address is attached to the VM. If the VM has no
    // external public IP address, additional configuration is required to
    // allow the VM to access Google Services. See
    // https://cloud.google.com/vpc/docs/configure-private-google-access and
    // https://cloud.google.com/nat/docs/gce-example#create-nat for more
    // information.
    bool no_external_ip_address = 3;
  }

  // NetworkPolicy describes VM instance network configurations.
  message NetworkPolicy {
    // Network configurations.
    repeated NetworkInterface network_interfaces = 1;
  }

  // PlacementPolicy describes a group placement policy for the VMs controlled
  // by this AllocationPolicy.
  message PlacementPolicy {
    // UNSPECIFIED vs. COLLOCATED (default UNSPECIFIED). Use COLLOCATED when you
    // want VMs to be located close to each other for low network latency
    // between the VMs. No placement policy will be generated when collocation
    // is UNSPECIFIED.
    string collocation = 1;

    // When specified, causes the job to fail if more than max_distance logical
    // switches are required between VMs. Batch uses the most compact possible
    // placement of VMs even when max_distance is not specified. An explicit
    // max_distance makes that level of compactness a strict requirement.
    // Not yet implemented.
    int64 max_distance = 2;
  }

  // Compute Engine VM instance provisioning model.
  enum ProvisioningModel {
    // Unspecified.
    PROVISIONING_MODEL_UNSPECIFIED = 0;

    // Standard VM.
    STANDARD = 1;

    // SPOT VM.
    SPOT = 2;

    // Preemptible VM (PVM).
    //
    // The SPOT model above is the preferred model for preemptible VM
    // instances: the old preemptible VM model (indicated by this value) has
    // been migrated to use the SPOT model as the underlying technology.
    // This old model will still be supported.
    PREEMPTIBLE = 3;
  }

  // NOTE(review): field numbers 2 through 5 are not used at the top level of
  // this message. If they belonged to removed fields they should be declared
  // `reserved`; TODO confirm against the schema history.

  // Location where compute resources should be allocated for the Job.
  LocationPolicy location = 1;

  // Describe instances that can be created by this AllocationPolicy.
  // Only instances[0] is supported now.
  repeated InstancePolicyOrTemplate instances = 8;

  // Defines the service account for Batch-created VMs. If omitted, the [default
  // Compute Engine service
  // account](https://cloud.google.com/compute/docs/access/service-accounts#default_service_account)
  // is used. Must match the service account specified in any used instance
  // template configured in the Batch job.
  //
  // Includes the following fields:
  //  * email: The service account's email address. If not set, the default
  //  Compute Engine service account is used.
  //  * scopes: Additional OAuth scopes to grant the service account, beyond the
  //  default cloud-platform scope. (list of strings)
  ServiceAccount service_account = 9;

  // Labels applied to all VM instances and other resources
  // created by AllocationPolicy.
  // Labels could be user provided or system generated.
  // You can assign up to 64 labels. [Google Compute Engine label
  // restrictions](https://cloud.google.com/compute/docs/labeling-resources#restrictions)
  // apply.
  // Label names that start with "goog-" or "google-" are reserved.
  map<string, string> labels = 6;

  // The network policy.
  //
  // If you define an instance template in the `InstancePolicyOrTemplate` field,
  // Batch will use the network settings in the instance template instead of
  // this field.
  NetworkPolicy network = 7;

  // The placement policy.
  PlacementPolicy placement = 10;

  // Optional. Tags applied to the VM instances.
  //
  // The tags identify valid sources or targets for network firewalls.
  // Each tag must be 1-63 characters long, and comply with
  // [RFC1035](https://www.ietf.org/rfc/rfc1035.txt).
  repeated string tags = 11 [(google.api.field_behavior) = OPTIONAL];
}

// A TaskGroup defines one or more Tasks that all share the same TaskSpec.
//
// A TaskGroup is exposed as the `batch.googleapis.com/TaskGroup` resource,
// nested under its parent Job in the resource name.
message TaskGroup {
  option (google.api.resource) = {
    type: "batch.googleapis.com/TaskGroup"
    pattern: "projects/{project}/locations/{location}/jobs/{job}/taskGroups/{task_group}"
  };

  // How Tasks in the TaskGroup should be scheduled relative to each other.
  enum SchedulingPolicy {
    // Unspecified.
    SCHEDULING_POLICY_UNSPECIFIED = 0;

    // Run Tasks as soon as resources are available.
    //
    // Tasks might be executed in parallel depending on parallelism and
    // task_count values.
    AS_SOON_AS_POSSIBLE = 1;

    // Run Tasks sequentially with increased task index.
    IN_ORDER = 2;
  }

  // NOTE(review): field numbers 2, 7, 8 and 13 are not used by this message.
  // If they belonged to removed fields they should be declared `reserved`;
  // TODO confirm against the schema history.

  // Output only. TaskGroup name.
  // The system generates this field based on parent Job name.
  // For example:
  // "projects/123456/locations/us-west1/jobs/job01/taskGroups/group01".
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. Tasks in the group share the same task spec.
  TaskSpec task_spec = 3 [(google.api.field_behavior) = REQUIRED];

  // Number of Tasks in the TaskGroup.
  // Default is 1.
  int64 task_count = 4;

  // Max number of tasks that can run in parallel.
  // Defaults to min(task_count, parallel tasks per job limit).
  // See: [Job Limits](https://cloud.google.com/batch/quotas#job_limits).
  // Field parallelism must be 1 if the scheduling_policy is IN_ORDER.
  int64 parallelism = 5;

  // Scheduling policy for Tasks in the TaskGroup.
  // The default value is AS_SOON_AS_POSSIBLE.
  SchedulingPolicy scheduling_policy = 6;

  // An array of environment variable mappings, which are passed to Tasks with
  // matching indices. If task_environments is used then task_count should
  // not be specified in the request (and will be ignored). Task count will be
  // the length of task_environments.
  //
  // In addition to any environment variables set in task_environments, each
  // Task gets two environment variables: BATCH_TASK_INDEX, the specific Task's
  // index in the TaskGroup (0 through BATCH_TASK_COUNT - 1), and
  // BATCH_TASK_COUNT, the number of Tasks in the Task's parent TaskGroup.
  repeated Environment task_environments = 9;

  // Max number of tasks that can be run on a VM at the same time.
  // If not specified, the system will decide a value based on available
  // compute resources on a VM and task requirements.
  int64 task_count_per_node = 10;

  // When true, Batch will populate a file with a list of all VMs assigned to
  // the TaskGroup and set the BATCH_HOSTS_FILE environment variable to the path
  // of that file. Defaults to false. The host file supports up to 1000 VMs.
  bool require_hosts_file = 11;

  // When true, Batch will configure SSH to allow passwordless login between
  // VMs running the Batch tasks in the same TaskGroup.
  bool permissive_ssh = 12;

  // Optional. If not set or set to false, Batch uses the root user to execute
  // runnables. If set to true, Batch runs the runnables using a non-root user.
  // Currently, the non-root user Batch uses is generated by OS Login. For more
  // information, see [About OS
  // Login](https://cloud.google.com/compute/docs/oslogin).
  bool run_as_non_root = 14 [(google.api.field_behavior) = OPTIONAL];
}

// Carries information about a Google Cloud service account.
message ServiceAccount {
  // Email address of the service account. If not set, the default Compute
  // Engine service account is used.
  string email = 1;

  // List of scopes to be enabled for this service account. These are
  // additional OAuth scopes granted beyond the default cloud-platform scope.
  repeated string scopes = 2;
}