// aiplatform/v1beta1/persistent_resource.proto

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Schema for the Vertex AI (v1beta1) PersistentResource resource and its
// supporting spec/runtime messages.
syntax = "proto3";

package google.cloud.aiplatform.v1beta1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1beta1/encryption_spec.proto";
import "google/cloud/aiplatform/v1beta1/machine_resources.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

// Per-language code generation settings (namespaces, packages, and the Java
// outer class name) for the types declared in this file.
option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1";
option go_package = "cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "PersistentResourceProto";
option java_package = "com.google.cloud.aiplatform.v1beta1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1";
option ruby_package = "Google::Cloud::AIPlatform::V1beta1";

// Represents long-lasting resources that are dedicated to users to run custom
// workloads.
// A PersistentResource can have multiple node pools and each node
// pool can have its own machine spec.
message PersistentResource {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/PersistentResource"
    pattern: "projects/{project}/locations/{location}/persistentResources/{persistent_resource}"
  };

  // Describes the PersistentResource state.
  enum State {
    // Not set.
    STATE_UNSPECIFIED = 0;

    // The PROVISIONING state indicates the persistent resource is being
    // created.
    PROVISIONING = 1;

    // The RUNNING state indicates the persistent resource is healthy and fully
    // usable.
    RUNNING = 3;

    // The STOPPING state indicates the persistent resource is being deleted.
    STOPPING = 4;

    // The ERROR state indicates the persistent resource may be unusable.
    // Details can be found in the `error` field.
    ERROR = 5;

    // The REBOOTING state indicates the persistent resource is being rebooted
    // (the persistent resource is not available right now but is expected to
    // be ready again later).
    REBOOTING = 6;

    // The UPDATING state indicates the persistent resource is being updated.
    UPDATING = 7;
  }

  // Immutable. Resource name of a PersistentResource.
  // Format:
  // `projects/{project}/locations/{location}/persistentResources/{persistent_resource}`
  string name = 1 [(google.api.field_behavior) = IMMUTABLE];

  // Optional. The display name of the PersistentResource.
  // The name can be up to 128 characters long and can consist of any UTF-8
  // characters.
  string display_name = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. The spec of the pools of different resources.
  repeated ResourcePool resource_pools = 4
      [(google.api.field_behavior) = REQUIRED];

  // Output only. The detailed state of the PersistentResource.
  State state = 5 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Only populated when the persistent resource's state is
  // `STOPPING` or `ERROR`.
  google.rpc.Status error = 6 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the PersistentResource was created.
  google.protobuf.Timestamp create_time = 7
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the PersistentResource entered the `RUNNING` state
  // for the first time.
  google.protobuf.Timestamp start_time = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the PersistentResource was most recently updated.
  google.protobuf.Timestamp update_time = 9
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Optional. The labels with user-defined metadata to organize
  // PersistentResource.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  map<string, string> labels = 10 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The full name of the Compute Engine
  // [network](/compute/docs/networks-and-firewalls#networks) to be peered with
  // Vertex AI to host the persistent resources.
  // For example, `projects/12345/global/networks/myVPC`.
  // [Format](/compute/docs/reference/rest/v1/networks/insert)
  // is of the form `projects/{project}/global/networks/{network}`.
  // Where {project} is a project number, as in `12345`, and {network} is a
  // network name.
  //
  // To specify this field, you must have already [configured VPC Network
  // Peering for Vertex
  // AI](https://cloud.google.com/vertex-ai/docs/general/vpc-peering).
  //
  // If this field is left unspecified, the resources aren't peered with any
  // network.
  string network = 11 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = { type: "compute.googleapis.com/Network" }
  ];

  // Optional. Customer-managed encryption key spec for a PersistentResource.
  // If set, this PersistentResource and all sub-resources of this
  // PersistentResource will be secured by this key.
  EncryptionSpec encryption_spec = 12 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Persistent Resource runtime spec.
  // For example, used for Ray cluster configuration.
  ResourceRuntimeSpec resource_runtime_spec = 13
      [(google.api.field_behavior) = OPTIONAL];

  // Output only. Runtime information of the Persistent Resource.
  ResourceRuntime resource_runtime = 14
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Optional. A list of names for the reserved IP ranges under the VPC network
  // that can be used for this persistent resource.
  //
  // If set, we will deploy the persistent resource within the provided IP
  // ranges. Otherwise, the persistent resource is deployed to any IP
  // ranges under the provided VPC network.
  //
  // Example: ['vertex-ai-ip-range'].
  repeated string reserved_ip_ranges = 15
      [(google.api.field_behavior) = OPTIONAL];
}

// Represents the spec of a group of resources of the same type,
// for example machine type, disk, and accelerators, in a PersistentResource.
message ResourcePool {
  // The min/max number of replicas allowed if enabling autoscaling.
  message AutoscalingSpec {
    // Optional. Minimum number of replicas in the node pool.
    // Must be ≤ replica_count and < max_replica_count; otherwise an error is
    // returned.
    optional int64 min_replica_count = 1
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Maximum number of replicas in the node pool.
    // Must be ≥ replica_count and > min_replica_count; otherwise an error is
    // returned.
    optional int64 max_replica_count = 2
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Immutable. The unique ID in a PersistentResource for referring to this
  // resource pool. User can specify it if necessary. Otherwise, it's generated
  // automatically.
  string id = 1 [(google.api.field_behavior) = IMMUTABLE];

  // Required. Immutable. The specification of a single machine.
  MachineSpec machine_spec = 2 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Optional. The total number of machines to use for this resource pool.
  optional int64 replica_count = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Disk spec for the machine in this node pool.
  DiskSpec disk_spec = 4 [(google.api.field_behavior) = OPTIONAL];

  // Output only. The number of machines currently in use by training jobs for
  // this resource pool. Will replace idle_replica_count.
  int64 used_replica_count = 6 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Optional. Spec to configure GKE autoscaling.
  AutoscalingSpec autoscaling_spec = 7 [(google.api.field_behavior) = OPTIONAL];
}

// Configuration for the runtime on a PersistentResource instance, including
// but not limited to:
//
// * Service accounts used to run the workloads.
// * Whether to make it a dedicated Ray Cluster.
message ResourceRuntimeSpec {
  // Optional. Configure the use of workload identity on the
  // PersistentResource.
  ServiceAccountSpec service_account_spec = 2
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Ray cluster configuration.
  // Must be set when creating a dedicated RayCluster on the
  // PersistentResource.
  RaySpec ray_spec = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Configuration information for the Ray cluster.
// For experimental launch, Ray cluster creation and Persistent
// cluster creation are 1:1 mapping: We will provision all the nodes within the
// Persistent cluster as Ray nodes.
message RaySpec {
  // Optional. Default image for user to choose a preferred ML framework
  // (for example, TensorFlow or PyTorch) by choosing from [Vertex prebuilt
  // images](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers).
  // Either this or `resource_pool_images` is required. Use this field if
  // you need all the resource pools to have the same Ray image. Otherwise, use
  // the `resource_pool_images` field.
  string image_uri = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Required if `image_uri` isn't set. A map of resource_pool_id to
  // prebuilt Ray image, for when the user needs to use different images for
  // different head/worker pools. This map needs to cover all the resource pool
  // ids.
  // Example:
  // {
  //   "ray_head_node_pool": "head image",
  //   "ray_worker_node_pool1": "worker image",
  //   "ray_worker_node_pool2": "another worker image"
  // }
  map<string, string> resource_pool_images = 6
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. This will be used to indicate which resource pool will serve as
  // the Ray head node (the first node within that pool). Will use the machine
  // from the first worker pool as the head node by default if this field isn't
  // set.
  string head_node_resource_pool_id = 7
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Ray metrics configurations.
  RayMetricSpec ray_metric_spec = 8 [(google.api.field_behavior) = OPTIONAL];
}

// Persistent Cluster runtime information as output.
message ResourceRuntime {
  // Output only. URIs for user to connect to the Cluster.
  // Example:
  // {
  //   "RAY_HEAD_NODE_INTERNAL_IP": "head-node-IP:10001",
  //   "RAY_DASHBOARD_URI": "ray-dashboard-address:8888"
  // }
  map<string, string> access_uris = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The resource name of NotebookRuntimeTemplate for the RoV
  // Persistent Cluster. The NotebookRuntimeTemplate is created in the same VPC
  // (if set), and with the same Ray and Python version as the Persistent
  // Cluster. Example:
  //   "projects/1000/locations/us-central1/notebookRuntimeTemplates/abc123"
  string notebook_runtime_template = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/NotebookRuntimeTemplate"
    }
  ];
}

// Configuration for the use of custom service account to run the workloads.
message ServiceAccountSpec {
  // Required. If true, custom user-managed service account is enforced to run
  // any workloads (for example, Vertex Jobs) on the resource. Otherwise, uses
  // the [Vertex AI Custom Code Service
  // Agent](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents).
  bool enable_custom_service_account = 1
      [(google.api.field_behavior) = REQUIRED];

  // Optional. Required when all of the conditions below are met:
  //  * `enable_custom_service_account` is true;
  //  * any runtime is specified via `ResourceRuntimeSpec` at creation time,
  //    for example, Ray.
  //
  // The users must have `iam.serviceAccounts.actAs` permission on this service
  // account and then the specified runtime containers will run as it.
  //
  // Do not set this field if you want to submit jobs using custom service
  // account to this PersistentResource after creation, but only specify the
  // `service_account` inside the job.
  string service_account = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Configuration for the Ray metrics.
message RayMetricSpec {
  // Optional. Flag to disable Ray metrics collection.
  // As a proto3 bool without explicit presence, an unset value reads as
  // `false`, i.e. the flag does not request disabling.
  bool disabled = 1 [(google.api.field_behavior) = OPTIONAL];
}