// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1/encryption_spec.proto";
import "google/cloud/aiplatform/v1/env_var.proto";
import "google/cloud/aiplatform/v1/io.proto";
import "google/cloud/aiplatform/v1/job_state.proto";
import "google/cloud/aiplatform/v1/machine_resources.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1";
option go_package = "cloud.google.com/go/aiplatform/apiv1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "CustomJobProto";
option java_package = "com.google.cloud.aiplatform.v1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1";
option ruby_package = "Google::Cloud::AIPlatform::V1";

// Represents a job that runs custom workloads such as a Docker container or a
// Python package. A CustomJob can have multiple worker pools and each worker
// pool can have its own machine and input spec. A CustomJob will be cleaned up
// once the job enters terminal state (failed or succeeded).
message CustomJob {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/CustomJob"
    pattern: "projects/{project}/locations/{location}/customJobs/{custom_job}"
  };

  // Output only. Resource name of a CustomJob.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The display name of the CustomJob.
  // The name can be up to 128 characters long and can consist of any UTF-8
  // characters.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. Job spec.
  CustomJobSpec job_spec = 4 [(google.api.field_behavior) = REQUIRED];

  // Output only. The detailed state of the job.
  JobState state = 5 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the CustomJob was created.
  google.protobuf.Timestamp create_time = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the CustomJob for the first time entered the
  // `JOB_STATE_RUNNING` state.
  google.protobuf.Timestamp start_time = 7
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the CustomJob entered any of the following states:
  // `JOB_STATE_SUCCEEDED`, `JOB_STATE_FAILED`, `JOB_STATE_CANCELLED`.
  google.protobuf.Timestamp end_time = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the CustomJob was most recently updated.
  google.protobuf.Timestamp update_time = 9
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Only populated when job's state is `JOB_STATE_FAILED` or
  // `JOB_STATE_CANCELLED`.
  google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // The labels with user-defined metadata to organize CustomJobs.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  map<string, string> labels = 11;

  // Customer-managed encryption key options for a CustomJob. If this is set,
  // then all resources created by the CustomJob will be encrypted with the
  // provided encryption key.
  EncryptionSpec encryption_spec = 12;

  // Output only. URIs for accessing [interactive
  // shells](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
  // (one URI for each training node). Only available if
  // [job_spec.enable_web_access][google.cloud.aiplatform.v1.CustomJobSpec.enable_web_access]
  // is `true`.
  //
  // The keys are names of each node in the training job; for example,
  // `workerpool0-0` for the primary node, `workerpool1-0` for the first node in
  // the second worker pool, and `workerpool1-1` for the second node in the
  // second worker pool.
  //
  // The values are the URIs for each node's interactive shell.
  map<string, string> web_access_uris = 16
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Represents the spec of a CustomJob.
message CustomJobSpec {
  // Optional. The ID of the PersistentResource in the same Project and Location
  // on which to run the job.
  //
  // If this is specified, the job will be run on existing machines held by the
  // PersistentResource instead of on-demand short-lived machines.
  // The network and CMEK configs on the job should be consistent with those on
  // the PersistentResource, otherwise, the job will be rejected.
  string persistent_resource_id = 14 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/PersistentResource"
    }
  ];

  // Required. The spec of the worker pools including machine type and Docker
  // image. All worker pools except the first one are optional and can be
  // skipped by providing an empty value.
  repeated WorkerPoolSpec worker_pool_specs = 1
      [(google.api.field_behavior) = REQUIRED];

  // Scheduling options for a CustomJob.
  Scheduling scheduling = 3;

  // Specifies the service account for workload run-as account.
  // Users submitting jobs must have act-as permission on this run-as account.
  // If unspecified, the [Vertex AI Custom Code Service
  // Agent](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents)
  // for the CustomJob's project is used.
  string service_account = 4;

  // Optional. The full name of the Compute Engine
  // [network](/compute/docs/networks-and-firewalls#networks) to which the Job
  // should be peered. For example, `projects/12345/global/networks/myVPC`.
  // [Format](/compute/docs/reference/rest/v1/networks/insert)
  // is of the form `projects/{project}/global/networks/{network}`.
  // Where {project} is a project number, as in `12345`, and {network} is a
  // network name.
  //
  // To specify this field, you must have already [configured VPC Network
  // Peering for Vertex
  // AI](https://cloud.google.com/vertex-ai/docs/general/vpc-peering).
  //
  // If this field is left unspecified, the job is not peered with any network.
  string network = 5 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = { type: "compute.googleapis.com/Network" }
  ];

  // Optional. A list of names for the reserved ip ranges under the VPC network
  // that can be used for this job.
  //
  // If set, we will deploy the job within the provided ip ranges. Otherwise,
  // the job will be deployed to any ip ranges under the provided VPC
  // network.
  //
  // Example: ['vertex-ai-ip-range'].
  repeated string reserved_ip_ranges = 13
      [(google.api.field_behavior) = OPTIONAL];

  // The Cloud Storage location to store the output of this CustomJob or
  // HyperparameterTuningJob. For HyperparameterTuningJob,
  // the baseOutputDirectory of
  // each child CustomJob backing a Trial is set to a subdirectory of name
  // [id][google.cloud.aiplatform.v1.Trial.id] under its parent
  // HyperparameterTuningJob's baseOutputDirectory.
  //
  // The following Vertex AI environment variables will be passed to
  // containers or python modules when this field is set:
  //
  //   For CustomJob:
  //
  //   * AIP_MODEL_DIR = `<base_output_directory>/model/`
  //   * AIP_CHECKPOINT_DIR = `<base_output_directory>/checkpoints/`
  //   * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/logs/`
  //
  //   For CustomJob backing a Trial of HyperparameterTuningJob:
  //
  //   * AIP_MODEL_DIR = `<base_output_directory>/<trial_id>/model/`
  //   * AIP_CHECKPOINT_DIR = `<base_output_directory>/<trial_id>/checkpoints/`
  //   * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/<trial_id>/logs/`
  GcsDestination base_output_directory = 6;

  // The ID of the location to store protected artifacts. e.g. us-central1.
  // Populate only when the location is different than CustomJob location.
  // List of supported locations:
  // https://cloud.google.com/vertex-ai/docs/general/locations
  string protected_artifact_location_id = 19;

  // Optional. The name of a Vertex AI
  // [Tensorboard][google.cloud.aiplatform.v1.Tensorboard] resource to which
  // this CustomJob will upload Tensorboard logs. Format:
  // `projects/{project}/locations/{location}/tensorboards/{tensorboard}`
  string tensorboard = 7 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Tensorboard"
    }
  ];

  // Optional. Whether you want Vertex AI to enable [interactive shell
  // access](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
  // to training containers.
  //
  // If set to `true`, you can access interactive shells at the URIs given
  // by
  // [CustomJob.web_access_uris][google.cloud.aiplatform.v1.CustomJob.web_access_uris]
  // or
  // [Trial.web_access_uris][google.cloud.aiplatform.v1.Trial.web_access_uris]
  // (within
  // [HyperparameterTuningJob.trials][google.cloud.aiplatform.v1.HyperparameterTuningJob.trials]).
  bool enable_web_access = 10 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Whether you want Vertex AI to enable access to the customized
  // dashboard in training chief container.
  //
  // If set to `true`, you can access the dashboard at the URIs given
  // by
  // [CustomJob.web_access_uris][google.cloud.aiplatform.v1.CustomJob.web_access_uris]
  // or
  // [Trial.web_access_uris][google.cloud.aiplatform.v1.Trial.web_access_uris]
  // (within
  // [HyperparameterTuningJob.trials][google.cloud.aiplatform.v1.HyperparameterTuningJob.trials]).
  bool enable_dashboard_access = 16 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The Experiment associated with this job.
  // Format:
  // `projects/{project}/locations/{location}/metadataStores/{metadataStores}/contexts/{experiment-name}`
  string experiment = 17 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Context"
    }
  ];

  // Optional. The Experiment Run associated with this job.
  // Format:
  // `projects/{project}/locations/{location}/metadataStores/{metadataStores}/contexts/{experiment-name}-{experiment-run-name}`
  string experiment_run = 18 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Context"
    }
  ];

  // Optional. The names of the Model resources for which to generate a mapping
  // to artifact URIs. Applicable only to some of the Google-provided custom
  // jobs. Format: `projects/{project}/locations/{location}/models/{model}`
  //
  // In order to retrieve a specific version of the model, also provide
  // the version ID or version alias.
  //   Example: `projects/{project}/locations/{location}/models/{model}@2`
  //              or
  //            `projects/{project}/locations/{location}/models/{model}@golden`
  // If no version ID or alias is specified, the "default" version will be
  // returned. The "default" version alias is created for the first version of
  // the model, and can be moved to other versions later on. There will be
  // exactly one default version.
  repeated string models = 20 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Model"
    }
  ];
}

// Represents the spec of a worker pool in a job.
message WorkerPoolSpec {
  // The custom task to be executed in this worker pool.
  oneof task {
    // The custom container task.
    ContainerSpec container_spec = 6;

    // The Python packaged task.
    PythonPackageSpec python_package_spec = 7;
  }

  // Optional. Immutable. The specification of a single machine.
  MachineSpec machine_spec = 1 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Optional. The number of worker replicas to use for this worker pool.
  int64 replica_count = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. List of NFS mount spec.
  repeated NfsMount nfs_mounts = 4 [(google.api.field_behavior) = OPTIONAL];

  // Disk spec.
  DiskSpec disk_spec = 5;
}

// The spec of a Container.
message ContainerSpec {
  // Required. The URI of a container image in the Container Registry that is to
  // be run on each worker replica.
  string image_uri = 1 [(google.api.field_behavior) = REQUIRED];

  // The command to be invoked when the container is started.
  // It overrides the entrypoint instruction in Dockerfile when provided.
  repeated string command = 2;

  // The arguments to be passed when starting the container.
  repeated string args = 3;

  // Environment variables to be passed to the container.
  // Maximum limit is 100.
  repeated EnvVar env = 4;
}

// The spec of a Python packaged code.
message PythonPackageSpec {
  // Required. The URI of a container image in Artifact Registry that will run
  // the provided Python package. Vertex AI provides a wide range of executor
  // images with pre-installed packages to meet users' various use cases. See
  // the list of [pre-built containers for
  // training](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers).
  // You must use an image from this list.
  string executor_image_uri = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The Google Cloud Storage location of the Python package files
  // which are the training program and its dependent packages. The maximum
  // number of package URIs is 100.
  repeated string package_uris = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. The Python module name to run after installing the packages.
  string python_module = 3 [(google.api.field_behavior) = REQUIRED];

  // Command line arguments to be passed to the Python task.
  repeated string args = 4;

  // Environment variables to be passed to the python module.
  // Maximum limit is 100.
  repeated EnvVar env = 5;
}

// All parameters related to queuing and scheduling of custom jobs.
message Scheduling {
  // The maximum job running time. The default is 7 days.
  google.protobuf.Duration timeout = 1;

  // Restarts the entire CustomJob if a worker gets restarted.
  // This feature can be used by distributed training jobs that are not
  // resilient to workers leaving and joining a job.
  bool restart_job_on_worker_restart = 3;

  // Optional. Indicates if the job should retry for internal errors after the
  // job starts running. If true, overrides
  // `Scheduling.restart_job_on_worker_restart` to false.
  bool disable_retries = 5 [(google.api.field_behavior) = OPTIONAL];
}