xref: /aosp_15_r20/external/googleapis/google/cloud/aiplatform/v1/custom_job.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
1// Copyright 2023 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15syntax = "proto3";
16
17package google.cloud.aiplatform.v1;
18
19import "google/api/field_behavior.proto";
20import "google/api/resource.proto";
21import "google/cloud/aiplatform/v1/encryption_spec.proto";
22import "google/cloud/aiplatform/v1/env_var.proto";
23import "google/cloud/aiplatform/v1/io.proto";
24import "google/cloud/aiplatform/v1/job_state.proto";
25import "google/cloud/aiplatform/v1/machine_resources.proto";
26import "google/protobuf/duration.proto";
27import "google/protobuf/timestamp.proto";
28import "google/rpc/status.proto";
29
// Code generation settings, one group per target language.

// C#: namespace of the generated classes.
option csharp_namespace = "Google.Cloud.AIPlatform.V1";

// Go: import path and package name, separated by `;`.
option go_package = "cloud.google.com/go/aiplatform/apiv1/aiplatformpb;aiplatformpb";

// Java: one file per top-level type, outer class name, and package.
option java_multiple_files = true;
option java_outer_classname = "CustomJobProto";
option java_package = "com.google.cloud.aiplatform.v1";

// PHP: namespace of the generated classes.
option php_namespace = "Google\\Cloud\\AIPlatform\\V1";

// Ruby: module nesting of the generated classes.
option ruby_package = "Google::Cloud::AIPlatform::V1";
37
// A job that executes a custom workload, for example a Docker container or a
// Python package. A CustomJob may contain several worker pools, and every
// worker pool may define its own machine and input spec. Once the job reaches
// a terminal state (failed or succeeded), the CustomJob is cleaned up.
message CustomJob {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/CustomJob"
    pattern: "projects/{project}/locations/{location}/customJobs/{custom_job}"
  };

  // Output only. The resource name of the CustomJob.
  string name = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Required. Display name of the CustomJob.
  // It may be at most 128 characters long and may be made up of any UTF-8
  // characters.
  string display_name = 2 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The job spec.
  CustomJobSpec job_spec = 4 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Output only. Detailed state of the job.
  JobState state = 5 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The time at which the CustomJob was created.
  google.protobuf.Timestamp create_time = 6 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The time at which the CustomJob first entered the
  // `JOB_STATE_RUNNING` state.
  google.protobuf.Timestamp start_time = 7 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The time at which the CustomJob entered one of these states:
  // `JOB_STATE_SUCCEEDED`, `JOB_STATE_FAILED`, `JOB_STATE_CANCELLED`.
  google.protobuf.Timestamp end_time = 8 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The time at which the CustomJob was last updated.
  google.protobuf.Timestamp update_time = 9 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. Populated only when the job's state is `JOB_STATE_FAILED` or
  // `JOB_STATE_CANCELLED`.
  google.rpc.Status error = 10 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // User-defined labels used as metadata to organize CustomJobs.
  //
  // Each label key and value is limited to 64 characters (Unicode
  // codepoints) and may contain only lowercase letters, numeric characters,
  // underscores and dashes. International characters are allowed.
  //
  // More information and label examples: https://goo.gl/xmQnxf
  map<string, string> labels = 11;

  // Customer-managed encryption key options for the CustomJob. When set, every
  // resource created by the CustomJob is encrypted with the provided
  // encryption key.
  EncryptionSpec encryption_spec = 12;

  // Output only. URIs for accessing [interactive
  // shells](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell),
  // one URI per training node. Available only when
  // [job_spec.enable_web_access][google.cloud.aiplatform.v1.CustomJobSpec.enable_web_access]
  // is `true`.
  //
  // Keys are the names of the nodes in the training job; for example,
  // `workerpool0-0` is the primary node, `workerpool1-0` is the first node in
  // the second worker pool, and `workerpool1-1` is the second node in the
  // second worker pool.
  //
  // Values are the URIs of each node's interactive shell.
  map<string, string> web_access_uris = 16 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
113
// The spec of a CustomJob.
message CustomJobSpec {
  // Optional. The ID of the PersistentResource, in the same Project and
  // Location, on which to run the job.
  //
  // When specified, the job runs on existing machines held by the
  // PersistentResource rather than on on-demand short-lived machines.
  // The job's network and CMEK configs should be consistent with those of the
  // PersistentResource; otherwise the job will be rejected.
  string persistent_resource_id = 14 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/PersistentResource"
    }
  ];

  // Required. Specs of the worker pools, including machine type and Docker
  // image. Every worker pool other than the first is optional and may be
  // skipped by supplying an empty value.
  repeated WorkerPoolSpec worker_pool_specs = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Scheduling options of the CustomJob.
  Scheduling scheduling = 3;

  // The service account used as the workload's run-as account.
  // Users who submit jobs must have act-as permission on this run-as account.
  // When unspecified, the [Vertex AI Custom Code Service
  // Agent](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents)
  // of the CustomJob's project is used.
  string service_account = 4;

  // Optional. Full name of the Compute Engine
  // [network](/compute/docs/networks-and-firewalls#networks) with which the
  // Job should be peered, for example `projects/12345/global/networks/myVPC`.
  // The [format](/compute/docs/reference/rest/v1/networks/insert)
  // is `projects/{project}/global/networks/{network}`,
  // where {project} is a project number, as in `12345`, and {network} is a
  // network name.
  //
  // Before specifying this field, you must already have [configured VPC
  // Network Peering for Vertex
  // AI](https://cloud.google.com/vertex-ai/docs/general/vpc-peering).
  //
  // When this field is left unspecified, the job is not peered with any
  // network.
  string network = 5 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "compute.googleapis.com/Network"
    }
  ];

  // Optional. Names of the reserved ip ranges under the VPC network that may
  // be used for this job.
  //
  // When set, the job is deployed within the provided ip ranges. Otherwise,
  // the job is deployed to any ip ranges under the provided VPC network.
  //
  // Example: ['vertex-ai-ip-range'].
  repeated string reserved_ip_ranges = 13 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Cloud Storage location in which the output of this CustomJob or
  // HyperparameterTuningJob is stored. For a HyperparameterTuningJob, the
  // baseOutputDirectory of each child CustomJob backing a Trial is set to a
  // subdirectory named after the Trial's
  // [id][google.cloud.aiplatform.v1.Trial.id] under the parent
  // HyperparameterTuningJob's baseOutputDirectory.
  //
  // When this field is set, the following Vertex AI environment variables are
  // passed to containers or python modules:
  //
  //   For CustomJob:
  //
  //   * AIP_MODEL_DIR = `<base_output_directory>/model/`
  //   * AIP_CHECKPOINT_DIR = `<base_output_directory>/checkpoints/`
  //   * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/logs/`
  //
  //   For CustomJob backing a Trial of HyperparameterTuningJob:
  //
  //   * AIP_MODEL_DIR = `<base_output_directory>/<trial_id>/model/`
  //   * AIP_CHECKPOINT_DIR = `<base_output_directory>/<trial_id>/checkpoints/`
  //   * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/<trial_id>/logs/`
  GcsDestination base_output_directory = 6;

  // ID of the location in which protected artifacts are stored, e.g.
  // us-central1. Populate it only when that location differs from the
  // CustomJob location.
  // Supported locations are listed at:
  // https://cloud.google.com/vertex-ai/docs/general/locations
  string protected_artifact_location_id = 19;

  // Optional. Name of the Vertex AI
  // [Tensorboard][google.cloud.aiplatform.v1.Tensorboard] resource to which
  // this CustomJob uploads Tensorboard logs. Format:
  // `projects/{project}/locations/{location}/tensorboards/{tensorboard}`
  string tensorboard = 7 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Tensorboard"
    }
  ];

  // Optional. Whether Vertex AI should enable [interactive shell
  // access](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
  // to training containers.
  //
  // When `true`, interactive shells are reachable at the URIs given by
  // [CustomJob.web_access_uris][google.cloud.aiplatform.v1.CustomJob.web_access_uris]
  // or
  // [Trial.web_access_uris][google.cloud.aiplatform.v1.Trial.web_access_uris]
  // (within
  // [HyperparameterTuningJob.trials][google.cloud.aiplatform.v1.HyperparameterTuningJob.trials]).
  bool enable_web_access = 10 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Whether Vertex AI should enable access to the customized
  // dashboard in the training chief container.
  //
  // When `true`, the dashboard is reachable at the URIs given by
  // [CustomJob.web_access_uris][google.cloud.aiplatform.v1.CustomJob.web_access_uris]
  // or
  // [Trial.web_access_uris][google.cloud.aiplatform.v1.Trial.web_access_uris]
  // (within
  // [HyperparameterTuningJob.trials][google.cloud.aiplatform.v1.HyperparameterTuningJob.trials]).
  bool enable_dashboard_access = 16 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. The Experiment this job is associated with.
  // Format:
  // `projects/{project}/locations/{location}/metadataStores/{metadataStores}/contexts/{experiment-name}`
  string experiment = 17 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Context"
    }
  ];

  // Optional. The Experiment Run this job is associated with.
  // Format:
  // `projects/{project}/locations/{location}/metadataStores/{metadataStores}/contexts/{experiment-name}-{experiment-run-name}`
  string experiment_run = 18 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Context"
    }
  ];

  // Optional. Names of the Model resources for which a mapping to artifact
  // URIs is generated. Applies only to some of the Google-provided custom
  // jobs. Format: `projects/{project}/locations/{location}/models/{model}`
  //
  // To retrieve a specific version of the model, also provide the version ID
  // or version alias.
  //   Example: `projects/{project}/locations/{location}/models/{model}@2`
  //              or
  //            `projects/{project}/locations/{location}/models/{model}@golden`
  // When neither a version ID nor an alias is specified, the "default"
  // version is returned. The "default" version alias is created for the first
  // version of the model and can later be moved to other versions. There is
  // exactly one default version.
  repeated string models = 20 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Model"
    }
  ];
}
280
// The spec of one worker pool in a job.
message WorkerPoolSpec {
  // The custom task that this worker pool executes.
  oneof task {
    // A custom container task.
    ContainerSpec container_spec = 6;

    // A Python packaged task.
    PythonPackageSpec python_package_spec = 7;
  }

  // Optional. Immutable. Specification of a single machine.
  MachineSpec machine_spec = 1 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Optional. Number of worker replicas used by this worker pool.
  int64 replica_count = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. The NFS mount specs.
  repeated NfsMount nfs_mounts = 4 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // The disk spec.
  DiskSpec disk_spec = 5;
}
307
// Spec of a Container.
message ContainerSpec {
  // Required. URI of a container image in the Container Registry that is run
  // on each worker replica.
  string image_uri = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Command invoked when the container starts.
  // When provided, it overrides the entrypoint instruction in the Dockerfile.
  repeated string command = 2;

  // Arguments passed when the container is started.
  repeated string args = 3;

  // Environment variables passed to the container.
  // Maximum limit is 100.
  repeated EnvVar env = 4;
}
325
// Spec of Python packaged code.
message PythonPackageSpec {
  // Required. URI of a container image in Artifact Registry that runs the
  // provided Python package. Vertex AI offers a wide range of executor images
  // with pre-installed packages to cover users' various use cases; see the
  // list of [pre-built containers for
  // training](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers).
  // The image must be one from that list.
  string executor_image_uri = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Google Cloud Storage locations of the Python package files,
  // i.e. the training program and the packages it depends on. At most 100
  // package URIs may be given.
  repeated string package_uris = 2 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Name of the Python module to run once the packages are
  // installed.
  string python_module = 3 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Command line arguments passed to the Python task.
  repeated string args = 4;

  // Environment variables passed to the python module.
  // Maximum limit is 100.
  repeated EnvVar env = 5;
}
351
// Parameters governing the queuing and scheduling of custom jobs.
message Scheduling {
  // Maximum running time of the job. The default is 7 days.
  google.protobuf.Duration timeout = 1;

  // When a worker gets restarted, restarts the entire CustomJob.
  // Distributed training jobs that are not resilient to workers leaving and
  // joining a job can make use of this feature.
  bool restart_job_on_worker_restart = 3;

  // Optional. Indicates whether the job should retry for internal errors
  // after the job starts running. When true, it overrides
  // `Scheduling.restart_job_on_worker_restart` to false.
  bool disable_retries = 5 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
367