// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1beta1/encryption_spec.proto";
import "google/cloud/aiplatform/v1beta1/io.proto";
import "google/cloud/aiplatform/v1beta1/model.proto";
import "google/cloud/aiplatform/v1beta1/pipeline_state.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1";
option go_package = "cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "TrainingPipelineProto";
option java_package = "com.google.cloud.aiplatform.v1beta1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1";
option ruby_package = "Google::Cloud::AIPlatform::V1beta1";

// The TrainingPipeline orchestrates tasks associated with training a Model. It
// always executes the training task, and optionally may also
// export data from Vertex AI's Dataset which becomes the training input,
// [upload][google.cloud.aiplatform.v1beta1.ModelService.UploadModel] the Model
// to Vertex AI, and evaluate the Model.
message TrainingPipeline {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/TrainingPipeline"
    pattern: "projects/{project}/locations/{location}/trainingPipelines/{training_pipeline}"
  };

  // Output only. Resource name of the TrainingPipeline.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The user-defined name of this TrainingPipeline.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Specifies Vertex AI owned input data that may be used for training the
  // Model. The TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]
  // should make clear whether this config is used and if there are any special
  // requirements on how it should be filled. If nothing about this config is
  // mentioned in the
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition],
  // then it should be assumed that the TrainingPipeline does not depend on this
  // configuration.
  InputDataConfig input_data_config = 3;

  // Required. A Google Cloud Storage path to the YAML file that defines the
  // training task which is responsible for producing the model artifact, and
  // may also include additional auxiliary work. The definition files that can
  // be used here are found in
  // gs://google-cloud-aiplatform/schema/trainingjob/definition/.
  // Note: The URI given on output will be immutable and probably different
  // (including the URI scheme) from the one given on input. The output URI
  // will point to a location where the user only has read access.
  string training_task_definition = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. The training task's parameter(s), as specified in the
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]'s
  // `inputs`.
  google.protobuf.Value training_task_inputs = 5
      [(google.api.field_behavior) = REQUIRED];

  // Output only. The metadata information as specified in the
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]'s
  // `metadata`. This metadata is auxiliary runtime and final information
  // about the training task. While the pipeline is running, this information
  // is populated only on a best-effort basis. It is only present if the
  // pipeline's
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]
  // contains a `metadata` object.
  google.protobuf.Value training_task_metadata = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Describes the Model that may be uploaded (via
  // [ModelService.UploadModel][google.cloud.aiplatform.v1beta1.ModelService.UploadModel])
  // by this TrainingPipeline. The TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]
  // should make clear whether this Model description should be populated, and
  // if there are any special requirements regarding how it should be filled.
  // If nothing is mentioned in the
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition],
  // then it should be assumed that this field should not be filled and that
  // the training task either uploads the Model without needing this
  // information, or does not support uploading a Model as part of the
  // pipeline. When the pipeline's state becomes `PIPELINE_STATE_SUCCEEDED` and
  // the trained Model has been uploaded into Vertex AI, then the
  // model_to_upload's resource
  // [name][google.cloud.aiplatform.v1beta1.Model.name] is populated. The Model
  // is always uploaded into the Project and Location in which this pipeline
  // runs.
  Model model_to_upload = 7;

  // Optional. The ID to use for the uploaded Model, which will become the final
  // component of the model resource name.
  //
  // This value may be up to 63 characters, and valid characters are
  // `[a-z0-9_-]`. The first character cannot be a number or hyphen.
  string model_id = 22 [(google.api.field_behavior) = OPTIONAL];

  // Optional. When this field is specified, the `model_to_upload` will not be
  // uploaded as a new model; instead, it will become a new version of this
  // `parent_model`.
  string parent_model = 21 [(google.api.field_behavior) = OPTIONAL];

  // Output only. The detailed state of the pipeline.
  PipelineState state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Only populated when the pipeline's state is
  // `PIPELINE_STATE_FAILED` or `PIPELINE_STATE_CANCELLED`.
  google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was created.
  google.protobuf.Timestamp create_time = 11
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline for the first time entered the
  // `PIPELINE_STATE_RUNNING` state.
  google.protobuf.Timestamp start_time = 12
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline entered any of the following
  // states: `PIPELINE_STATE_SUCCEEDED`, `PIPELINE_STATE_FAILED`,
  // `PIPELINE_STATE_CANCELLED`.
  google.protobuf.Timestamp end_time = 13
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was most recently updated.
  google.protobuf.Timestamp update_time = 14
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The labels with user-defined metadata to organize TrainingPipelines.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), and can only contain lowercase letters, numeric
  // characters, underscores, and dashes. International characters are allowed.
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  map<string, string> labels = 15;

  // Customer-managed encryption key spec for a TrainingPipeline. If set, this
  // TrainingPipeline will be secured by this key.
  //
  // Note: The Model trained by this TrainingPipeline is also secured by this
  // key if
  // [model_to_upload][google.cloud.aiplatform.v1beta1.TrainingPipeline.model_to_upload]
  // does not have an encryption spec set separately.
  EncryptionSpec encryption_spec = 18;
}
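
// Example (an illustrative sketch, not normative schema documentation): a
// minimal TrainingPipeline in proto text format. The display names, dataset
// ID, and definition file name are hypothetical placeholders, and
// `training_task_inputs` is elided because its shape is dictated by the
// chosen training task schema.
//
//   display_name: "flowers-classification"
//   training_task_definition:
//       "gs://google-cloud-aiplatform/schema/trainingjob/definition/<task>.yaml"
//   training_task_inputs { struct_value { ... } }
//   input_data_config {
//     dataset_id: "1234567890"
//     fraction_split {
//       training_fraction: 0.8
//       validation_fraction: 0.1
//       test_fraction: 0.1
//     }
//   }
//   model_to_upload { display_name: "flowers-model" }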

// Specifies Vertex AI owned input data to be used for training, and
// possibly evaluating, the Model.
message InputDataConfig {
  // Instructions for how the input data should be split between the
  // training, validation, and test sets.
  // If no split type is provided, the
  // [fraction_split][google.cloud.aiplatform.v1beta1.InputDataConfig.fraction_split]
  // is used by default.
  oneof split {
    // Split based on fractions defining the size of each set.
    FractionSplit fraction_split = 2;

    // Split based on the provided filters for each set.
    FilterSplit filter_split = 3;

    // Supported only for tabular Datasets.
    //
    // Split based on a predefined key.
    PredefinedSplit predefined_split = 4;

    // Supported only for tabular Datasets.
    //
    // Split based on the timestamp of the input data pieces.
    TimestampSplit timestamp_split = 5;

    // Supported only for tabular Datasets.
    //
    // Split based on the distribution of the specified column.
    StratifiedSplit stratified_split = 12;
  }

  // Only applicable to Custom and Hyperparameter Tuning TrainingPipelines.
  //
  // The destination to which the training data is to be written.
  //
  // Supported destination file formats:
  //   * For non-tabular data: "jsonl".
  //   * For tabular data: "csv" and "bigquery".
  //
  // The following Vertex AI environment variables are passed to containers
  // or Python modules of the training task when this field is set:
  //
  // * AIP_DATA_FORMAT : Exported data format.
  // * AIP_TRAINING_DATA_URI : Sharded exported training data URIs.
  // * AIP_VALIDATION_DATA_URI : Sharded exported validation data URIs.
  // * AIP_TEST_DATA_URI : Sharded exported test data URIs.
  oneof destination {
    // The Cloud Storage location where the training data is to be
    // written. In the given directory a new directory is created with the
    // name
    // `dataset-<dataset-id>-<annotation-type>-<timestamp-of-training-call>`,
    // where the timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format.
    // All training input data is written into that directory.
    //
    // The Vertex AI environment variables representing Cloud Storage
    // data URIs are represented in the Cloud Storage wildcard
    // format to support sharded data, e.g. "gs://.../training-*.jsonl":
    //
    // * AIP_DATA_FORMAT = "jsonl" for non-tabular data, "csv" for tabular data
    // * AIP_TRAINING_DATA_URI =
    // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/training-*.${AIP_DATA_FORMAT}"
    //
    // * AIP_VALIDATION_DATA_URI =
    // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/validation-*.${AIP_DATA_FORMAT}"
    //
    // * AIP_TEST_DATA_URI =
    // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/test-*.${AIP_DATA_FORMAT}"
    GcsDestination gcs_destination = 8;

    // Only applicable to custom training with a tabular Dataset with a
    // BigQuery source.
    //
    // The BigQuery project location where the training data is to be written.
    // In the given project a new dataset is created with the name
    // `dataset_<dataset-id>_<annotation-type>_<timestamp-of-training-call>`,
    // where the timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. All training
    // input data is written into that dataset. In the dataset three
    // tables are created: `training`, `validation`, and `test`.
    //
    // * AIP_DATA_FORMAT = "bigquery".
    // * AIP_TRAINING_DATA_URI =
    // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.training"
    //
    // * AIP_VALIDATION_DATA_URI =
    // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.validation"
    //
    // * AIP_TEST_DATA_URI =
    // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.test"
    BigQueryDestination bigquery_destination = 10;
  }
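
  // Example (an illustrative sketch; the bucket name is a hypothetical
  // placeholder). With a destination such as
  //
  //   gcs_destination { output_uri_prefix: "gs://my-bucket/exports/" }
  //
  // a non-tabular export would surface to the training code roughly as:
  //
  //   AIP_DATA_FORMAT = "jsonl"
  //   AIP_TRAINING_DATA_URI =
  //   "gs://my-bucket/exports/dataset-<dataset-id>-<annotation-type>-<time>/training-*.jsonl"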

  // Required. The ID of the Dataset in the same Project and Location whose
  // data will be used to train the Model. The Dataset must use a schema
  // compatible with the Model being trained, and what is compatible should be
  // described in the used TrainingPipeline's [training_task_definition]
  // [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition].
  // For tabular Datasets, all of their data is exported to training, to pick
  // and choose from.
  string dataset_id = 1 [(google.api.field_behavior) = REQUIRED];

  // Applicable only to Datasets that have DataItems and Annotations.
  //
  // A filter on Annotations of the Dataset. Only Annotations that both
  // match this filter and belong to DataItems not ignored by the split method
  // are used in, respectively, the training, validation, or test role,
  // depending on the role of the DataItem they are on (for auto-assigned
  // DataItems, that role is decided by Vertex AI). A filter with the same
  // syntax as the one used in
  // [ListAnnotations][google.cloud.aiplatform.v1beta1.DatasetService.ListAnnotations]
  // may be used, but note that here it filters across all Annotations of the
  // Dataset, and not just within a single DataItem.
  string annotations_filter = 6;

  // Applicable only to custom training with Datasets that have DataItems and
  // Annotations.
  //
  // Cloud Storage URI that points to a YAML file describing the annotation
  // schema. The schema is defined as an OpenAPI 3.0.2 [Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  // The schema files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/dataset/annotation/ . Note that the
  // chosen schema must be consistent with the
  // [metadata][google.cloud.aiplatform.v1beta1.Dataset.metadata_schema_uri] of
  // the Dataset specified by
  // [dataset_id][google.cloud.aiplatform.v1beta1.InputDataConfig.dataset_id].
  //
  // Only Annotations that both match this schema and belong to DataItems not
  // ignored by the split method are used in, respectively, the training,
  // validation, or test role, depending on the role of the DataItem they are
  // on.
  //
  // When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1beta1.InputDataConfig.annotation_schema_uri].
  string annotation_schema_uri = 9;

  // Only applicable to Datasets that have SavedQueries.
  //
  // The ID of a SavedQuery (annotation set) under the Dataset specified by
  // [dataset_id][google.cloud.aiplatform.v1beta1.InputDataConfig.dataset_id]
  // used for filtering Annotations for training.
  //
  // Only Annotations that are associated with this SavedQuery are used for
  // training. When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [saved_query_id][google.cloud.aiplatform.v1beta1.InputDataConfig.saved_query_id]
  // and
  // [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter].
  //
  // Only one of
  // [saved_query_id][google.cloud.aiplatform.v1beta1.InputDataConfig.saved_query_id]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1beta1.InputDataConfig.annotation_schema_uri]
  // should be specified, as both of them represent the same thing: the
  // problem type.
  string saved_query_id = 7;

  // Whether to persist the ML use assignment to data item system labels.
  bool persist_ml_use_assignment = 11;
}
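
// Example (an illustrative sketch; the IDs are hypothetical placeholders):
// an InputDataConfig that restricts training Annotations to a single
// SavedQuery (annotation set) and persists the ML use assignment back to
// data item system labels:
//
//   dataset_id: "1234567890"
//   saved_query_id: "987654321"
//   persist_ml_use_assignment: true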

// Assigns the input data to training, validation, and test sets as per the
// given fractions. Any of `training_fraction`, `validation_fraction` and
// `test_fraction` may optionally be provided; together they must sum to at
// most 1. If the provided fractions sum to less than 1, the remainder is
// assigned to sets as decided by Vertex AI. If none of the fractions are set,
// by default roughly 80% of data is used for training, 10% for validation,
// and 10% for test.
message FractionSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;
}
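
// Example (an illustrative sketch): an explicit 80/10/10 FractionSplit whose
// fractions sum to exactly 1:
//
//   fraction_split {
//     training_fraction: 0.8
//     validation_fraction: 0.1
//     test_fraction: 0.1
//   }
//
// If only `training_fraction: 0.8` were provided, the remaining 0.2 would be
// assigned to the validation and test sets as decided by Vertex AI.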

// Assigns input data to training, validation, and test sets based on the
// given filters; data pieces not matched by any filter are ignored. Currently
// only supported for Datasets containing DataItems.
// If any of the filters in this message are meant to match nothing, they can
// be set to '-' (the minus sign).
//
// Supported only for unstructured Datasets.
message FilterSplit {
  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to train the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to validate the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to test the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}
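
// Example (an illustrative sketch; the label key in the filters is a
// hypothetical placeholder for a ListDataItems-style filter expression).
// DataItems matched by neither filter are ignored, and '-' leaves the test
// set empty:
//
//   filter_split {
//     training_filter: "labels.ml_use=training"
//     validation_filter: "labels.ml_use=validation"
//     test_filter: "-"
//   }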

// Assigns input data to training, validation, and test sets based on the
// value of a provided key.
//
// Supported only for tabular Datasets.
message PredefinedSplit {
  // Required. The key is the name of one of the Dataset's data columns.
  // The value of the key (either the label's value or value in the column)
  // must be one of {`training`, `validation`, `test`}, and it defines to which
  // set the given piece of data is assigned. If for a piece of data the key
  // is not present or has an invalid value, that piece is ignored by the
  // pipeline.
  string key = 1 [(google.api.field_behavior) = REQUIRED];
}
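
// Example (an illustrative sketch; `ml_use` is a hypothetical column name).
// Rows whose `ml_use` column contains "training", "validation", or "test"
// are assigned to the corresponding set; rows with any other value, or with
// the column missing, are ignored:
//
//   predefined_split {
//     key: "ml_use"
//   }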

// Assigns input data to training, validation, and test sets based on a
// provided timestamp. The youngest data pieces are assigned to the training
// set, the next to the validation set, and the oldest to the test set.
//
// Supported only for tabular Datasets.
message TimestampSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;

  // Required. The key is the name of one of the Dataset's data columns.
  // The values of the key (the values in the column) must be in RFC 3339
  // `date-time` format, where `time-offset` = `"Z"`
  // (e.g. 1985-04-12T23:20:50.52Z). If for a piece of data the key is not
  // present or has an invalid value, that piece is ignored by the pipeline.
  string key = 4 [(google.api.field_behavior) = REQUIRED];
}
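
// Example (an illustrative sketch; `event_time` is a hypothetical column
// holding RFC 3339 timestamps such as "1985-04-12T23:20:50.52Z"):
//
//   timestamp_split {
//     training_fraction: 0.8
//     validation_fraction: 0.1
//     test_fraction: 0.1
//     key: "event_time"
//   }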

// Assigns input data to the training, validation, and test sets so that the
// distribution of values found in the categorical column (as specified by the
// `key` field) is mirrored within each split. The fraction values determine
// the relative sizes of the splits.
//
// For example, if the specified column has three values, with 50% of the rows
// having value "A", 25% value "B", and 25% value "C", and the split fractions
// are specified as 80/10/10, then the training set will constitute 80% of the
// input data, with about 50% of the training set rows having the value "A"
// for the specified column, about 25% having the value "B", and about 25%
// having the value "C".
//
// Only the top 500 occurring values are used; any values not in the top
// 500 values are randomly assigned to a split. If fewer than three rows
// contain a specific value, those rows are randomly assigned.
//
// Supported only for tabular Datasets.
message StratifiedSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;

  // Required. The key is the name of one of the Dataset's data columns.
  // The key provided must be for a categorical column.
  string key = 4 [(google.api.field_behavior) = REQUIRED];
}
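
// Example (an illustrative sketch; `product_category` is a hypothetical
// categorical column): an 80/10/10 split in which each set mirrors the
// column's value distribution, per the description above:
//
//   stratified_split {
//     training_fraction: 0.8
//     validation_fraction: 0.1
//     test_fraction: 0.1
//     key: "product_category"
//   }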