// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1beta1/encryption_spec.proto";
import "google/cloud/aiplatform/v1beta1/io.proto";
import "google/cloud/aiplatform/v1beta1/model.proto";
import "google/cloud/aiplatform/v1beta1/pipeline_state.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1";
option go_package = "cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "TrainingPipelineProto";
option java_package = "com.google.cloud.aiplatform.v1beta1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1";
option ruby_package = "Google::Cloud::AIPlatform::V1beta1";

// The TrainingPipeline orchestrates tasks associated with training a Model.
// It always executes the training task, and optionally may also
// export data from Vertex AI's Dataset which becomes the training input,
// [upload][google.cloud.aiplatform.v1beta1.ModelService.UploadModel] the Model
// to Vertex AI, and evaluate the Model.
message TrainingPipeline {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/TrainingPipeline"
    pattern: "projects/{project}/locations/{location}/trainingPipelines/{training_pipeline}"
  };

  // Output only. Resource name of the TrainingPipeline.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The user-defined name of this TrainingPipeline.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Specifies Vertex AI owned input data that may be used for training the
  // Model. The TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]
  // should make clear whether this config is used and if there are any special
  // requirements on how it should be filled. If nothing about this config is
  // mentioned in the
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition],
  // then it should be assumed that the TrainingPipeline does not depend on
  // this configuration.
  InputDataConfig input_data_config = 3;
  // Required. A Google Cloud Storage path to the YAML file that defines the
  // training task which is responsible for producing the model artifact, and
  // may also include additional auxiliary work. The definition files that can
  // be used here are found in
  // gs://google-cloud-aiplatform/schema/trainingjob/definition/.
  // Note: the URI given on output will be immutable and probably different,
  // including the URI scheme, than the one given on input. The output URI will
  // point to a location where the user only has read access.
  string training_task_definition = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. The training task's parameter(s), as specified in the
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]'s
  // `inputs`.
  google.protobuf.Value training_task_inputs = 5
      [(google.api.field_behavior) = REQUIRED];

  // Output only. The metadata information as specified in the
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]'s
  // `metadata`. This metadata is auxiliary runtime and final information
  // about the training task. While the pipeline is running, this information
  // is populated only on a best-effort basis. Only present if the pipeline's
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]
  // contains a `metadata` object.
  google.protobuf.Value training_task_metadata = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Describes the Model that may be uploaded (via
  // [ModelService.UploadModel][google.cloud.aiplatform.v1beta1.ModelService.UploadModel])
  // by this TrainingPipeline. The TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]
  // should make clear whether this Model description should be populated, and
  // if there are any special requirements regarding how it should be filled.
  // If nothing is mentioned in the
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition],
  // then it should be assumed that this field should not be filled and that
  // the training task either uploads the Model without needing this
  // information, or does not support uploading a Model as part of the
  // pipeline. When the Pipeline's state becomes `PIPELINE_STATE_SUCCEEDED` and
  // the trained Model has been uploaded into Vertex AI, then the
  // model_to_upload's resource
  // [name][google.cloud.aiplatform.v1beta1.Model.name] is populated. The Model
  // is always uploaded into the Project and Location in which this pipeline
  // is.
  Model model_to_upload = 7;

  // Optional. The ID to use for the uploaded Model, which will become the
  // final component of the model resource name.
  //
  // This value may be up to 63 characters, and valid characters are
  // `[a-z0-9_-]`. The first character cannot be a number or hyphen.
  string model_id = 22 [(google.api.field_behavior) = OPTIONAL];

  // Optional. When this field is specified, `model_to_upload` will not be
  // uploaded as a new model; instead, it will become a new version of this
  // `parent_model`.
  string parent_model = 21 [(google.api.field_behavior) = OPTIONAL];
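
  // Purely for illustration, a sketch of how `model_id` and `parent_model`
  // interact when uploading the trained Model; the project, location, and
  // IDs below are made up:
  //
  //   # Upload as a brand-new Model with a caller-chosen ID:
  //   model_id: "my-model"
  //
  //   # Or, instead, make the trained Model a new version of an existing one
  //   # (leaving `model_id` unset):
  //   parent_model: "projects/my-project/locations/us-central1/models/my-model"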

  // Output only. The detailed state of the pipeline.
  PipelineState state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Only populated when the pipeline's state is
  // `PIPELINE_STATE_FAILED` or `PIPELINE_STATE_CANCELLED`.
  google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was created.
  google.protobuf.Timestamp create_time = 11
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline for the first time entered
  // the `PIPELINE_STATE_RUNNING` state.
  google.protobuf.Timestamp start_time = 12
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline entered any of the following
  // states: `PIPELINE_STATE_SUCCEEDED`, `PIPELINE_STATE_FAILED`,
  // `PIPELINE_STATE_CANCELLED`.
  google.protobuf.Timestamp end_time = 13
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was most recently updated.
  google.protobuf.Timestamp update_time = 14
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The labels with user-defined metadata to organize TrainingPipelines.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), and can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  map<string, string> labels = 15;

  // Customer-managed encryption key spec for a TrainingPipeline. If set, this
  // TrainingPipeline will be secured by this key.
  //
  // Note: the Model trained by this TrainingPipeline is also secured by this
  // key if
  // [model_to_upload][google.cloud.aiplatform.v1beta1.TrainingPipeline.encryption_spec]
  // is not set separately.
  EncryptionSpec encryption_spec = 18;
}
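
// Purely for illustration, a minimal TrainingPipeline sketched in textproto
// form. The bucket-hosted schema chosen here and the shape of the inputs are
// assumptions, not values mandated by this file:
//
//   display_name: "my-training-pipeline"
//   training_task_definition:
//       "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml"
//   training_task_inputs {
//     struct_value {
//       # Parameters required by the chosen definition's `inputs` schema.
//     }
//   }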

// Specifies Vertex AI owned input data to be used for training, and
// possibly evaluating, the Model.
message InputDataConfig {
  // The instructions on how the input data should be split between the
  // training, validation and test sets.
  // If no split type is provided, the
  // [fraction_split][google.cloud.aiplatform.v1beta1.InputDataConfig.fraction_split]
  // is used by default.
  oneof split {
    // Split based on fractions defining the size of each set.
    FractionSplit fraction_split = 2;

    // Split based on the provided filters for each set.
    FilterSplit filter_split = 3;

    // Supported only for tabular Datasets.
    //
    // Split based on a predefined key.
    PredefinedSplit predefined_split = 4;

    // Supported only for tabular Datasets.
    //
    // Split based on the timestamp of the input data pieces.
    TimestampSplit timestamp_split = 5;

    // Supported only for tabular Datasets.
    //
    // Split based on the distribution of the specified column.
    StratifiedSplit stratified_split = 12;
  }

  // Only applicable to Custom and Hyperparameter Tuning TrainingPipelines.
  //
  // The destination where the training data is to be written.
  //
  // Supported destination file formats:
  //   * For non-tabular data: "jsonl".
  //   * For tabular data: "csv" and "bigquery".
  //
  // The following Vertex AI environment variables are passed to containers
  // or Python modules of the training task when this field is set:
  //
  //   * AIP_DATA_FORMAT : Exported data format.
  //   * AIP_TRAINING_DATA_URI : Sharded exported training data URIs.
  //   * AIP_VALIDATION_DATA_URI : Sharded exported validation data URIs.
  //   * AIP_TEST_DATA_URI : Sharded exported test data URIs.
  oneof destination {
    // The Cloud Storage location where the training data is to be
    // written to. In the given directory, a new directory is created with
    // the name
    // `dataset-<dataset-id>-<annotation-type>-<timestamp-of-training-call>`,
    // where the timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format.
    // All training input data is written into that directory.
    //
    // The Vertex AI environment variables representing Cloud Storage
    // data URIs are represented in the Cloud Storage wildcard
    // format to support sharded data, e.g. "gs://.../training-*.jsonl":
    //
    //   * AIP_DATA_FORMAT = "jsonl" for non-tabular data, "csv" for tabular
    //     data
    //   * AIP_TRAINING_DATA_URI =
    //     "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/training-*.${AIP_DATA_FORMAT}"
    //   * AIP_VALIDATION_DATA_URI =
    //     "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/validation-*.${AIP_DATA_FORMAT}"
    //   * AIP_TEST_DATA_URI =
    //     "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/test-*.${AIP_DATA_FORMAT}"
    GcsDestination gcs_destination = 8;

    // Only applicable to custom training with a tabular Dataset that has a
    // BigQuery source.
    //
    // The BigQuery project location where the training data is to be written
    // to. In the given project, a new dataset is created with the name
    // `dataset_<dataset-id>_<annotation-type>_<timestamp-of-training-call>`,
    // where the timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. All training
    // input data is written into that dataset. In the dataset, three
    // tables are created: `training`, `validation` and `test`.
    //
    //   * AIP_DATA_FORMAT = "bigquery"
    //   * AIP_TRAINING_DATA_URI =
    //     "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.training"
    //   * AIP_VALIDATION_DATA_URI =
    //     "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.validation"
    //   * AIP_TEST_DATA_URI =
    //     "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.test"
    BigQueryDestination bigquery_destination = 10;
  }
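
  // Purely for illustration, one way to pick the Cloud Storage destination
  // (the bucket is made up; `output_uri_prefix` is the field GcsDestination
  // defines in io.proto):
  //
  //   gcs_destination {
  //     output_uri_prefix: "gs://my-bucket/exports/"
  //   }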

  // Required. The ID of the Dataset in the same Project and Location whose
  // data will be used to train the Model. The Dataset must use a schema
  // compatible with the Model being trained, and what is compatible should be
  // described in the used TrainingPipeline's [training_task_definition]
  // [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition].
  // For tabular Datasets, all their data is exported to training, to pick
  // and choose from.
  string dataset_id = 1 [(google.api.field_behavior) = REQUIRED];

  // Applicable only to Datasets that have DataItems and Annotations.
  //
  // A filter on Annotations of the Dataset. Only Annotations that both
  // match this filter and belong to DataItems not ignored by the split method
  // are used in, respectively, the training, validation or test role,
  // depending on the role of the DataItem they are on (for the auto-assigned
  // ones that role is decided by Vertex AI). A filter with the same syntax as
  // the one used in
  // [ListAnnotations][google.cloud.aiplatform.v1beta1.DatasetService.ListAnnotations]
  // may be used, but note that here it filters across all Annotations of the
  // Dataset, and not just within a single DataItem.
  string annotations_filter = 6;

  // Applicable only to custom training with Datasets that have DataItems and
  // Annotations.
  //
  // Cloud Storage URI that points to a YAML file describing the annotation
  // schema. The schema is defined as an OpenAPI 3.0.2 [Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  // The schema files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/dataset/annotation/. Note that the
  // chosen schema must be consistent with
  // [metadata][google.cloud.aiplatform.v1beta1.Dataset.metadata_schema_uri] of
  // the Dataset specified by
  // [dataset_id][google.cloud.aiplatform.v1beta1.InputDataConfig.dataset_id].
  //
  // Only Annotations that both match this schema and belong to DataItems not
  // ignored by the split method are used in, respectively, the training,
  // validation or test role, depending on the role of the DataItem they are
  // on.
  //
  // When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1beta1.InputDataConfig.annotation_schema_uri].
  string annotation_schema_uri = 9;

  // Only applicable to Datasets that have SavedQueries.
  //
  // The ID of a SavedQuery (annotation set) under the Dataset specified by
  // [dataset_id][google.cloud.aiplatform.v1beta1.InputDataConfig.dataset_id]
  // used for filtering Annotations for training.
  //
  // Only Annotations that are associated with this SavedQuery are used in
  // training. When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [saved_query_id][google.cloud.aiplatform.v1beta1.InputDataConfig.saved_query_id]
  // and
  // [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter].
  //
  // Only one of
  // [saved_query_id][google.cloud.aiplatform.v1beta1.InputDataConfig.saved_query_id]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1beta1.InputDataConfig.annotation_schema_uri]
  // should be specified, as both of them represent the same thing: the
  // problem type.
  string saved_query_id = 7;

  // Whether to persist the ML use assignment to data item system labels.
  bool persist_ml_use_assignment = 11;
}
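
// Purely for illustration, a complete InputDataConfig sketched in textproto
// form, combining a fraction split with a Cloud Storage export destination;
// the dataset ID and bucket are made up:
//
//   input_data_config {
//     dataset_id: "1234567890"
//     fraction_split {
//       training_fraction: 0.8
//       validation_fraction: 0.1
//       test_fraction: 0.1
//     }
//     gcs_destination { output_uri_prefix: "gs://my-bucket/exports/" }
//   }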

// Assigns the input data to training, validation, and test sets as per the
// given fractions. Any of `training_fraction`, `validation_fraction` and
// `test_fraction` may optionally be provided; together they must sum to at
// most 1. If the provided fractions sum to less than 1, the remainder is
// assigned to sets as decided by Vertex AI. If none of the fractions are set,
// by default roughly 80% of the data is used for training, 10% for
// validation, and 10% for test.
message FractionSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;
}

// Assigns input data to training, validation, and test sets based on the
// given filters; data pieces not matched by any filter are ignored. Currently
// only supported for Datasets containing DataItems.
// If any of the filters in this message should match nothing, it can be
// set to '-' (the minus sign).
//
// Supported only for unstructured Datasets.
message FilterSplit {
  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to train the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to validate the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to test the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}

// Assigns input data to training, validation, and test sets based on the
// value of a provided key.
//
// Supported only for tabular Datasets.
message PredefinedSplit {
  // Required. The key is the name of one of the Dataset's data columns.
  // The value of the key (either the label's value or the value in the
  // column) must be one of {`training`, `validation`, `test`}, and it defines
  // to which set the given piece of data is assigned. If for a piece of data
  // the key is not present or has an invalid value, that piece is ignored by
  // the pipeline.
  string key = 1 [(google.api.field_behavior) = REQUIRED];
}
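
// Purely for illustration, a predefined split over an assumed tabular column
// named "ml_use" whose per-row values are "training", "validation", or
// "test"; rows with any other value are ignored:
//
//   predefined_split {
//     key: "ml_use"
//   }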

// Assigns input data to training, validation, and test sets based on a
// provided timestamp. The youngest data pieces are assigned to the training
// set, the next to the validation set, and the oldest to the test set.
//
// Supported only for tabular Datasets.
message TimestampSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;

  // Required. The key is the name of one of the Dataset's data columns.
  // The values of the key (the values in the column) must be in RFC 3339
  // `date-time` format, where `time-offset` = `"Z"`
  // (e.g. 1985-04-12T23:20:50.52Z). If for a piece of data the key is not
  // present or has an invalid value, that piece is ignored by the pipeline.
  string key = 4 [(google.api.field_behavior) = REQUIRED];
}

// Assigns input data to the training, validation, and test sets so that the
// distribution of values found in the categorical column (as specified by the
// `key` field) is mirrored within each split. The fraction values determine
// the relative sizes of the splits.
//
// For example, if the specified column has three values, with 50% of the rows
// having value "A", 25% value "B", and 25% value "C", and the split fractions
// are specified as 80/10/10, then the training set will constitute 80% of the
// input data, with about 50% of the training set rows having the value "A"
// for the specified column, about 25% having the value "B", and about 25%
// having the value "C".
//
// Only the 500 most frequently occurring values are used; any values not in
// the top 500 are randomly assigned to a split. If fewer than three rows
// contain a specific value, those rows are randomly assigned.
//
// Supported only for tabular Datasets.
message StratifiedSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;

  // Required. The key is the name of one of the Dataset's data columns.
  // The key provided must be for a categorical column.
  string key = 4 [(google.api.field_behavior) = REQUIRED];
}
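
// Purely for illustration, a stratified split over an assumed categorical
// column named "customer_tier", reusing the 80/10/10 fractions from the
// example in the comment above:
//
//   stratified_split {
//     training_fraction: 0.8
//     validation_fraction: 0.1
//     test_fraction: 0.1
//     key: "customer_tier"
//   }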