// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1/encryption_spec.proto";
import "google/cloud/aiplatform/v1/io.proto";
import "google/cloud/aiplatform/v1/saved_query.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1";
option go_package = "cloud.google.com/go/aiplatform/apiv1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "DatasetProto";
option java_package = "com.google.cloud.aiplatform.v1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1";
option ruby_package = "Google::Cloud::AIPlatform::V1";

// A collection of DataItems and Annotations on them.
message Dataset {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/Dataset"
    pattern: "projects/{project}/locations/{location}/datasets/{dataset}"
  };

  // Output only. The resource name of the Dataset.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The user-defined name of the Dataset.
  // The name can be up to 128 characters long and can consist of any UTF-8
  // characters.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // The description of the Dataset.
  string description = 16;

  // Required. Points to a YAML file stored on Google Cloud Storage describing
  // additional information about the Dataset. The schema is defined as an
  // OpenAPI 3.0.2 Schema Object. The schema files that can be used here are
  // found in gs://google-cloud-aiplatform/schema/dataset/metadata/.
  string metadata_schema_uri = 3 [(google.api.field_behavior) = REQUIRED];

  // Required. Additional information about the Dataset.
  google.protobuf.Value metadata = 8 [(google.api.field_behavior) = REQUIRED];

  // Output only. The number of DataItems in this Dataset. Only applies to
  // non-structured Datasets.
  int64 data_item_count = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Timestamp when this Dataset was created.
  google.protobuf.Timestamp create_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Timestamp when this Dataset was last updated.
  google.protobuf.Timestamp update_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Used to perform consistent read-modify-write updates. If not set, a blind
  // "overwrite" update happens.
  string etag = 6;

  // The labels with user-defined metadata to organize your Datasets.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  // No more than 64 user labels can be associated with one Dataset (System
  // labels are excluded).
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  // System reserved label keys are prefixed with "aiplatform.googleapis.com/"
  // and are immutable. The following system labels exist for each Dataset:
  //
  // * "aiplatform.googleapis.com/dataset_metadata_schema": output only, its
  //   value is the
  //   [metadata_schema's][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri]
  //   title.
  map<string, string> labels = 7;

  // All SavedQueries that belong to the Dataset will be returned in List/Get
  // Dataset response. The annotation_specs field
  // will not be populated except for UI cases which will only use
  // [annotation_spec_count][google.cloud.aiplatform.v1.SavedQuery.annotation_spec_count].
  // In CreateDataset request, a SavedQuery is created together if
  // this field is set, up to one SavedQuery can be set in CreateDatasetRequest.
  // The SavedQuery should not contain any AnnotationSpec.
  repeated SavedQuery saved_queries = 9;

  // Customer-managed encryption key spec for a Dataset. If set, this Dataset
  // and all sub-resources of this Dataset will be secured by this key.
  EncryptionSpec encryption_spec = 11;

  // Output only. The resource name of the Artifact that was created in
  // MetadataStore when creating the Dataset. The Artifact resource name pattern
  // is
  // `projects/{project}/locations/{location}/metadataStores/{metadata_store}/artifacts/{artifact}`.
  string metadata_artifact = 17 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Describes the location from where we import data into a Dataset, together
// with the labels that will be applied to the DataItems and the Annotations.
message ImportDataConfig {
  // The source of the input.
  oneof source {
    // The Google Cloud Storage location for the input content.
    GcsSource gcs_source = 1;
  }

  // Labels that will be applied to newly imported DataItems. If an identical
  // DataItem as one being imported already exists in the Dataset, then these
  // labels will be appended to those of the already existing one, and if a
  // label with an identical key was imported before, the old label value will
  // be overwritten. If two DataItems are identical in the same import data
  // operation, the labels will be combined and if key collision happens in this
  // case, one of the values will be picked randomly. Two DataItems are
  // considered identical if their content bytes are identical (e.g. image bytes
  // or pdf bytes).
  // These labels will be overridden by Annotation labels specified inside index
  // file referenced by
  // [import_schema_uri][google.cloud.aiplatform.v1.ImportDataConfig.import_schema_uri],
  // e.g. jsonl file.
  map<string, string> data_item_labels = 2;

  // Labels that will be applied to newly imported Annotations. If two
  // Annotations are identical, one of them will be deduped. Two Annotations are
  // considered identical if their
  // [payload][google.cloud.aiplatform.v1.Annotation.payload],
  // [payload_schema_uri][google.cloud.aiplatform.v1.Annotation.payload_schema_uri]
  // and all of their [labels][google.cloud.aiplatform.v1.Annotation.labels] are
  // the same. These labels will be overridden by Annotation labels specified
  // inside index file referenced by
  // [import_schema_uri][google.cloud.aiplatform.v1.ImportDataConfig.import_schema_uri],
  // e.g. jsonl file.
  map<string, string> annotation_labels = 3;

  // Required. Points to a YAML file stored on Google Cloud Storage describing
  // the import format. Validation will be done against the schema. The schema
  // is defined as an [OpenAPI 3.0.2 Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  string import_schema_uri = 4 [(google.api.field_behavior) = REQUIRED];
}

// Describes what part of the Dataset is to be exported, the destination of
// the export and how to export.
message ExportDataConfig {
  // ExportUse indicates the usage of the exported files. It restricts file
  // destination, format, annotations to be exported, whether to allow
  // unannotated data to be exported and whether to clone files to temp Cloud
  // Storage bucket.
  enum ExportUse {
    // Regular user export.
    EXPORT_USE_UNSPECIFIED = 0;

    // Export for custom code training.
    CUSTOM_CODE_TRAINING = 6;
  }

  // The destination of the output.
  oneof destination {
    // The Google Cloud Storage location where the output is to be written to.
    // In the given directory a new directory will be created with name:
    // `export-data-<dataset-display-name>-<timestamp-of-export-call>` where
    // timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format. All export
    // output will be written into that directory. Inside that directory,
    // annotations with the same schema will be grouped into sub directories
    // which are named with the corresponding annotations' schema title. Inside
    // these sub directories, a schema.yaml will be created to describe the
    // output format.
    GcsDestination gcs_destination = 1;
  }

  // The instructions how the export data should be split between the
  // training, validation and test sets.
  oneof split {
    // Split based on fractions defining the size of each set.
    ExportFractionSplit fraction_split = 5;

    // Split based on the provided filters for each set.
    ExportFilterSplit filter_split = 7;
  }

  // An expression for filtering what part of the Dataset is to be exported.
  // Only Annotations that match this filter will be exported. The filter syntax
  // is the same as in
  // [ListAnnotations][google.cloud.aiplatform.v1.DatasetService.ListAnnotations].
  string annotations_filter = 2;

  // The ID of a SavedQuery (annotation set) under the Dataset specified by
  // [dataset_id][] used for filtering Annotations for training.
  //
  // Only used for custom training data export use cases.
  // Only applicable to Datasets that have SavedQueries.
  //
  // Only Annotations that are associated with this SavedQuery are used in
  // training. When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [saved_query_id][google.cloud.aiplatform.v1.ExportDataConfig.saved_query_id]
  // and
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter].
  //
  // Only one of
  // [saved_query_id][google.cloud.aiplatform.v1.ExportDataConfig.saved_query_id]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.ExportDataConfig.annotation_schema_uri]
  // should be specified as both of them represent the same thing: problem type.
  string saved_query_id = 11;

  // The Cloud Storage URI that points to a YAML file describing the annotation
  // schema. The schema is defined as an OpenAPI 3.0.2 [Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  // The schema files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/dataset/annotation/, note that the
  // chosen schema must be consistent with
  // [metadata][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri] of the
  // Dataset specified by [dataset_id][].
  //
  // Only used for custom training data export use cases.
  // Only applicable to Datasets that have DataItems and Annotations.
  //
  // Only Annotations that both match this schema and belong to DataItems not
  // ignored by the split method are used in respectively training, validation
  // or test role, depending on the role of the DataItem they are on.
  //
  // When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.ExportDataConfig.annotation_schema_uri].
  string annotation_schema_uri = 12;

  // Indicates the usage of the exported files.
  ExportUse export_use = 4;
}

// Assigns the input data to training, validation, and test sets as per the
// given fractions. Any of `training_fraction`, `validation_fraction` and
// `test_fraction` may optionally be provided; they must sum to at most 1. If
// the provided ones sum to less than 1, the remainder is assigned to sets as
// decided by Vertex AI. If none of the fractions are set, by default roughly
// 80% of data is used for training, 10% for validation, and 10% for test.
message ExportFractionSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;
}

// Assigns input data to training, validation, and test sets based on the given
// filters, data pieces not matched by any filter are ignored. Currently only
// supported for Datasets containing DataItems.
// If any of the filters in this message are to match nothing, then they can be
// set as '-' (the minus sign).
//
// Supported only for unstructured Datasets.
message ExportFilterSplit {
  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to train the Model. A filter with same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to validate the Model. A filter with same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to test the Model. A filter with same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}