xref: /aosp_15_r20/external/googleapis/google/cloud/aiplatform/v1/dataset.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
1// Copyright 2023 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15syntax = "proto3";
16
17package google.cloud.aiplatform.v1;
18
19import "google/api/field_behavior.proto";
20import "google/api/resource.proto";
21import "google/cloud/aiplatform/v1/encryption_spec.proto";
22import "google/cloud/aiplatform/v1/io.proto";
23import "google/cloud/aiplatform/v1/saved_query.proto";
24import "google/protobuf/struct.proto";
25import "google/protobuf/timestamp.proto";
26
27option csharp_namespace = "Google.Cloud.AIPlatform.V1";
28option go_package = "cloud.google.com/go/aiplatform/apiv1/aiplatformpb;aiplatformpb";
29option java_multiple_files = true;
30option java_outer_classname = "DatasetProto";
31option java_package = "com.google.cloud.aiplatform.v1";
32option php_namespace = "Google\\Cloud\\AIPlatform\\V1";
33option ruby_package = "Google::Cloud::AIPlatform::V1";
34
// A collection of DataItems and Annotations on them.
message Dataset {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/Dataset"
    pattern: "projects/{project}/locations/{location}/datasets/{dataset}"
  };

  // Output only. The resource name of the Dataset.
  // Format: `projects/{project}/locations/{location}/datasets/{dataset}`
  // (see the resource pattern above).
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The user-defined name of the Dataset.
  // The name can be up to 128 characters long and can consist of any UTF-8
  // characters.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // The description of the Dataset.
  string description = 16;

  // Required. Points to a YAML file stored on Google Cloud Storage describing
  // additional information about the Dataset. The schema is defined as an
  // OpenAPI 3.0.2 Schema Object. The schema files that can be used here are
  // found in gs://google-cloud-aiplatform/schema/dataset/metadata/.
  string metadata_schema_uri = 3 [(google.api.field_behavior) = REQUIRED];

  // Required. Additional information about the Dataset.
  // NOTE(review): presumably this value conforms to the schema referenced by
  // `metadata_schema_uri` — confirm against the service documentation.
  google.protobuf.Value metadata = 8 [(google.api.field_behavior) = REQUIRED];

  // Output only. The number of DataItems in this Dataset. Only apply for
  // non-structured Dataset.
  int64 data_item_count = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Timestamp when this Dataset was created.
  google.protobuf.Timestamp create_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Timestamp when this Dataset was last updated.
  google.protobuf.Timestamp update_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Used to perform consistent read-modify-write updates. If not set, a blind
  // "overwrite" update happens.
  string etag = 6;

  // The labels with user-defined metadata to organize your Datasets.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  // No more than 64 user labels can be associated with one Dataset (System
  // labels are excluded).
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  // System reserved label keys are prefixed with "aiplatform.googleapis.com/"
  // and are immutable. Following system labels exist for each Dataset:
  //
  // * "aiplatform.googleapis.com/dataset_metadata_schema": output only, its
  //   value is the
  //   [metadata_schema's][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri]
  //   title.
  map<string, string> labels = 7;

  // All SavedQueries belong to the Dataset will be returned in List/Get
  // Dataset response. The annotation_specs field
  // will not be populated except for UI cases which will only use
  // [annotation_spec_count][google.cloud.aiplatform.v1.SavedQuery.annotation_spec_count].
  // In CreateDataset request, a SavedQuery is created together if
  // this field is set, up to one SavedQuery can be set in CreateDatasetRequest.
  // The SavedQuery should not contain any AnnotationSpec.
  repeated SavedQuery saved_queries = 9;

  // Customer-managed encryption key spec for a Dataset. If set, this Dataset
  // and all sub-resources of this Dataset will be secured by this key.
  EncryptionSpec encryption_spec = 11;

  // Output only. The resource name of the Artifact that was created in
  // MetadataStore when creating the Dataset. The Artifact resource name pattern
  // is
  // `projects/{project}/locations/{location}/metadataStores/{metadata_store}/artifacts/{artifact}`.
  //
  // NOTE(review): field numbers 12-15 are not defined in this message —
  // presumably used internally or reserved for future fields; do not reuse.
  string metadata_artifact = 17 [(google.api.field_behavior) = OUTPUT_ONLY];
}
115
// Describes the location from where we import data into a Dataset, together
// with the labels that will be applied to the DataItems and the Annotations.
message ImportDataConfig {
  // The source of the input. Exactly one source variant may be set.
  // (Currently Cloud Storage is the only supported source in this version.)
  oneof source {
    // The Google Cloud Storage location for the input content.
    GcsSource gcs_source = 1;
  }

  // Labels that will be applied to newly imported DataItems. If an identical
  // DataItem as one being imported already exists in the Dataset, then these
  // labels will be appended to these of the already existing one, and if labels
  // with identical key is imported before, the old label value will be
  // overwritten. If two DataItems are identical in the same import data
  // operation, the labels will be combined and if key collision happens in this
  // case, one of the values will be picked randomly. Two DataItems are
  // considered identical if their content bytes are identical (e.g. image bytes
  // or pdf bytes).
  // These labels will be overridden by Annotation labels specified inside index
  // file referenced by
  // [import_schema_uri][google.cloud.aiplatform.v1.ImportDataConfig.import_schema_uri],
  // e.g. jsonl file.
  map<string, string> data_item_labels = 2;

  // Labels that will be applied to newly imported Annotations. If two
  // Annotations are identical, one of them will be deduped. Two Annotations are
  // considered identical if their
  // [payload][google.cloud.aiplatform.v1.Annotation.payload],
  // [payload_schema_uri][google.cloud.aiplatform.v1.Annotation.payload_schema_uri]
  // and all of their [labels][google.cloud.aiplatform.v1.Annotation.labels] are
  // the same. These labels will be overridden by Annotation labels specified
  // inside index file referenced by
  // [import_schema_uri][google.cloud.aiplatform.v1.ImportDataConfig.import_schema_uri],
  // e.g. jsonl file.
  map<string, string> annotation_labels = 3;

  // Required. Points to a YAML file stored on Google Cloud Storage describing
  // the import format. Validation will be done against the schema. The schema
  // is defined as an [OpenAPI 3.0.2 Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  string import_schema_uri = 4 [(google.api.field_behavior) = REQUIRED];
}
158
// Describes what part of the Dataset is to be exported, the destination of
// the export and how to export.
message ExportDataConfig {
  // ExportUse indicates the usage of the exported files. It restricts file
  // destination, format, annotations to be exported, whether to allow
  // unannotated data to be exported and whether to clone files to temp Cloud
  // Storage bucket.
  //
  // NOTE(review): values 1-5 are not defined in this file — presumably used
  // by other export surfaces or reserved; do not reuse those numbers. Also
  // note CUSTOM_CODE_TRAINING lacks the conventional EXPORT_USE_ prefix, but
  // renaming a published enum value would break generated code.
  enum ExportUse {
    // Regular user export.
    EXPORT_USE_UNSPECIFIED = 0;

    // Export for custom code training.
    CUSTOM_CODE_TRAINING = 6;
  }

  // The destination of the output. Exactly one destination variant may be
  // set. (Currently Cloud Storage is the only supported destination in this
  // version.)
  oneof destination {
    // The Google Cloud Storage location where the output is to be written to.
    // In the given directory a new directory will be created with name:
    // `export-data-<dataset-display-name>-<timestamp-of-export-call>` where
    // timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format. All export
    // output will be written into that directory. Inside that directory,
    // annotations with the same schema will be grouped into sub directories
    // which are named with the corresponding annotations' schema title. Inside
    // these sub directories, a schema.yaml will be created to describe the
    // output format.
    GcsDestination gcs_destination = 1;
  }

  // The instructions how the export data should be split between the
  // training, validation and test sets. At most one split strategy may be
  // set.
  oneof split {
    // Split based on fractions defining the size of each set.
    ExportFractionSplit fraction_split = 5;

    // Split based on the provided filters for each set.
    ExportFilterSplit filter_split = 7;
  }

  // An expression for filtering what part of the Dataset is to be exported.
  // Only Annotations that match this filter will be exported. The filter syntax
  // is the same as in
  // [ListAnnotations][google.cloud.aiplatform.v1.DatasetService.ListAnnotations].
  string annotations_filter = 2;

  // The ID of a SavedQuery (annotation set) under the Dataset specified by
  // [dataset_id][] used for filtering Annotations for training.
  //
  // Only used for custom training data export use cases.
  // Only applicable to Datasets that have SavedQueries.
  //
  // Only Annotations that are associated with this SavedQuery are used in
  // respectively training. When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [saved_query_id][google.cloud.aiplatform.v1.ExportDataConfig.saved_query_id]
  // and
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter].
  //
  // Only one of
  // [saved_query_id][google.cloud.aiplatform.v1.ExportDataConfig.saved_query_id]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.ExportDataConfig.annotation_schema_uri]
  // should be specified as both of them represent the same thing: problem type.
  string saved_query_id = 11;

  // The Cloud Storage URI that points to a YAML file describing the annotation
  // schema. The schema is defined as an OpenAPI 3.0.2 [Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  // The schema files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/dataset/annotation/, note that the
  // chosen schema must be consistent with
  // [metadata][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri] of the
  // Dataset specified by [dataset_id][].
  //
  // Only used for custom training data export use cases.
  // Only applicable to Datasets that have DataItems and Annotations.
  //
  // Only Annotations that both match this schema and belong to DataItems not
  // ignored by the split method are used in respectively training, validation
  // or test role, depending on the role of the DataItem they are on.
  //
  // When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.ExportDataConfig.annotation_schema_uri].
  string annotation_schema_uri = 12;

  // Indicates the usage of the exported files.
  // Defaults to EXPORT_USE_UNSPECIFIED (regular user export) when unset.
  ExportUse export_use = 4;
}
252
// Assigns the input data to training, validation, and test sets as per the
// given fractions. Any of `training_fraction`, `validation_fraction` and
// `test_fraction` may optionally be provided, they must sum to up to 1. If the
// provided ones sum to less than 1, the remainder is assigned to sets as
// decided by Vertex AI. If none of the fractions are set, by default roughly
// 80% of data is used for training, 10% for validation, and 10% for test.
//
// NOTE(review): each fraction is presumably in [0, 1] — the service is
// expected to validate this; confirm against the API documentation.
message ExportFractionSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;
}
269
// Assigns input data to training, validation, and test sets based on the given
// filters, data pieces not matched by any filter are ignored. Currently only
// supported for Datasets containing DataItems.
// If any of the filters in this message are to match nothing, then they can be
// set as '-' (the minus sign).
//
// Supported only for unstructured Datasets.
//
// Precedence: a DataItem matched by several filters is assigned to the first
// matching set in the order training, validation, test (as each field's
// documentation below states).
message ExportFilterSplit {
  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to train the Model. A filter with same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to validate the Model. A filter with same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to test the Model. A filter with same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}
305