// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.ai.generativelanguage.v1beta;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/ai/generativelanguage/apiv1beta/generativelanguagepb;generativelanguagepb";
option java_multiple_files = true;
option java_outer_classname = "RetrieverProto";
option java_package = "com.google.ai.generativelanguage.v1beta";

// A `Corpus` is a collection of `Document`s.
// A project can create up to 5 corpora.
message Corpus {
  option (google.api.resource) = {
    type: "generativelanguage.googleapis.com/Corpus"
    pattern: "corpora/{corpus}"
    plural: "corpora"
    singular: "corpus"
  };

  // Immutable. Identifier. The `Corpus` resource name. The ID (name excluding
  // the "corpora/" prefix) can contain up to 40 characters that are lowercase
  // alphanumeric or dashes (-). The ID cannot start or end with a dash. If the
  // name is empty on create, a unique name will be derived from `display_name`
  // along with a 12 character random suffix.
  // Example: `corpora/my-awesome-corpora-123a456b789c`
  string name = 1 [
    (google.api.field_behavior) = IDENTIFIER,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Optional. The human-readable display name for the `Corpus`. The display
  // name must be no more than 512 characters in length, including spaces.
  // Example: "Docs on Semantic Retriever"
  string display_name = 2 [(google.api.field_behavior) = OPTIONAL];

  // Output only. The Timestamp of when the `Corpus` was created.
  google.protobuf.Timestamp create_time = 3
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Timestamp of when the `Corpus` was last updated.
  google.protobuf.Timestamp update_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A `Document` is a collection of `Chunk`s.
// A `Corpus` can have a maximum of 10,000 `Document`s.
message Document {
  option (google.api.resource) = {
    type: "generativelanguage.googleapis.com/Document"
    pattern: "corpora/{corpus}/documents/{document}"
    plural: "documents"
    singular: "document"
  };

  // Immutable. Identifier. The `Document` resource name. The ID (name excluding
  // the "corpora/*/documents/" prefix) can contain up to 40 characters that are
  // lowercase alphanumeric or dashes (-). The ID cannot start or end with a
  // dash. If the name is empty on create, a unique name will be derived from
  // `display_name` along with a 12 character random suffix.
  // Example: `corpora/{corpus_id}/documents/my-awesome-doc-123a456b789c`
  string name = 1 [
    (google.api.field_behavior) = IDENTIFIER,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Optional. The human-readable display name for the `Document`. The display
  // name must be no more than 512 characters in length, including spaces.
  // Example: "Semantic Retriever Documentation"
  string display_name = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. User provided custom metadata stored as key-value pairs used for
  // querying. A `Document` can have a maximum of 20 `CustomMetadata`.
  repeated CustomMetadata custom_metadata = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Output only. The Timestamp of when the `Document` was last updated.
  google.protobuf.Timestamp update_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Timestamp of when the `Document` was created.
  google.protobuf.Timestamp create_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// User provided string values assigned to a single metadata key.
message StringList {
  // The string values of the metadata to store.
  repeated string values = 1;
}

// User provided metadata stored as key-value pairs.
message CustomMetadata {
  // The value stored under `key`. The members below are mutually exclusive.
  oneof value {
    // The string value of the metadata to store.
    string string_value = 2;

    // The StringList value of the metadata to store.
    StringList string_list_value = 6;

    // The numeric value of the metadata to store.
    float numeric_value = 7;
  }

  // Required. The key of the metadata to store.
  string key = 1 [(google.api.field_behavior) = REQUIRED];
}

// User provided filter to limit retrieval based on `Chunk` or `Document` level
// metadata values.
// Example (genre = drama OR genre = action):
//   key = "document.custom_metadata.genre"
//   conditions = [{string_value = "drama", operation = EQUAL},
//                 {string_value = "action", operation = EQUAL}]
message MetadataFilter {
  // Required. The key of the metadata to filter on.
  string key = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The `Condition`s for the given key that will trigger this filter.
  // Multiple `Condition`s are joined by logical ORs.
  repeated Condition conditions = 2 [(google.api.field_behavior) = REQUIRED];
}

// Filter condition applicable to a single key.
message Condition {
  // Defines the valid operators that can be applied to a key-value pair.
  enum Operator {
    // The default value. This value is unused.
    OPERATOR_UNSPECIFIED = 0;

    // Supported by numeric.
    LESS = 1;

    // Supported by numeric.
    LESS_EQUAL = 2;

    // Supported by numeric & string.
    EQUAL = 3;

    // Supported by numeric.
    GREATER_EQUAL = 4;

    // Supported by numeric.
    GREATER = 5;

    // Supported by numeric & string.
    NOT_EQUAL = 6;

    // Supported by string only when `CustomMetadata` value type for the given
    // key has a `string_list_value`.
    INCLUDES = 7;

    // Supported by string only when `CustomMetadata` value type for the given
    // key has a `string_list_value`.
    EXCLUDES = 8;
  }

  // The value type must be consistent with the value type defined in the field
  // for the corresponding key. If the value types are not consistent, the
  // result will be an empty set. When the `CustomMetadata` has a `StringList`
  // value type, the filtering condition should use `string_value` paired with
  // an INCLUDES/EXCLUDES operation, otherwise the result will also be an empty
  // set.
  oneof value {
    // The string value to filter the metadata on.
    string string_value = 1;

    // The numeric value to filter the metadata on.
    float numeric_value = 6;
  }

  // Required. Operator applied to the given key-value pair to trigger the
  // condition.
  Operator operation = 5 [(google.api.field_behavior) = REQUIRED];
}

// A `Chunk` is a subpart of a `Document` that is treated as an independent unit
// for the purposes of vector representation and storage.
// A `Corpus` can have a maximum of 1 million `Chunk`s.
message Chunk {
  option (google.api.resource) = {
    type: "generativelanguage.googleapis.com/Chunk"
    pattern: "corpora/{corpus}/documents/{document}/chunks/{chunk}"
    plural: "chunks"
    singular: "chunk"
  };

  // States for the lifecycle of a `Chunk`.
  enum State {
    // The default value. This value is used if the state is omitted.
    STATE_UNSPECIFIED = 0;

    // `Chunk` is being processed (embedding and vector storage).
    STATE_PENDING_PROCESSING = 1;

    // `Chunk` is processed and available for querying.
    STATE_ACTIVE = 2;

    // `Chunk` failed processing.
    STATE_FAILED = 10;
  }

  // Immutable. Identifier. The `Chunk` resource name. The ID (name excluding
  // the "corpora/*/documents/*/chunks/" prefix) can contain up to 40 characters
  // that are lowercase alphanumeric or dashes (-). The ID cannot start or end
  // with a dash. If the name is empty on create, a random 12-character unique
  // ID will be generated.
  // Example: `corpora/{corpus_id}/documents/{document_id}/chunks/123a456b789c`
  string name = 1 [
    (google.api.field_behavior) = IDENTIFIER,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Required. The content for the `Chunk`, such as the text string.
  // The maximum number of tokens per chunk is 2043.
  ChunkData data = 2 [(google.api.field_behavior) = REQUIRED];

  // Optional. User provided custom metadata stored as key-value pairs.
  // The maximum number of `CustomMetadata` per chunk is 20.
  repeated CustomMetadata custom_metadata = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Output only. The Timestamp of when the `Chunk` was created.
  google.protobuf.Timestamp create_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Timestamp of when the `Chunk` was last updated.
  google.protobuf.Timestamp update_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Current state of the `Chunk`.
  State state = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Extracted data that represents the `Chunk` content.
message ChunkData {
  // The representation of the `Chunk` content. A string is the only form
  // defined here.
  oneof data {
    // The `Chunk` content as a string.
    // The maximum number of tokens per chunk is 2043.
    string string_value = 1;
  }
}