// xref: /aosp_15_r20/external/googleapis/google/ai/generativelanguage/v1beta/retriever.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.ai.generativelanguage.v1beta;

// Provides the `(google.api.field_behavior)` field option.
import "google/api/field_behavior.proto";
// Provides the `(google.api.resource)` message option.
import "google/api/resource.proto";
// Provides `google.protobuf.Timestamp`, used for create/update times.
import "google/protobuf/timestamp.proto";

// Go: "<import path>;<package name>".
option go_package = "cloud.google.com/go/ai/generativelanguage/apiv1beta/generativelanguagepb;generativelanguagepb";
// Java: emit one .java file per top-level message instead of nesting them all
// inside the outer class named below.
option java_multiple_files = true;
option java_outer_classname = "RetrieverProto";
option java_package = "com.google.ai.generativelanguage.v1beta";

// A `Corpus` is a collection of `Document`s.
// A project can create up to 5 corpora.
message Corpus {
  // Resource descriptor: a `Corpus` is addressed as `corpora/{corpus}`.
  option (google.api.resource) = {
    type: "generativelanguage.googleapis.com/Corpus"
    pattern: "corpora/{corpus}"
    plural: "corpora"
    singular: "corpus"
  };

  // Immutable. Identifier. The `Corpus` resource name. The ID (name excluding
  // the "corpora/" prefix) can contain up to 40 characters that are lowercase
  // alphanumeric or dashes (-). The ID cannot start or end with a dash. If the
  // name is empty on create, a unique name will be derived from `display_name`
  // along with a 12 character random suffix.
  // Example: `corpora/my-awesome-corpora-123a456b789c`
  string name = 1 [
    (google.api.field_behavior) = IDENTIFIER,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Optional. The human-readable display name for the `Corpus`. The display
  // name must be no more than 512 characters in length, including spaces.
  // Example: "Docs on Semantic Retriever"
  string display_name = 2 [(google.api.field_behavior) = OPTIONAL];

  // Output only. The Timestamp of when the `Corpus` was created.
  google.protobuf.Timestamp create_time = 3
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Timestamp of when the `Corpus` was last updated.
  google.protobuf.Timestamp update_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A `Document` is a collection of `Chunk`s.
// A `Corpus` can have a maximum of 10,000 `Document`s.
message Document {
  // Resource descriptor: a `Document` is a child of a `Corpus` and is
  // addressed as `corpora/{corpus}/documents/{document}`.
  option (google.api.resource) = {
    type: "generativelanguage.googleapis.com/Document"
    pattern: "corpora/{corpus}/documents/{document}"
    plural: "documents"
    singular: "document"
  };

  // Immutable. Identifier. The `Document` resource name. The ID (name excluding
  // the "corpora/*/documents/" prefix) can contain up to 40 characters that are
  // lowercase alphanumeric or dashes (-). The ID cannot start or end with a
  // dash. If the name is empty on create, a unique name will be derived from
  // `display_name` along with a 12 character random suffix.
  // Example: `corpora/{corpus_id}/documents/my-awesome-doc-123a456b789c`
  string name = 1 [
    (google.api.field_behavior) = IDENTIFIER,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Optional. The human-readable display name for the `Document`. The display
  // name must be no more than 512 characters in length, including spaces.
  // Example: "Semantic Retriever Documentation"
  string display_name = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. User provided custom metadata stored as key-value pairs used for
  // querying. A `Document` can have a maximum of 20 `CustomMetadata`.
  repeated CustomMetadata custom_metadata = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Output only. The Timestamp of when the `Document` was last updated.
  //
  // Maintainer note: in this message `update_time` is field 4 and
  // `create_time` is field 5, the reverse of the ordering used by `Corpus`
  // and `Chunk`. Field numbers are the wire contract; do not swap them.
  google.protobuf.Timestamp update_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Timestamp of when the `Document` was created.
  google.protobuf.Timestamp create_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// User provided string values assigned to a single metadata key.
//
// This wrapper exists because a `repeated` field cannot be a direct member of
// a `oneof`; `CustomMetadata.value` holds it as `string_list_value`.
message StringList {
  // The string values of the metadata to store.
  repeated string values = 1;
}

// User provided metadata stored as key-value pairs.
message CustomMetadata {
  // The value stored under `key`. At most one member is set; setting one
  // clears the others.
  //
  // NOTE(review): field numbers 3-5 are not used in this message. If they
  // belonged to removed fields they should be declared `reserved` -- confirm
  // against the file history before reusing them.
  oneof value {
    // The string value of the metadata to store.
    string string_value = 2;

    // The StringList value of the metadata to store.
    StringList string_list_value = 6;

    // The numeric value of the metadata to store.
    // Stored as a single-precision (32-bit) float, so large integers and
    // high-precision decimals may not round-trip exactly.
    float numeric_value = 7;
  }

  // Required. The key of the metadata to store.
  string key = 1 [(google.api.field_behavior) = REQUIRED];
}

// User provided filter to limit retrieval based on `Chunk` or `Document` level
// metadata values.
// Example (genre = drama OR genre = action):
//   key = "document.custom_metadata.genre"
//   conditions = [{string_value = "drama", operation = EQUAL},
//                 {string_value = "action", operation = EQUAL}]
message MetadataFilter {
  // Required. The key of the metadata to filter on.
  string key = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The `Condition`s for the given key that will trigger this filter.
  // Multiple `Condition`s are joined by logical ORs.
  repeated Condition conditions = 2 [(google.api.field_behavior) = REQUIRED];
}

// Filter condition applicable to a single key.
message Condition {
  // Defines the valid operators that can be applied to a key-value pair.
  //
  // Maintainer note: the value names carry no `OPERATOR_` prefix (other than
  // the zero value). They are scoped to `Condition` because the enum is nested
  // here; renaming them would break generated code and the JSON mapping, so
  // they are left as published.
  enum Operator {
    // The default value. This value is unused.
    OPERATOR_UNSPECIFIED = 0;

    // Supported by numeric.
    LESS = 1;

    // Supported by numeric.
    LESS_EQUAL = 2;

    // Supported by numeric & string.
    EQUAL = 3;

    // Supported by numeric.
    GREATER_EQUAL = 4;

    // Supported by numeric.
    GREATER = 5;

    // Supported by numeric & string.
    NOT_EQUAL = 6;

    // Supported by string only when `CustomMetadata` value type for the given
    // key has a `string_list_value`.
    INCLUDES = 7;

    // Supported by string only when `CustomMetadata` value type for the given
    // key has a `string_list_value`.
    EXCLUDES = 8;
  }

  // The value type must be consistent with the value type defined in the field
  // for the corresponding key. If the value types are not consistent, the
  // result will be an empty set. When the `CustomMetadata` has a `StringList`
  // value type, the filtering condition should use `string_value` paired with
  // an INCLUDES/EXCLUDES operation, otherwise the result will also be an empty
  // set.
  //
  // NOTE(review): field numbers 2-4 are not used in this message. If they
  // belonged to removed fields they should be declared `reserved` -- confirm
  // against the file history before reusing them.
  oneof value {
    // The string value to filter the metadata on.
    string string_value = 1;

    // The numeric value to filter the metadata on.
    // Single-precision (32-bit) float, matching `CustomMetadata.numeric_value`.
    float numeric_value = 6;
  }

  // Required. Operator applied to the given key-value pair to trigger the
  // condition.
  Operator operation = 5 [(google.api.field_behavior) = REQUIRED];
}

// A `Chunk` is a subpart of a `Document` that is treated as an independent unit
// for the purposes of vector representation and storage.
// A `Corpus` can have a maximum of 1 million `Chunk`s.
message Chunk {
  // Resource descriptor: a `Chunk` is a child of a `Document` and is addressed
  // as `corpora/{corpus}/documents/{document}/chunks/{chunk}`.
  option (google.api.resource) = {
    type: "generativelanguage.googleapis.com/Chunk"
    pattern: "corpora/{corpus}/documents/{document}/chunks/{chunk}"
    plural: "chunks"
    singular: "chunk"
  };

  // States for the lifecycle of a `Chunk`.
  //
  // NOTE(review): values 3-9 are not used; `STATE_FAILED` is deliberately
  // numbered 10 in the published schema. Presumably the gap leaves room for
  // intermediate states -- confirm before assigning those numbers.
  enum State {
    // The default value. This value is used if the state is omitted.
    STATE_UNSPECIFIED = 0;

    // `Chunk` is being processed (embedding and vector storage).
    STATE_PENDING_PROCESSING = 1;

    // `Chunk` is processed and available for querying.
    STATE_ACTIVE = 2;

    // `Chunk` failed processing.
    STATE_FAILED = 10;
  }

  // Immutable. Identifier. The `Chunk` resource name. The ID (name excluding
  // the "corpora/*/documents/*/chunks/" prefix) can contain up to 40 characters
  // that are lowercase alphanumeric or dashes (-). The ID cannot start or end
  // with a dash. If the name is empty on create, a random 12-character unique
  // ID will be generated.
  // Example: `corpora/{corpus_id}/documents/{document_id}/chunks/123a456b789c`
  string name = 1 [
    (google.api.field_behavior) = IDENTIFIER,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Required. The content for the `Chunk`, such as the text string.
  // The maximum number of tokens per chunk is 2043.
  ChunkData data = 2 [(google.api.field_behavior) = REQUIRED];

  // Optional. User provided custom metadata stored as key-value pairs.
  // The maximum number of `CustomMetadata` per chunk is 20.
  repeated CustomMetadata custom_metadata = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Output only. The Timestamp of when the `Chunk` was created.
  google.protobuf.Timestamp create_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Timestamp of when the `Chunk` was last updated.
  google.protobuf.Timestamp update_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Current state of the `Chunk`.
  State state = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Extracted data that represents the `Chunk` content.
message ChunkData {
  // The content payload. `string_value` is currently the only member; as a
  // oneof member it has explicit presence, so an empty string can be told
  // apart from "not set".
  oneof data {
    // The `Chunk` content as a string.
    // The maximum number of tokens per chunk is 2043.
    string string_value = 1;
  }
}
