// xref: /aosp_15_r20/external/googleapis/google/assistant/embedded/v1alpha1/embedded_assistant.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.assistant.embedded.v1alpha1;

// NOTE(review): no `google.api` annotation is referenced by the definitions
// visible in this file, so protoc may report this import as unused. It is
// kept so the file's dependency list stays unchanged.
import "google/api/annotations.proto";
// Provides `google.rpc.Status`, used by `ConverseResponse.error`.
import "google/rpc/status.proto";

// Language-specific code generation options.
option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha1;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha1";

// Service that implements the Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded assistant service.
  // Each call performs one round-trip, sending an audio request to the service
  // and receiving the audio response. Uses bidirectional streaming to receive
  // results, such as the `END_OF_UTTERANCE` event, while sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of several
  // streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the assistant
  // responds *What do you want to add?*. The sequence of streamed requests and
  // responses in the first gRPC connection could be:
  //
  // *   ConverseRequest.config
  // *   ConverseRequest.audio_in
  // *   ConverseRequest.audio_in
  // *   ConverseRequest.audio_in
  // *   ConverseRequest.audio_in
  // *   ConverseResponse.event_type.END_OF_UTTERANCE
  // *   ConverseResponse.result.microphone_mode.DIALOG_FOLLOW_ON
  // *   ConverseResponse.audio_out
  // *   ConverseResponse.audio_out
  // *   ConverseResponse.audio_out
  //
  // The user then says *bagels* and the assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another gRPC
  // connection call to the `Converse` method, again with streamed requests and
  // responses, such as:
  //
  // *   ConverseRequest.config
  // *   ConverseRequest.audio_in
  // *   ConverseRequest.audio_in
  // *   ConverseRequest.audio_in
  // *   ConverseResponse.event_type.END_OF_UTTERANCE
  // *   ConverseResponse.result.microphone_mode.CLOSE_MICROPHONE
  // *   ConverseResponse.audio_out
  // *   ConverseResponse.audio_out
  // *   ConverseResponse.audio_out
  // *   ConverseResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // ConverseResponse.audio_out messages will always contain sequential portions
  // of audio.
  rpc Converse(stream ConverseRequest) returns (stream ConverseResponse);
}

// Specifies how to process the `ConverseRequest` messages. Sent as the
// `config` field of the first `ConverseRequest` of a `Converse` call.
message ConverseConfig {
  // *Required* Specifies how to process the subsequent incoming audio.
  AudioInConfig audio_in_config = 1;

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Required* Represents the current dialog state.
  ConverseState converse_state = 3;
}

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant SDK
// [best
// practices](https://developers.google.com/assistant/sdk/develop/grpc/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono). The only language supported is "en-US".
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
    // Codec) is the recommended encoding because it is
    // lossless--therefore recognition is not compromised--and
    // requires only about half the bandwidth of `LINEAR16`. This encoding
    // includes the `FLAC` stream header followed by audio data. It supports
    // 16-bit and 24-bit samples, however, not all fields in `STREAMINFO` are
    // supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all `audio_in`
  // messages. Valid values are from 16000-24000, but 16000 is optimal.
  // For best results, set the sampling rate of the audio source to 16000 Hz.
  // If that's not possible, use the native sample rate of the audio source
  // (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings are
  // raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an ogg container. The result will be a
    // file which can be played natively on Android and in some browsers (such
    // as Chrome). The quality of the encoding is considerably higher than MP3
    // while using the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are: 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}

// Provides information about the current dialog state.
message ConverseState {
  // *Required* when continuing a conversation: the `conversation_state` value
  // returned in the `ConverseResult` of the prior `ConverseResponse`. Omit (do
  // not set the field) if there was no prior `ConverseResponse`. If there was
  // a prior `ConverseResponse`, do not omit this field; doing so will end that
  // conversation (and this new request will start a new conversation).
  bytes conversation_state = 1;
}

// The audio containing the assistant's response to the query. Sequential chunks
// of audio data are received in sequential `ConverseResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `ConverseResponse` messages.
  bytes audio_data = 1;
}

// The semantic result for the user's spoken query.
message ConverseResult {
  // Possible states of the microphone after a `Converse` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Converse` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* The recognized transcript of what the user said.
  string spoken_request_text = 1;

  // *Output-only* The text of the assistant's spoken response. This is only
  // returned for an IFTTT action.
  string spoken_response_text = 2;

  // *Output-only* State information for subsequent `ConverseRequest`. This
  // value should be saved in the client and returned in
  // `ConverseState.conversation_state` with the next `ConverseRequest`. (The
  // client does not need to interpret or otherwise use this value.) There is
  // no need to save this information across device restarts.
  bytes conversation_state = 3;

  // *Output-only* Specifies the mode of the microphone after this `Converse`
  // RPC is processed.
  MicrophoneMode microphone_mode = 4;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as "Increase the volume"
  // or "Set volume level 4" was recognized, in which case the value will be
  // between 1 and 100 (corresponding to the new volume level of 1% to 100%).
  // Typically, a client should use this volume level when playing the
  // `audio_out` data, and retain this value as the current volume level and
  // supply it in the `AudioOutConfig` of the next `ConverseRequest`. (Some
  // clients may also implement other ways to allow the current volume level to
  // be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 5;
}

// The top-level message sent by the client. Clients must send at least two, and
// typically numerous `ConverseRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message ConverseRequest {
  // Exactly one of these fields must be specified in each `ConverseRequest`.
  oneof converse_request {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `ConverseRequest` message must contain a `config` message.
    ConverseConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are sent
    // in sequential `ConverseRequest` messages. The first `ConverseRequest`
    // message must not contain `audio_in` data and all subsequent
    // `ConverseRequest` messages must contain `audio_in` data. The audio bytes
    // must be encoded as specified in `AudioInConfig`.
    // Audio must be sent at approximately real-time (16000 samples per second).
    // An error will be returned if audio is sent significantly faster or
    // slower.
    bytes audio_in = 2;
  }
}

// The top-level message received by the client. A series of one or more
// `ConverseResponse` messages is streamed back to the client.
message ConverseResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional results
    // until the server closes the gRPC connection.
    END_OF_UTTERANCE = 1;
  }

  // Field number 4 is skipped by the fields below. It is reserved so that a
  // future field cannot silently take a number that may already have a meaning
  // on the wire.
  // NOTE(review): confirm whether 4 was ever assigned before relying on this.
  reserved 4;

  // Exactly one of these fields will be populated in each `ConverseResponse`.
  oneof converse_response {
    // *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status]
    // message that specifies the error for the operation. If an error occurs
    // during processing, this message will be set and there will be no further
    // messages sent.
    google.rpc.Status error = 1;

    // *Output-only* Indicates the type of event.
    EventType event_type = 2;

    // *Output-only* The audio containing the assistant's response to the query.
    AudioOut audio_out = 3;

    // *Output-only* The semantic result for the user's spoken query.
    ConverseResult result = 5;
  }
}
