// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.assistant.embedded.v1alpha1;

import "google/api/annotations.proto";
import "google/rpc/status.proto";

option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha1;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha1";

// Service that implements the Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded assistant service.
  // Each call performs one round-trip, sending an audio request to the service
  // and receiving the audio response. Uses bidirectional streaming to receive
  // results, such as the `END_OF_UTTERANCE` event, while sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of several
  // streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the assistant
  // responds *What do you want to add?*. The sequence of streamed requests and
  // responses in the first gRPC call could be:
  //
  // * ConverseRequest.config
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseResponse.event_type.END_OF_UTTERANCE
  // * ConverseResponse.result.microphone_mode.DIALOG_FOLLOW_ON
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  //
  // The user then says *bagels* and the assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another
  // gRPC call to the `Converse` method, again with streamed requests and
  // responses, such as:
  //
  // * ConverseRequest.config
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseResponse.event_type.END_OF_UTTERANCE
  // * ConverseResponse.result.microphone_mode.CLOSE_MICROPHONE
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // ConverseResponse.audio_out messages will always contain sequential
  // portions of audio. (A sketch of this call sequence in client code follows
  // the service definition below.)
  rpc Converse(stream ConverseRequest) returns (stream ConverseResponse);
}
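// A minimal sketch of the round-trip choreography above, written in Go
// against stubs assumed to be generated from this file (per the `go_package`
// option). It assumes imports of `context`, `io`, and the generated package
// aliased as `embedded`; `conn` (an authenticated *grpc.ClientConn),
// `configRequest` (built as shown after `ConverseState` below), and
// `micChunks` (a channel of LINEAR16 byte slices paced at real time) are
// hypothetical and not defined here:
//
//     client := embedded.NewEmbeddedAssistantClient(conn)
//     stream, err := client.Converse(ctx)
//     if err != nil { /* handle */ }
//
//     // First request: `config` only, never `audio_in`.
//     if err := stream.Send(configRequest); err != nil { /* handle */ }
//
//     go func() {
//       // All subsequent requests: `audio_in` chunks.
//       for chunk := range micChunks {
//         stream.Send(&embedded.ConverseRequest{
//           ConverseRequest: &embedded.ConverseRequest_AudioIn{AudioIn: chunk},
//         })
//       }
//       stream.CloseSend() // half-close once capture stops
//     }()
//
//     for {
//       resp, err := stream.Recv()
//       if err == io.EOF { break } // server has closed the stream
//       if err != nil { /* handle */ }
//       if resp.GetEventType() == embedded.ConverseResponse_END_OF_UTTERANCE {
//         // Stop capturing; closing micChunks triggers CloseSend above.
//       }
//       if out := resp.GetAudioOut(); out != nil {
//         // Play out.AudioData; chunks arrive in playback order.
//       }
//     }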
// Specifies how to process the `ConverseRequest` messages.
message ConverseConfig {
  // *Required* Specifies how to process the subsequent incoming audio.
  AudioInConfig audio_in_config = 1;

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Required* Represents the current dialog state.
  ConverseState converse_state = 3;
}

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant SDK
// [best
// practices](https://developers.google.com/assistant/sdk/develop/grpc/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono). The only language supported is "en-US".
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
    // Codec) is the recommended encoding because it is lossless (so
    // recognition is not compromised) and requires only about half the
    // bandwidth of `LINEAR16`. This encoding includes the `FLAC` stream
    // header followed by audio data. It supports 16-bit and 24-bit samples;
    // however, not all fields in `STREAMINFO` are supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all `audio_in`
  // messages. Valid values are 16000-24000; 16000 is optimal.
  // For best results, set the sampling rate of the audio source to 16000 Hz.
  // If that's not possible, use the native sample rate of the audio source
  // (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings
  // are raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an Ogg container. The result is a file
    // that can be played natively on Android and in some browsers (such as
    // Chrome). The quality of the encoding is considerably higher than MP3 at
    // the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}

// Provides information about the current dialog state.
message ConverseState {
  // *Required* The `conversation_state` value returned in the prior
  // `ConverseResponse`. Omit (do not set the field) if there was no prior
  // `ConverseResponse`. If there was a prior `ConverseResponse`, do not omit
  // this field; doing so will end that conversation (and this new request will
  // start a new conversation).
  bytes conversation_state = 1;
}
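// For illustration, the `ConverseConfig` for one round-trip might be built
// like this in Go (a sketch with the same assumed generated stubs;
// `savedState` is a hypothetical []byte holding the prior response's
// `conversation_state`, nil for a brand-new conversation):
//
//     config := &embedded.ConverseConfig{
//       AudioInConfig: &embedded.AudioInConfig{
//         Encoding:        embedded.AudioInConfig_LINEAR16,
//         SampleRateHertz: 16000, // optimal value per AudioInConfig above
//       },
//       AudioOutConfig: &embedded.AudioOutConfig{
//         Encoding:         embedded.AudioOutConfig_MP3,
//         SampleRateHertz:  16000,
//         VolumePercentage: 50, // the device's current output volume
//       },
//       ConverseState: &embedded.ConverseState{ConversationState: savedState},
//     }
//     configRequest := &embedded.ConverseRequest{
//       ConverseRequest: &embedded.ConverseRequest_Config{Config: config},
//     }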
// The audio containing the assistant's response to the query. Sequential
// chunks of audio data are received in sequential `ConverseResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `ConverseResponse` messages.
  bytes audio_data = 1;
}

// The semantic result for the user's spoken query.
message ConverseResult {
  // Possible states of the microphone after a `Converse` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Converse` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* The recognized transcript of what the user said.
  string spoken_request_text = 1;

  // *Output-only* The text of the assistant's spoken response. This is only
  // returned for an IFTTT action.
  string spoken_response_text = 2;

  // *Output-only* State information for the subsequent `ConverseRequest`. This
  // value should be saved in the client and returned in the
  // `conversation_state` with the next `ConverseRequest`. (The client does not
  // need to interpret or otherwise use this value.) There is no need to save
  // this information across device restarts.
  bytes conversation_state = 3;

  // *Output-only* Specifies the mode of the microphone after this `Converse`
  // RPC is processed.
  MicrophoneMode microphone_mode = 4;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as "Increase the
  // volume" or "Set volume level 4" was recognized, in which case the value
  // will be between 1 and 100 (corresponding to the new volume level of 1% to
  // 100%). Typically, a client should use this volume level when playing the
  // `audio_out` data, retain this value as the current volume level, and
  // supply it in the `AudioOutConfig` of the next `ConverseRequest`. (Some
  // clients may also implement other ways to allow the current volume level to
  // be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 5;
}
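// A sketch of how a client might consume a `ConverseResult`, with the same
// assumed Go stubs; `savedState` and `volume` are hypothetical client-side
// variables and `resp` is a received *embedded.ConverseResponse:
//
//     if result := resp.GetResult(); result != nil {
//       // Echo this opaque state back in the next ConverseState to continue
//       // the conversation rather than start a new one.
//       savedState = result.ConversationState
//
//       // 0 means "no change"; 1-100 is a new volume level to adopt and to
//       // report in the next AudioOutConfig.volume_percentage.
//       if result.VolumePercentage != 0 {
//         volume = result.VolumePercentage
//       }
//
//       // Decide what the microphone does after playback finishes.
//       switch result.MicrophoneMode {
//       case embedded.ConverseResult_DIALOG_FOLLOW_ON:
//         // Start a new Converse RPC when audio_out playback completes.
//       case embedded.ConverseResult_CLOSE_MICROPHONE:
//         // Keep the microphone off until the user re-activates it.
//       }
//     }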
// The top-level message sent by the client. Clients must send at least two,
// and typically many, `ConverseRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message ConverseRequest {
  // Exactly one of these fields must be specified in each `ConverseRequest`.
  oneof converse_request {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `ConverseRequest` message must contain a `config` message.
    ConverseConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `ConverseRequest` messages. The first
    // `ConverseRequest` message must not contain `audio_in` data and all
    // subsequent `ConverseRequest` messages must contain `audio_in` data. The
    // audio bytes must be encoded as specified in `AudioInConfig`.
    // Audio must be sent in approximately real time (16000 samples per
    // second). An error will be returned if audio is sent significantly
    // faster or slower.
    bytes audio_in = 2;
  }
}

// The top-level message received by the client. A series of one or more
// `ConverseResponse` messages is streamed back to the client.
message ConverseResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional
    // results until the server closes the gRPC connection.
    END_OF_UTTERANCE = 1;
  }

  // Exactly one of these fields will be populated in each `ConverseResponse`.
  oneof converse_response {
    // *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status]
    // message that specifies the error for the operation. If an error occurs
    // during processing, this message will be set and there will be no further
    // messages sent.
    google.rpc.Status error = 1;

    // *Output-only* Indicates the type of event.
    EventType event_type = 2;

    // *Output-only* The audio containing the assistant's response to the
    // query.
    AudioOut audio_out = 3;

    // *Output-only* The semantic result for the user's spoken query.
    ConverseResult result = 5;
  }
}
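// The real-time pacing requirement on `audio_in` can be made concrete: at
// 16000 Hz, 16-bit LINEAR16 mono audio is 32000 bytes per second, so a client
// might send one 3200-byte chunk every 100 ms. A sketch with the same assumed
// Go stubs (`mic.Read` is a hypothetical capture function filling `buf`, and
// the chunk size is an illustrative choice, not an API requirement):
//
//     buf := make([]byte, 3200) // 100 ms of 16 kHz 16-bit mono audio
//     ticker := time.NewTicker(100 * time.Millisecond)
//     defer ticker.Stop()
//     for range ticker.C {
//       n, err := mic.Read(buf)
//       if err != nil { break }
//       stream.Send(&embedded.ConverseRequest{
//         ConverseRequest: &embedded.ConverseRequest_AudioIn{AudioIn: buf[:n]},
//       })
//     }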