// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dialogflow.v2beta1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/protobuf/duration.proto";

option cc_enable_arenas = true;
option csharp_namespace = "Google.Cloud.Dialogflow.V2Beta1";
option go_package = "cloud.google.com/go/dialogflow/apiv2beta1/dialogflowpb;dialogflowpb";
option java_multiple_files = true;
option java_outer_classname = "AudioConfigProto";
option java_package = "com.google.cloud.dialogflow.v2beta1";
option objc_class_prefix = "DF";
option (google.api.resource_definition) = {
  type: "automl.googleapis.com/Model"
  pattern: "projects/{project}/locations/{location}/models/{model}"
};
option (google.api.resource_definition) = {
  type: "speech.googleapis.com/PhraseSet"
  pattern: "projects/{project}/locations/{location}/phraseSets/{phrase_set}"
};

// Hints for the speech recognizer to help with recognition in a specific
// conversation state.
message SpeechContext {
  // Optional. A list of strings containing words and phrases that the speech
  // recognizer should recognize with higher likelihood.
  //
  // This list can be used to:
  //
  // * improve accuracy for words and phrases you expect the user to say,
  //   e.g. typical commands for your Dialogflow agent
  // * add additional words to the speech recognizer vocabulary
  // * ...
  //
  // See the [Cloud Speech
  // documentation](https://cloud.google.com/speech-to-text/quotas) for usage
  // limits.
  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Boost for this context compared to other contexts:
  //
  // * If the boost is positive, Dialogflow will increase the probability that
  //   the phrases in this context are recognized over similar sounding phrases.
  // * If the boost is unspecified or non-positive, Dialogflow will not apply
  //   any boost.
  //
  // Dialogflow recommends that you use boosts in the range (0, 20] and that
  // you find a value that fits your use case with binary search.
  float boost = 2 [(google.api.field_behavior) = OPTIONAL];
}
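
// As a starting point for the binary-search tuning suggested above, a
// `SpeechContext` can be written in textproto form like this (the phrases
// and the midpoint boost value are illustrative, not defaults):
//
// ```
// phrases: "book a table for two"
// phrases: "cancel my reservation"
// boost: 10.0
// ```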

// Information for a word recognized by the speech recognizer.
message SpeechWordInfo {
  // The word this info is for.
  string word = 3;

  // Time offset relative to the beginning of the audio that corresponds to the
  // start of the spoken word. This is an experimental feature and the accuracy
  // of the time offset can vary.
  google.protobuf.Duration start_offset = 1;

  // Time offset relative to the beginning of the audio that corresponds to the
  // end of the spoken word. This is an experimental feature and the accuracy of
  // the time offset can vary.
  google.protobuf.Duration end_offset = 2;

  // The Speech confidence between 0.0 and 1.0 for this word. A higher number
  // indicates an estimated greater likelihood that the recognized word is
  // correct. The default of 0.0 is a sentinel value indicating that confidence
  // was not set.
  //
  // This field is not guaranteed to be fully stable over time for the same
  // audio input. Users should also not rely on it to always be provided.
  float confidence = 4;
}

// Configuration of the barge-in behavior. Barge-in instructs the API to return
// a detected utterance at a proper time while the client is playing back the
// response audio from a previous request. When the client sees the
// utterance, it should stop the playback and immediately get ready to
// receive the responses for the current request.
//
// Barge-in handling requires the client to start streaming audio input
// as soon as it starts playing back the audio from the previous response. The
// playback is modeled as two phases:
//
// * No barge-in phase: which goes first and during which speech detection
//   should not be carried out.
//
// * Barge-in phase: which follows the no barge-in phase and during which
//   the API starts speech detection and may inform the client that an
//   utterance has been detected. Note that a no-speech event is not expected
//   in this phase.
//
// The client provides this configuration in terms of the durations of those
// two phases. The durations are measured in terms of the audio length from
// the start of the input audio.
//
// The flow goes as follows:
//
// ```
//                              --> Time
//
//  without speech detection | utterance only | utterance or no-speech event
//                           |                |
//            +-------------+ | +------------+ | +---------------+
//  ----------+ no barge-in +-|-+  barge-in  +-|-+ normal period +-----------
//            +-------------+ | +------------+ | +---------------+
// ```
//
// A no-speech event is a response with END_OF_UTTERANCE and no transcript
// following.
message BargeInConfig {
  // Duration that is not eligible for barge-in at the beginning of the input
  // audio.
  google.protobuf.Duration no_barge_in_duration = 1;

  // Total duration for the playback at the beginning of the input audio.
  google.protobuf.Duration total_duration = 2;
}
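
// For example, to reserve the first 2 seconds of an 8-second playback as the
// no barge-in phase, a `BargeInConfig` can be written in textproto form like
// this (the durations are illustrative):
//
// ```
// no_barge_in_duration { seconds: 2 }
// total_duration { seconds: 8 }
// ```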

// Instructs the speech recognizer on how to process the audio content.
message InputAudioConfig {
  // Required. Audio encoding of the audio content to process.
  AudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Sample rate (in Hertz) of the audio content sent in the query.
  // Refer to [Cloud Speech API
  // documentation](https://cloud.google.com/speech-to-text/docs/basics) for
  // more details.
  int32 sample_rate_hertz = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. The language of the supplied audio. Dialogflow does not do
  // translations. See [Language
  // Support](https://cloud.google.com/dialogflow/docs/reference/language)
  // for a list of the currently supported language codes. Note that queries in
  // the same session do not necessarily need to specify the same language.
  string language_code = 3 [(google.api.field_behavior) = REQUIRED];

  // If `true`, Dialogflow returns
  // [SpeechWordInfo][google.cloud.dialogflow.v2beta1.SpeechWordInfo] in
  // [StreamingRecognitionResult][google.cloud.dialogflow.v2beta1.StreamingRecognitionResult]
  // with information about the recognized speech words, e.g. start and end
  // time offsets. If false or unspecified, Speech doesn't return any
  // word-level information.
  bool enable_word_info = 13;

  // A list of strings containing words and phrases that the speech
  // recognizer should recognize with higher likelihood.
  //
  // See [the Cloud Speech
  // documentation](https://cloud.google.com/speech-to-text/docs/basics#phrase-hints)
  // for more details.
  //
  // This field is deprecated. Please use [`speech_contexts`]() instead. If you
  // specify both [`phrase_hints`]() and [`speech_contexts`](), Dialogflow will
  // treat the [`phrase_hints`]() as a single additional [`SpeechContext`]().
  repeated string phrase_hints = 4 [deprecated = true];

  // Context information to assist speech recognition.
  //
  // See [the Cloud Speech
  // documentation](https://cloud.google.com/speech-to-text/docs/basics#phrase-hints)
  // for more details.
  repeated SpeechContext speech_contexts = 11;

  // Optional. Which Speech model to select for the given request.
  // For more information, see
  // [Speech models](https://cloud.google.com/dialogflow/es/docs/speech-models).
  string model = 7;

  // Which variant of the [Speech
  // model][google.cloud.dialogflow.v2beta1.InputAudioConfig.model] to use.
  SpeechModelVariant model_variant = 10;

  // If `false` (default), recognition does not cease until the
  // client closes the stream.
  // If `true`, the recognizer will detect a single spoken utterance in input
  // audio. Recognition ceases when it detects that the audio's voice has
  // stopped or paused. In this case, once a detected intent is received, the
  // client should close the stream and start a new request with a new stream
  // as needed.
  // Note: This setting is relevant only for streaming methods.
  // Note: When specified, InputAudioConfig.single_utterance takes precedence
  // over StreamingDetectIntentRequest.single_utterance.
  bool single_utterance = 8;

  // Only used in
  // [Participants.AnalyzeContent][google.cloud.dialogflow.v2beta1.Participants.AnalyzeContent]
  // and
  // [Participants.StreamingAnalyzeContent][google.cloud.dialogflow.v2beta1.Participants.StreamingAnalyzeContent].
  // If `false` and recognition doesn't return any result, trigger the
  // `NO_SPEECH_RECOGNIZED` event for the Dialogflow agent.
  bool disable_no_speech_recognized_event = 14;

  // Configuration of barge-in behavior during the streaming of input audio.
  BargeInConfig barge_in_config = 15;

  // Enable the automatic punctuation option at the speech backend.
  bool enable_automatic_punctuation = 17;

  // If `true`, the request will opt out of STT conformer model migration.
  // This field will be deprecated once forced migration takes place in June
  // 2024. Please refer to [Dialogflow ES Speech model
  // migration](https://cloud.google.com/dialogflow/es/docs/speech-model-migration).
  bool opt_out_conformer_model_migration = 26;
}
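
// Putting the fields above together, a minimal `InputAudioConfig` for
// streaming 16 kHz linear PCM with word-level info and barge-in enabled
// might look like this in textproto form (the encoding, sample rate,
// language, and durations are illustrative):
//
// ```
// audio_encoding: AUDIO_ENCODING_LINEAR_16
// sample_rate_hertz: 16000
// language_code: "en-US"
// enable_word_info: true
// single_utterance: true
// barge_in_config {
//   no_barge_in_duration { seconds: 1 }
//   total_duration { seconds: 8 }
// }
// ```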

// Description of which voice to use for speech synthesis.
message VoiceSelectionParams {
  // Optional. The name of the voice. If not set, the service will choose a
  // voice based on the other parameters such as language_code and
  // [ssml_gender][google.cloud.dialogflow.v2beta1.VoiceSelectionParams.ssml_gender].
  //
  // For the list of available voices, please refer to [Supported voices and
  // languages](https://cloud.google.com/text-to-speech/docs/voices).
  string name = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The preferred gender of the voice. If not set, the service will
  // choose a voice based on the other parameters such as language_code and
  // [name][google.cloud.dialogflow.v2beta1.VoiceSelectionParams.name]. Note
  // that this is only a preference, not a requirement. If a voice of the
  // appropriate gender is not available, the synthesizer should substitute a
  // voice with a different gender rather than failing the request.
  SsmlVoiceGender ssml_gender = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Configuration of how speech should be synthesized.
message SynthesizeSpeechConfig {
  // Optional. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is the normal
  // native speed supported by the specific voice. 2.0 is twice as fast, and
  // 0.5 is half as fast. If unset (0.0), defaults to the native 1.0 speed. Any
  // other value < 0.25 or > 4.0 will return an error.
  double speaking_rate = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Speaking pitch, in the range [-20.0, 20.0]. 20 means increase 20
  // semitones from the original pitch. -20 means decrease 20 semitones from
  // the original pitch.
  double pitch = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Volume gain (in dB) of the normal native volume supported by the
  // specific voice, in the range [-96.0, 16.0]. If unset, or set to a value of
  // 0.0 (dB), will play at normal native signal amplitude. A value of -6.0 (dB)
  // will play at approximately half the amplitude of the normal native signal
  // amplitude. A value of +6.0 (dB) will play at approximately twice the
  // amplitude of the normal native signal amplitude. We strongly recommend not
  // to exceed +10 (dB) as there's usually no effective increase in loudness for
  // any value greater than that.
  double volume_gain_db = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. An identifier which selects 'audio effects' profiles that are
  // applied on (post synthesized) text to speech. Effects are applied on top of
  // each other in the order they are given.
  repeated string effects_profile_id = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The desired voice of the synthesized audio.
  VoiceSelectionParams voice = 4 [(google.api.field_behavior) = OPTIONAL];
}
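
// For illustration, a `SynthesizeSpeechConfig` that speaks slightly slower,
// at roughly half the native amplitude, and prefers a specific voice might
// look like this in textproto form (the voice name and numeric values are
// illustrative):
//
// ```
// speaking_rate: 0.9
// volume_gain_db: -6.0
// voice {
//   name: "en-US-Standard-C"
//   ssml_gender: SSML_VOICE_GENDER_FEMALE
// }
// ```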

// Instructs the speech synthesizer how to generate the output audio content.
// If this audio config is supplied in a request, it overrides all existing
// text-to-speech settings applied to the agent.
message OutputAudioConfig {
  // Required. Audio encoding of the synthesized audio content.
  OutputAudioEncoding audio_encoding = 1
      [(google.api.field_behavior) = REQUIRED];

  // The synthesis sample rate (in hertz) for this audio. If not
  // provided, then the synthesizer will use the default sample rate based on
  // the audio encoding. If this is different from the voice's natural sample
  // rate, then the synthesizer will honor this request by converting to the
  // desired sample rate (which might result in worse audio quality).
  int32 sample_rate_hertz = 2;

  // Configuration of how speech should be synthesized.
  SynthesizeSpeechConfig synthesize_speech_config = 3;
}

// A wrapper of repeated TelephonyDtmf digits.
message TelephonyDtmfEvents {
  // A sequence of TelephonyDtmf digits.
  repeated TelephonyDtmf dtmf_events = 1;
}

// Configures speech transcription for
// [ConversationProfile][google.cloud.dialogflow.v2beta1.ConversationProfile].
message SpeechToTextConfig {
  // The speech model used in speech to text. Both
  // `SPEECH_MODEL_VARIANT_UNSPECIFIED` and `USE_BEST_AVAILABLE` are treated as
  // `USE_ENHANCED`. It can be overridden in
  // [AnalyzeContentRequest][google.cloud.dialogflow.v2beta1.AnalyzeContentRequest]
  // and
  // [StreamingAnalyzeContentRequest][google.cloud.dialogflow.v2beta1.StreamingAnalyzeContentRequest]
  // requests. If an enhanced model variant is specified and an enhanced
  // version of the specified model for the language does not exist, then an
  // error is emitted.
  SpeechModelVariant speech_model_variant = 1;

  // Which Speech model to select. Select the
  // model best suited to your domain to get best results. If a model is not
  // explicitly specified, then Dialogflow auto-selects a model based on other
  // parameters in the SpeechToTextConfig and Agent settings.
  // If the enhanced speech model is enabled for the agent and an enhanced
  // version of the specified model for the language does not exist, then the
  // speech is recognized using the standard version of the specified model.
  // Refer to
  // [Cloud Speech API
  // documentation](https://cloud.google.com/speech-to-text/docs/basics#select-model)
  // for more details.
  // If you specify a model, the following models typically have the best
  // performance:
  //
  // - phone_call (best for Agent Assist and telephony)
  // - latest_short (best for Dialogflow non-telephony)
  // - command_and_search
  //
  // Leave this field unspecified to use
  // [Agent Speech
  // settings](https://cloud.google.com/dialogflow/cx/docs/concept/agent#settings-speech)
  // for model selection.
  string model = 2;

  // Use timeout-based endpointing, interpreting endpointer sensitivity as
  // seconds of timeout value.
  bool use_timeout_based_endpointing = 11;
}
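
// To make the model recommendations above concrete, a `SpeechToTextConfig`
// for a telephony Agent Assist use case might look like this in textproto
// form (the model choice is illustrative):
//
// ```
// speech_model_variant: USE_ENHANCED
// model: "phone_call"
// ```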

// [DTMF](https://en.wikipedia.org/wiki/Dual-tone_multi-frequency_signaling)
// digit in Telephony Gateway.
enum TelephonyDtmf {
  // Not specified. This value may be used to indicate an absent digit.
  TELEPHONY_DTMF_UNSPECIFIED = 0;

  // Number: '1'.
  DTMF_ONE = 1;

  // Number: '2'.
  DTMF_TWO = 2;

  // Number: '3'.
  DTMF_THREE = 3;

  // Number: '4'.
  DTMF_FOUR = 4;

  // Number: '5'.
  DTMF_FIVE = 5;

  // Number: '6'.
  DTMF_SIX = 6;

  // Number: '7'.
  DTMF_SEVEN = 7;

  // Number: '8'.
  DTMF_EIGHT = 8;

  // Number: '9'.
  DTMF_NINE = 9;

  // Number: '0'.
  DTMF_ZERO = 10;

  // Letter: 'A'.
  DTMF_A = 11;

  // Letter: 'B'.
  DTMF_B = 12;

  // Letter: 'C'.
  DTMF_C = 13;

  // Letter: 'D'.
  DTMF_D = 14;

  // Asterisk/star: '*'.
  DTMF_STAR = 15;

  // Pound/diamond/hash/square/gate/octothorpe: '#'.
  DTMF_POUND = 16;
}
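
// As a small illustration, the key sequence '1', '2', '#' is represented as
// a `TelephonyDtmfEvents` message like this in textproto form:
//
// ```
// dtmf_events: DTMF_ONE
// dtmf_events: DTMF_TWO
// dtmf_events: DTMF_POUND
// ```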

// Audio encoding of the audio content sent in the conversational query
// request. Refer to the
// [Cloud Speech API
// documentation](https://cloud.google.com/speech-to-text/docs/basics) for more
// details.
enum AudioEncoding {
  // Not specified.
  AUDIO_ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  AUDIO_ENCODING_LINEAR_16 = 1;

  // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
  // Codec) is the recommended encoding because it is lossless (therefore
  // recognition is not compromised) and requires only about half the
  // bandwidth of `LINEAR16`. `FLAC` stream encoding supports 16-bit and
  // 24-bit samples; however, not all fields in `STREAMINFO` are supported.
  AUDIO_ENCODING_FLAC = 2;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  AUDIO_ENCODING_MULAW = 3;

  // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  AUDIO_ENCODING_AMR = 4;

  // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_AMR_WB = 5;

  // Opus encoded audio frames in an Ogg container
  // ([OggOpus](https://wiki.xiph.org/OggOpus)).
  // `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_OGG_OPUS = 6;

  // Although the use of lossy encodings is not recommended, if a very low
  // bitrate encoding is required, `OGG_OPUS` is highly preferred over
  // Speex encoding. The [Speex](https://speex.org/) encoding supported by
  // the Dialogflow API has a header byte in each block, as in MIME type
  // `audio/x-speex-with-header-byte`.
  // It is a variant of the RTP Speex encoding defined in
  // [RFC 5574](https://tools.ietf.org/html/rfc5574).
  // The stream is a sequence of blocks, one block per RTP packet. Each block
  // starts with a byte containing the length of the block, in bytes, followed
  // by one or more frames of Speex data, padded to an integral number of
  // bytes (octets) as specified in RFC 5574. In other words, each RTP header
  // is replaced with a single byte containing the block length. Only Speex
  // wideband is supported. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7;
}

// Variant of the specified [Speech
// model][google.cloud.dialogflow.v2beta1.InputAudioConfig.model] to use.
//
// See the [Cloud Speech
// documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
// for which models have different variants. For example, the "phone_call"
// model has both a standard and an enhanced variant. When you use an enhanced
// model, you will generally receive higher quality results than for a standard
// model.
enum SpeechModelVariant {
  // No model variant specified. In this case Dialogflow defaults to
  // USE_BEST_AVAILABLE.
  SPEECH_MODEL_VARIANT_UNSPECIFIED = 0;

  // Use the best available variant of the [Speech
  // model][InputAudioConfig.model] that the caller is eligible for.
  //
  // Please see the [Dialogflow
  // docs](https://cloud.google.com/dialogflow/docs/data-logging) for
  // how to make your project eligible for enhanced models.
  USE_BEST_AVAILABLE = 1;

  // Use the standard model variant even if an enhanced model is available.
  // See the [Cloud Speech
  // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
  // for details about enhanced models.
  USE_STANDARD = 2;

  // Use an enhanced model variant:
  //
  // * If an enhanced variant does not exist for the given
  //   [model][google.cloud.dialogflow.v2beta1.InputAudioConfig.model] and
  //   request language, Dialogflow falls back to the standard variant.
  //
  //   The [Cloud Speech
  //   documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
  //   describes which models have enhanced variants.
  //
  // * If the API caller isn't eligible for enhanced models, Dialogflow returns
  //   an error. Please see the [Dialogflow
  //   docs](https://cloud.google.com/dialogflow/docs/data-logging)
  //   for how to make your project eligible.
  USE_ENHANCED = 3;
}

// Gender of the voice as described in the
// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
enum SsmlVoiceGender {
  // An unspecified gender, which means that the client doesn't care which
  // gender the selected voice will have.
  SSML_VOICE_GENDER_UNSPECIFIED = 0;

  // A male voice.
  SSML_VOICE_GENDER_MALE = 1;

  // A female voice.
  SSML_VOICE_GENDER_FEMALE = 2;

  // A gender-neutral voice.
  SSML_VOICE_GENDER_NEUTRAL = 3;
}

// Audio encoding of the output audio format in Text-To-Speech.
enum OutputAudioEncoding {
  // Not specified.
  OUTPUT_AUDIO_ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  // Audio content returned as LINEAR16 also contains a WAV header.
  OUTPUT_AUDIO_ENCODING_LINEAR_16 = 1;

  // MP3 audio at 32 kbps.
  OUTPUT_AUDIO_ENCODING_MP3 = 2;

  // MP3 audio at 64 kbps.
  OUTPUT_AUDIO_ENCODING_MP3_64_KBPS = 4;

  // Opus encoded audio wrapped in an Ogg container. The result will be a
  // file which can be played natively on Android and in browsers (at least
  // Chrome and Firefox). The quality of the encoding is considerably higher
  // than MP3 while using approximately the same bitrate.
  OUTPUT_AUDIO_ENCODING_OGG_OPUS = 3;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  OUTPUT_AUDIO_ENCODING_MULAW = 5;
}
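
// For illustration, an `OutputAudioConfig` requesting Ogg Opus output at the
// encoding's default sample rate, reusing the synthesis settings shown
// earlier, might look like this in textproto form (the values are
// illustrative):
//
// ```
// audio_encoding: OUTPUT_AUDIO_ENCODING_OGG_OPUS
// synthesize_speech_config {
//   speaking_rate: 0.9
//   voice { name: "en-US-Standard-C" }
// }
// ```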