// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.texttospeech.v1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";

option cc_enable_arenas = true;
option csharp_namespace = "Google.Cloud.TextToSpeech.V1";
option go_package = "cloud.google.com/go/texttospeech/apiv1/texttospeechpb;texttospeechpb";
option java_multiple_files = true;
option java_outer_classname = "TextToSpeechProto";
option java_package = "com.google.cloud.texttospeech.v1";
option php_namespace = "Google\\Cloud\\TextToSpeech\\V1";
option ruby_package = "Google::Cloud::TextToSpeech::V1";

// Declares the AutoML model resource type so that it can be referenced from
// this file (see the `resource_reference` on `CustomVoiceParams.model`).
option (google.api.resource_definition) = {
  type: "automl.googleapis.com/Model"
  pattern: "projects/{project}/locations/{location}/models/{model}"
};

// Service that implements Google Cloud Text-to-Speech API.
service TextToSpeech {
  option (google.api.default_host) = "texttospeech.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Returns a list of voices supported for synthesis.
  rpc ListVoices(ListVoicesRequest) returns (ListVoicesResponse) {
    option (google.api.http) = {
      get: "/v1/voices"
    };
    option (google.api.method_signature) = "language_code";
  }

  // Synthesizes speech synchronously: receive results after all text input
  // has been processed.
  rpc SynthesizeSpeech(SynthesizeSpeechRequest)
      returns (SynthesizeSpeechResponse) {
    option (google.api.http) = {
      post: "/v1/text:synthesize"
      body: "*"
    };
    option (google.api.method_signature) = "input,voice,audio_config";
  }
}

// Gender of the voice as described in
// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
enum SsmlVoiceGender {
  // An unspecified gender.
  // In VoiceSelectionParams, this means that the client doesn't care which
  // gender the selected voice will have. In the Voice field of
  // ListVoicesResponse, this may mean that the voice doesn't fit any of the
  // other categories in this enum, or that the gender of the voice isn't known.
  SSML_VOICE_GENDER_UNSPECIFIED = 0;

  // A male voice.
  MALE = 1;

  // A female voice.
  FEMALE = 2;

  // A gender-neutral voice. This voice is not yet supported.
  NEUTRAL = 3;
}

// Configuration to set up audio encoder. The encoding determines the output
// audio format that we'd like.
enum AudioEncoding {
  // Not specified. Will return result
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
  AUDIO_ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  // Audio content returned as LINEAR16 also contains a WAV header.
  LINEAR16 = 1;

  // MP3 audio at 32kbps.
  MP3 = 2;

  // Opus encoded audio wrapped in an ogg container. The result will be a
  // file which can be played natively on Android, and in browsers (at least
  // Chrome and Firefox). The quality of the encoding is considerably higher
  // than MP3 while using approximately the same bitrate.
  OGG_OPUS = 3;

  // NOTE(review): value 4 is not defined in this file. If it was ever
  // assigned, it should be declared `reserved` here -- confirm against the
  // API history before reusing it.

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  // Audio content returned as MULAW also contains a WAV header.
  MULAW = 5;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMA/A-law.
  // Audio content returned as ALAW also contains a WAV header.
  ALAW = 6;
}

// The top-level message sent by the client for the `ListVoices` method.
message ListVoicesRequest {
  // Optional. Recommended.
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // If not specified, the API will return all supported voices.
  // If specified, the ListVoices call will only return voices that can be used
  // to synthesize this language_code. For example, if you specify `"en-NZ"`,
  // all `"en-NZ"` voices will be returned. If you specify `"no"`, both
  // `"no-\*"` (Norwegian) and `"nb-\*"` (Norwegian Bokmal) voices will be
  // returned.
  string language_code = 1 [(google.api.field_behavior) = OPTIONAL];
}

// The message returned to the client by the `ListVoices` method.
message ListVoicesResponse {
  // The list of voices.
  repeated Voice voices = 1;
}

// Description of a voice supported by the TTS service.
message Voice {
  // The languages that this voice supports, expressed as
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags (e.g.
  // "en-US", "es-419", "cmn-tw").
  repeated string language_codes = 1;

  // The name of this voice. Each distinct voice has a unique name.
  string name = 2;

  // The gender of this voice.
  SsmlVoiceGender ssml_gender = 3;

  // The natural sample rate (in hertz) for this voice.
  int32 natural_sample_rate_hertz = 4;
}

// The top-level message sent by the client for the `SynthesizeSpeech` method.
message SynthesizeSpeechRequest {
  // Required. The Synthesizer requires either plain text or SSML as input.
  SynthesisInput input = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The desired voice of the synthesized audio.
  VoiceSelectionParams voice = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. The configuration of the synthesized audio.
  AudioConfig audio_config = 3 [(google.api.field_behavior) = REQUIRED];
}

// Contains text input to be synthesized. Either `text` or `ssml` must be
// supplied. Supplying both or neither returns
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. The
// input size is limited to 5000 bytes.
message SynthesisInput {
  // The input source, which is either plain text or SSML.
  oneof input_source {
    // The raw text to be synthesized.
    string text = 1;

    // The SSML document to be synthesized. The SSML document must be valid
    // and well-formed. Otherwise the RPC will fail and return
    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. For
    // more information, see
    // [SSML](https://cloud.google.com/text-to-speech/docs/ssml).
    string ssml = 2;
  }
}

// Description of which voice to use for a synthesis request.
message VoiceSelectionParams {
  // Required. The language (and potentially also the region) of the voice
  // expressed as a [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
  // language tag, e.g. "en-US". This should not include a script tag (e.g. use
  // "cmn-cn" rather than "cmn-Hant-cn"), because the script will be inferred
  // from the input provided in the SynthesisInput. The TTS service
  // will use this parameter to help choose an appropriate voice. Note that
  // the TTS service may choose a voice with a slightly different language code
  // than the one selected; it may substitute a different region
  // (e.g. using en-US rather than en-CA if there isn't a Canadian voice
  // available), or even a different language, e.g. using "nb" (Norwegian
  // Bokmal) instead of "no" (Norwegian).
  string language_code = 1 [(google.api.field_behavior) = REQUIRED];

  // The name of the voice. If not set, the service will choose a
  // voice based on the other parameters such as language_code and gender.
  string name = 2;

  // The preferred gender of the voice. If not set, the service will
  // choose a voice based on the other parameters such as language_code and
  // name. Note that this is only a preference, not a requirement; if a
  // voice of the appropriate gender is not available, the synthesizer should
  // substitute a voice with a different gender rather than failing the request.
  SsmlVoiceGender ssml_gender = 3;

  // The configuration for a custom voice. If
  // [CustomVoiceParams.model][google.cloud.texttospeech.v1.CustomVoiceParams.model]
  // is set, the service will choose the custom voice matching the specified
  // configuration.
  CustomVoiceParams custom_voice = 4;
}

// Description of audio data to be synthesized.
message AudioConfig {
  // Required. The format of the audio byte stream.
  AudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Input only. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is
  // the normal native speed supported by the specific voice. 2.0 is twice as
  // fast, and 0.5 is half as fast. If unset (0.0), defaults to the native 1.0
  // speed. Any other values < 0.25 or > 4.0 will return an error.
  double speaking_rate = 2 [
    (google.api.field_behavior) = INPUT_ONLY,
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Input only. Speaking pitch, in the range [-20.0, 20.0]. 20 means
  // increase 20 semitones from the original pitch. -20 means decrease 20
  // semitones from the original pitch.
  double pitch = 3 [
    (google.api.field_behavior) = INPUT_ONLY,
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Input only. Volume gain (in dB) of the normal native volume
  // supported by the specific voice, in the range [-96.0, 16.0]. If unset, or
  // set to a value of 0.0 (dB), will play at normal native signal amplitude. A
  // value of -6.0 (dB) will play at approximately half the amplitude of the
  // normal native signal amplitude. A value of +6.0 (dB) will play at
  // approximately twice the amplitude of the normal native signal amplitude.
  // Strongly recommend not to exceed +10 (dB) as there's usually no effective
  // increase in loudness for any value greater than that.
  double volume_gain_db = 4 [
    (google.api.field_behavior) = INPUT_ONLY,
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. The synthesis sample rate (in hertz) for this audio. When this is
  // specified in SynthesizeSpeechRequest, if this is different from the voice's
  // natural sample rate, then the synthesizer will honor this request by
  // converting to the desired sample rate (which might result in worse audio
  // quality), unless the specified sample rate is not supported for the
  // encoding chosen, in which case it will fail the request and return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
  int32 sample_rate_hertz = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Input only. An identifier which selects 'audio effects' profiles
  // that are applied on (post synthesized) text to speech. Effects are applied
  // on top of each other in the order they are given. See
  // [audio
  // profiles](https://cloud.google.com/text-to-speech/docs/audio-profiles) for
  // current supported profile ids.
  repeated string effects_profile_id = 6 [
    (google.api.field_behavior) = INPUT_ONLY,
    (google.api.field_behavior) = OPTIONAL
  ];
}

// Description of the custom voice to be synthesized.
message CustomVoiceParams {
  // The usage of the synthesized audio. You must report your honest and
  // correct usage of the service as it's regulated by contract and will cause
  // significant difference in billing.
  enum ReportedUsage {
    // Request with reported usage unspecified will be rejected.
    REPORTED_USAGE_UNSPECIFIED = 0;

    // For scenarios where the synthesized audio is not downloadable and can
    // only be used once. For example, real-time request in IVR system.
    REALTIME = 1;

    // For scenarios where the synthesized audio is downloadable and can be
    // reused. For example, the synthesized audio is downloaded, stored in
    // customer service system and played repeatedly.
    OFFLINE = 2;
  }

  // Required. The name of the AutoML model that synthesizes the custom voice.
  string model = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "automl.googleapis.com/Model" }
  ];

  // NOTE(review): field number 2 is not defined in this message. If it was
  // ever assigned, it should be declared `reserved` here -- confirm against
  // the API history before reusing it.

  // Optional. The usage of the synthesized audio to be reported.
  ReportedUsage reported_usage = 3 [(google.api.field_behavior) = OPTIONAL];
}

// The message returned to the client by the `SynthesizeSpeech` method.
message SynthesizeSpeechResponse {
  // The audio data bytes encoded as specified in the request, including the
  // header for encodings that are wrapped in containers (e.g. MP3, OGG_OPUS).
  // For LINEAR16 audio, we include the WAV header. Note: as
  // with all bytes fields, protocol buffers use a pure binary representation,
  // whereas JSON representations use base64.
  bytes audio_content = 1;
}