1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "modules/audio_coding/codecs/cng/audio_encoder_cng.h"
12
13 #include <cstdint>
14 #include <memory>
15 #include <utility>
16
17 #include "absl/types/optional.h"
18 #include "api/units/time_delta.h"
19 #include "modules/audio_coding/codecs/cng/webrtc_cng.h"
20 #include "rtc_base/checks.h"
21
22 namespace webrtc {
23
24 namespace {
25
// Longest packet duration, in milliseconds, that AudioEncoderCng::EncodeImpl()
// accepts when VAD/CNG is in use.
constexpr int kMaxFrameSizeMs = 60;
27
28 class AudioEncoderCng final : public AudioEncoder {
29 public:
30 explicit AudioEncoderCng(AudioEncoderCngConfig&& config);
31 ~AudioEncoderCng() override;
32
33 // Not copyable or moveable.
34 AudioEncoderCng(const AudioEncoderCng&) = delete;
35 AudioEncoderCng(AudioEncoderCng&&) = delete;
36 AudioEncoderCng& operator=(const AudioEncoderCng&) = delete;
37 AudioEncoderCng& operator=(AudioEncoderCng&&) = delete;
38
39 int SampleRateHz() const override;
40 size_t NumChannels() const override;
41 int RtpTimestampRateHz() const override;
42 size_t Num10MsFramesInNextPacket() const override;
43 size_t Max10MsFramesInAPacket() const override;
44 int GetTargetBitrate() const override;
45 EncodedInfo EncodeImpl(uint32_t rtp_timestamp,
46 rtc::ArrayView<const int16_t> audio,
47 rtc::Buffer* encoded) override;
48 void Reset() override;
49 bool SetFec(bool enable) override;
50 bool SetDtx(bool enable) override;
51 bool SetApplication(Application application) override;
52 void SetMaxPlaybackRate(int frequency_hz) override;
53 rtc::ArrayView<std::unique_ptr<AudioEncoder>> ReclaimContainedEncoders()
54 override;
55 void OnReceivedUplinkPacketLossFraction(
56 float uplink_packet_loss_fraction) override;
57 void OnReceivedUplinkBandwidth(
58 int target_audio_bitrate_bps,
59 absl::optional<int64_t> bwe_period_ms) override;
60 absl::optional<std::pair<TimeDelta, TimeDelta>> GetFrameLengthRange()
61 const override;
62
63 private:
64 EncodedInfo EncodePassive(size_t frames_to_encode, rtc::Buffer* encoded);
65 EncodedInfo EncodeActive(size_t frames_to_encode, rtc::Buffer* encoded);
66 size_t SamplesPer10msFrame() const;
67
68 std::unique_ptr<AudioEncoder> speech_encoder_;
69 const int cng_payload_type_;
70 const int num_cng_coefficients_;
71 const int sid_frame_interval_ms_;
72 std::vector<int16_t> speech_buffer_;
73 std::vector<uint32_t> rtp_timestamps_;
74 bool last_frame_active_;
75 std::unique_ptr<Vad> vad_;
76 std::unique_ptr<ComfortNoiseEncoder> cng_encoder_;
77 };
78
AudioEncoderCng(AudioEncoderCngConfig && config)79 AudioEncoderCng::AudioEncoderCng(AudioEncoderCngConfig&& config)
80 : speech_encoder_((static_cast<void>([&] {
81 RTC_CHECK(config.IsOk()) << "Invalid configuration.";
82 }()),
83 std::move(config.speech_encoder))),
84 cng_payload_type_(config.payload_type),
85 num_cng_coefficients_(config.num_cng_coefficients),
86 sid_frame_interval_ms_(config.sid_frame_interval_ms),
87 last_frame_active_(true),
88 vad_(config.vad ? std::unique_ptr<Vad>(config.vad)
89 : CreateVad(config.vad_mode)),
90 cng_encoder_(new ComfortNoiseEncoder(SampleRateHz(),
91 sid_frame_interval_ms_,
92 num_cng_coefficients_)) {}
93
94 AudioEncoderCng::~AudioEncoderCng() = default;
95
SampleRateHz() const96 int AudioEncoderCng::SampleRateHz() const {
97 return speech_encoder_->SampleRateHz();
98 }
99
NumChannels() const100 size_t AudioEncoderCng::NumChannels() const {
101 return 1;
102 }
103
RtpTimestampRateHz() const104 int AudioEncoderCng::RtpTimestampRateHz() const {
105 return speech_encoder_->RtpTimestampRateHz();
106 }
107
Num10MsFramesInNextPacket() const108 size_t AudioEncoderCng::Num10MsFramesInNextPacket() const {
109 return speech_encoder_->Num10MsFramesInNextPacket();
110 }
111
Max10MsFramesInAPacket() const112 size_t AudioEncoderCng::Max10MsFramesInAPacket() const {
113 return speech_encoder_->Max10MsFramesInAPacket();
114 }
115
GetTargetBitrate() const116 int AudioEncoderCng::GetTargetBitrate() const {
117 return speech_encoder_->GetTargetBitrate();
118 }
119
EncodeImpl(uint32_t rtp_timestamp,rtc::ArrayView<const int16_t> audio,rtc::Buffer * encoded)120 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeImpl(
121 uint32_t rtp_timestamp,
122 rtc::ArrayView<const int16_t> audio,
123 rtc::Buffer* encoded) {
124 const size_t samples_per_10ms_frame = SamplesPer10msFrame();
125 RTC_CHECK_EQ(speech_buffer_.size(),
126 rtp_timestamps_.size() * samples_per_10ms_frame);
127 rtp_timestamps_.push_back(rtp_timestamp);
128 RTC_DCHECK_EQ(samples_per_10ms_frame, audio.size());
129 speech_buffer_.insert(speech_buffer_.end(), audio.cbegin(), audio.cend());
130 const size_t frames_to_encode = speech_encoder_->Num10MsFramesInNextPacket();
131 if (rtp_timestamps_.size() < frames_to_encode) {
132 return EncodedInfo();
133 }
134 RTC_CHECK_LE(frames_to_encode * 10, kMaxFrameSizeMs)
135 << "Frame size cannot be larger than " << kMaxFrameSizeMs
136 << " ms when using VAD/CNG.";
137
138 // Group several 10 ms blocks per VAD call. Call VAD once or twice using the
139 // following split sizes:
140 // 10 ms = 10 + 0 ms; 20 ms = 20 + 0 ms; 30 ms = 30 + 0 ms;
141 // 40 ms = 20 + 20 ms; 50 ms = 30 + 20 ms; 60 ms = 30 + 30 ms.
142 size_t blocks_in_first_vad_call =
143 (frames_to_encode > 3 ? 3 : frames_to_encode);
144 if (frames_to_encode == 4)
145 blocks_in_first_vad_call = 2;
146 RTC_CHECK_GE(frames_to_encode, blocks_in_first_vad_call);
147 const size_t blocks_in_second_vad_call =
148 frames_to_encode - blocks_in_first_vad_call;
149
150 // Check if all of the buffer is passive speech. Start with checking the first
151 // block.
152 Vad::Activity activity = vad_->VoiceActivity(
153 &speech_buffer_[0], samples_per_10ms_frame * blocks_in_first_vad_call,
154 SampleRateHz());
155 if (activity == Vad::kPassive && blocks_in_second_vad_call > 0) {
156 // Only check the second block if the first was passive.
157 activity = vad_->VoiceActivity(
158 &speech_buffer_[samples_per_10ms_frame * blocks_in_first_vad_call],
159 samples_per_10ms_frame * blocks_in_second_vad_call, SampleRateHz());
160 }
161
162 EncodedInfo info;
163 switch (activity) {
164 case Vad::kPassive: {
165 info = EncodePassive(frames_to_encode, encoded);
166 last_frame_active_ = false;
167 break;
168 }
169 case Vad::kActive: {
170 info = EncodeActive(frames_to_encode, encoded);
171 last_frame_active_ = true;
172 break;
173 }
174 default: {
175 RTC_CHECK_NOTREACHED();
176 }
177 }
178
179 speech_buffer_.erase(
180 speech_buffer_.begin(),
181 speech_buffer_.begin() + frames_to_encode * samples_per_10ms_frame);
182 rtp_timestamps_.erase(rtp_timestamps_.begin(),
183 rtp_timestamps_.begin() + frames_to_encode);
184 return info;
185 }
186
Reset()187 void AudioEncoderCng::Reset() {
188 speech_encoder_->Reset();
189 speech_buffer_.clear();
190 rtp_timestamps_.clear();
191 last_frame_active_ = true;
192 vad_->Reset();
193 cng_encoder_.reset(new ComfortNoiseEncoder(
194 SampleRateHz(), sid_frame_interval_ms_, num_cng_coefficients_));
195 }
196
SetFec(bool enable)197 bool AudioEncoderCng::SetFec(bool enable) {
198 return speech_encoder_->SetFec(enable);
199 }
200
SetDtx(bool enable)201 bool AudioEncoderCng::SetDtx(bool enable) {
202 return speech_encoder_->SetDtx(enable);
203 }
204
SetApplication(Application application)205 bool AudioEncoderCng::SetApplication(Application application) {
206 return speech_encoder_->SetApplication(application);
207 }
208
SetMaxPlaybackRate(int frequency_hz)209 void AudioEncoderCng::SetMaxPlaybackRate(int frequency_hz) {
210 speech_encoder_->SetMaxPlaybackRate(frequency_hz);
211 }
212
213 rtc::ArrayView<std::unique_ptr<AudioEncoder>>
ReclaimContainedEncoders()214 AudioEncoderCng::ReclaimContainedEncoders() {
215 return rtc::ArrayView<std::unique_ptr<AudioEncoder>>(&speech_encoder_, 1);
216 }
217
OnReceivedUplinkPacketLossFraction(float uplink_packet_loss_fraction)218 void AudioEncoderCng::OnReceivedUplinkPacketLossFraction(
219 float uplink_packet_loss_fraction) {
220 speech_encoder_->OnReceivedUplinkPacketLossFraction(
221 uplink_packet_loss_fraction);
222 }
223
OnReceivedUplinkBandwidth(int target_audio_bitrate_bps,absl::optional<int64_t> bwe_period_ms)224 void AudioEncoderCng::OnReceivedUplinkBandwidth(
225 int target_audio_bitrate_bps,
226 absl::optional<int64_t> bwe_period_ms) {
227 speech_encoder_->OnReceivedUplinkBandwidth(target_audio_bitrate_bps,
228 bwe_period_ms);
229 }
230
231 absl::optional<std::pair<TimeDelta, TimeDelta>>
GetFrameLengthRange() const232 AudioEncoderCng::GetFrameLengthRange() const {
233 return speech_encoder_->GetFrameLengthRange();
234 }
235
EncodePassive(size_t frames_to_encode,rtc::Buffer * encoded)236 AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive(
237 size_t frames_to_encode,
238 rtc::Buffer* encoded) {
239 bool force_sid = last_frame_active_;
240 bool output_produced = false;
241 const size_t samples_per_10ms_frame = SamplesPer10msFrame();
242 AudioEncoder::EncodedInfo info;
243
244 for (size_t i = 0; i < frames_to_encode; ++i) {
245 // It's important not to pass &info.encoded_bytes directly to
246 // WebRtcCng_Encode(), since later loop iterations may return zero in
247 // that value, in which case we don't want to overwrite any value from
248 // an earlier iteration.
249 size_t encoded_bytes_tmp =
250 cng_encoder_->Encode(rtc::ArrayView<const int16_t>(
251 &speech_buffer_[i * samples_per_10ms_frame],
252 samples_per_10ms_frame),
253 force_sid, encoded);
254
255 if (encoded_bytes_tmp > 0) {
256 RTC_CHECK(!output_produced);
257 info.encoded_bytes = encoded_bytes_tmp;
258 output_produced = true;
259 force_sid = false;
260 }
261 }
262
263 info.encoded_timestamp = rtp_timestamps_.front();
264 info.payload_type = cng_payload_type_;
265 info.send_even_if_empty = true;
266 info.speech = false;
267 return info;
268 }
269
EncodeActive(size_t frames_to_encode,rtc::Buffer * encoded)270 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeActive(size_t frames_to_encode,
271 rtc::Buffer* encoded) {
272 const size_t samples_per_10ms_frame = SamplesPer10msFrame();
273 AudioEncoder::EncodedInfo info;
274 for (size_t i = 0; i < frames_to_encode; ++i) {
275 info =
276 speech_encoder_->Encode(rtp_timestamps_.front(),
277 rtc::ArrayView<const int16_t>(
278 &speech_buffer_[i * samples_per_10ms_frame],
279 samples_per_10ms_frame),
280 encoded);
281 if (i + 1 == frames_to_encode) {
282 RTC_CHECK_GT(info.encoded_bytes, 0) << "Encoder didn't deliver data.";
283 } else {
284 RTC_CHECK_EQ(info.encoded_bytes, 0)
285 << "Encoder delivered data too early.";
286 }
287 }
288 return info;
289 }
290
SamplesPer10msFrame() const291 size_t AudioEncoderCng::SamplesPer10msFrame() const {
292 return rtc::CheckedDivExact(10 * SampleRateHz(), 1000);
293 }
294
295 } // namespace
296
297 AudioEncoderCngConfig::AudioEncoderCngConfig() = default;
298 AudioEncoderCngConfig::AudioEncoderCngConfig(AudioEncoderCngConfig&&) = default;
299 AudioEncoderCngConfig::~AudioEncoderCngConfig() = default;
300
IsOk() const301 bool AudioEncoderCngConfig::IsOk() const {
302 if (num_channels != 1)
303 return false;
304 if (!speech_encoder)
305 return false;
306 if (num_channels != speech_encoder->NumChannels())
307 return false;
308 if (sid_frame_interval_ms <
309 static_cast<int>(speech_encoder->Max10MsFramesInAPacket() * 10))
310 return false;
311 if (num_cng_coefficients > WEBRTC_CNG_MAX_LPC_ORDER ||
312 num_cng_coefficients <= 0)
313 return false;
314 return true;
315 }
316
CreateComfortNoiseEncoder(AudioEncoderCngConfig && config)317 std::unique_ptr<AudioEncoder> CreateComfortNoiseEncoder(
318 AudioEncoderCngConfig&& config) {
319 return std::make_unique<AudioEncoderCng>(std::move(config));
320 }
321
322 } // namespace webrtc
323