xref: /aosp_15_r20/external/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc (revision d9f758449e529ab9291ac668be2861e7a55c2422)
1 /*
2  *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/audio_processing/aec3/residual_echo_estimator.h"
12 
13 #include <stddef.h>
14 
15 #include <algorithm>
16 #include <vector>
17 
18 #include "api/array_view.h"
19 #include "modules/audio_processing/aec3/reverb_model.h"
20 #include "rtc_base/checks.h"
21 #include "system_wrappers/include/field_trial.h"
22 
23 namespace webrtc {
24 namespace {
25 
// Echo path gain, expressed as an amplitude, that is returned by
// GetTransparentModeGain().
constexpr float kDefaultTransparentModeGain = 0.01f;

// Returns the echo path amplitude gain to use for the transparent mode.
float GetTransparentModeGain() {
  const float transparent_mode_gain = kDefaultTransparentModeGain;
  return transparent_mode_gain;
}
31 
GetEarlyReflectionsDefaultModeGain(const EchoCanceller3Config::EpStrength & config)32 float GetEarlyReflectionsDefaultModeGain(
33     const EchoCanceller3Config::EpStrength& config) {
34   if (field_trial::IsEnabled("WebRTC-Aec3UseLowEarlyReflectionsDefaultGain")) {
35     return 0.1f;
36   }
37   return config.default_gain;
38 }
39 
GetLateReflectionsDefaultModeGain(const EchoCanceller3Config::EpStrength & config)40 float GetLateReflectionsDefaultModeGain(
41     const EchoCanceller3Config::EpStrength& config) {
42   if (field_trial::IsEnabled("WebRTC-Aec3UseLowLateReflectionsDefaultGain")) {
43     return 0.1f;
44   }
45   return config.default_gain;
46 }
47 
UseErleOnsetCompensationInDominantNearend(const EchoCanceller3Config::EpStrength & config)48 bool UseErleOnsetCompensationInDominantNearend(
49     const EchoCanceller3Config::EpStrength& config) {
50   return config.erle_onset_compensation_in_dominant_nearend ||
51          field_trial::IsEnabled(
52              "WebRTC-Aec3UseErleOnsetCompensationInDominantNearend");
53 }
54 
55 // Computes the indexes that will be used for computing spectral power over
56 // the blocks surrounding the delay.
GetRenderIndexesToAnalyze(const SpectrumBuffer & spectrum_buffer,const EchoCanceller3Config::EchoModel & echo_model,int filter_delay_blocks,int * idx_start,int * idx_stop)57 void GetRenderIndexesToAnalyze(
58     const SpectrumBuffer& spectrum_buffer,
59     const EchoCanceller3Config::EchoModel& echo_model,
60     int filter_delay_blocks,
61     int* idx_start,
62     int* idx_stop) {
63   RTC_DCHECK(idx_start);
64   RTC_DCHECK(idx_stop);
65   size_t window_start;
66   size_t window_end;
67   window_start =
68       std::max(0, filter_delay_blocks -
69                       static_cast<int>(echo_model.render_pre_window_size));
70   window_end = filter_delay_blocks +
71                static_cast<int>(echo_model.render_post_window_size);
72   *idx_start = spectrum_buffer.OffsetIndex(spectrum_buffer.read, window_start);
73   *idx_stop = spectrum_buffer.OffsetIndex(spectrum_buffer.read, window_end + 1);
74 }
75 
76 // Estimates the residual echo power based on the echo return loss enhancement
77 // (ERLE) and the linear power estimate.
LinearEstimate(rtc::ArrayView<const std::array<float,kFftLengthBy2Plus1>> S2_linear,rtc::ArrayView<const std::array<float,kFftLengthBy2Plus1>> erle,rtc::ArrayView<std::array<float,kFftLengthBy2Plus1>> R2)78 void LinearEstimate(
79     rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear,
80     rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> erle,
81     rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) {
82   RTC_DCHECK_EQ(S2_linear.size(), erle.size());
83   RTC_DCHECK_EQ(S2_linear.size(), R2.size());
84 
85   const size_t num_capture_channels = R2.size();
86   for (size_t ch = 0; ch < num_capture_channels; ++ch) {
87     for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
88       RTC_DCHECK_LT(0.f, erle[ch][k]);
89       R2[ch][k] = S2_linear[ch][k] / erle[ch][k];
90     }
91   }
92 }
93 
94 // Estimates the residual echo power based on the estimate of the echo path
95 // gain.
NonLinearEstimate(float echo_path_gain,const std::array<float,kFftLengthBy2Plus1> & X2,rtc::ArrayView<std::array<float,kFftLengthBy2Plus1>> R2)96 void NonLinearEstimate(
97     float echo_path_gain,
98     const std::array<float, kFftLengthBy2Plus1>& X2,
99     rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) {
100   const size_t num_capture_channels = R2.size();
101   for (size_t ch = 0; ch < num_capture_channels; ++ch) {
102     for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
103       R2[ch][k] = X2[k] * echo_path_gain;
104     }
105   }
106 }
107 
108 // Applies a soft noise gate to the echo generating power.
ApplyNoiseGate(const EchoCanceller3Config::EchoModel & config,rtc::ArrayView<float,kFftLengthBy2Plus1> X2)109 void ApplyNoiseGate(const EchoCanceller3Config::EchoModel& config,
110                     rtc::ArrayView<float, kFftLengthBy2Plus1> X2) {
111   for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
112     if (config.noise_gate_power > X2[k]) {
113       X2[k] = std::max(0.f, X2[k] - config.noise_gate_slope *
114                                         (config.noise_gate_power - X2[k]));
115     }
116   }
117 }
118 
119 // Estimates the echo generating signal power as gated maximal power over a
120 // time window.
EchoGeneratingPower(size_t num_render_channels,const SpectrumBuffer & spectrum_buffer,const EchoCanceller3Config::EchoModel & echo_model,int filter_delay_blocks,rtc::ArrayView<float,kFftLengthBy2Plus1> X2)121 void EchoGeneratingPower(size_t num_render_channels,
122                          const SpectrumBuffer& spectrum_buffer,
123                          const EchoCanceller3Config::EchoModel& echo_model,
124                          int filter_delay_blocks,
125                          rtc::ArrayView<float, kFftLengthBy2Plus1> X2) {
126   int idx_stop;
127   int idx_start;
128   GetRenderIndexesToAnalyze(spectrum_buffer, echo_model, filter_delay_blocks,
129                             &idx_start, &idx_stop);
130 
131   std::fill(X2.begin(), X2.end(), 0.f);
132   if (num_render_channels == 1) {
133     for (int k = idx_start; k != idx_stop; k = spectrum_buffer.IncIndex(k)) {
134       for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) {
135         X2[j] = std::max(X2[j], spectrum_buffer.buffer[k][/*channel=*/0][j]);
136       }
137     }
138   } else {
139     for (int k = idx_start; k != idx_stop; k = spectrum_buffer.IncIndex(k)) {
140       std::array<float, kFftLengthBy2Plus1> render_power;
141       render_power.fill(0.f);
142       for (size_t ch = 0; ch < num_render_channels; ++ch) {
143         const auto& channel_power = spectrum_buffer.buffer[k][ch];
144         for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) {
145           render_power[j] += channel_power[j];
146         }
147       }
148       for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) {
149         X2[j] = std::max(X2[j], render_power[j]);
150       }
151     }
152   }
153 }
154 
155 }  // namespace
156 
ResidualEchoEstimator(const EchoCanceller3Config & config,size_t num_render_channels)157 ResidualEchoEstimator::ResidualEchoEstimator(const EchoCanceller3Config& config,
158                                              size_t num_render_channels)
159     : config_(config),
160       num_render_channels_(num_render_channels),
161       early_reflections_transparent_mode_gain_(GetTransparentModeGain()),
162       late_reflections_transparent_mode_gain_(GetTransparentModeGain()),
163       early_reflections_general_gain_(
164           GetEarlyReflectionsDefaultModeGain(config_.ep_strength)),
165       late_reflections_general_gain_(
166           GetLateReflectionsDefaultModeGain(config_.ep_strength)),
167       erle_onset_compensation_in_dominant_nearend_(
168           UseErleOnsetCompensationInDominantNearend(config_.ep_strength)) {
169   Reset();
170 }
171 
172 ResidualEchoEstimator::~ResidualEchoEstimator() = default;
173 
Estimate(const AecState & aec_state,const RenderBuffer & render_buffer,rtc::ArrayView<const std::array<float,kFftLengthBy2Plus1>> S2_linear,rtc::ArrayView<const std::array<float,kFftLengthBy2Plus1>> Y2,bool dominant_nearend,rtc::ArrayView<std::array<float,kFftLengthBy2Plus1>> R2,rtc::ArrayView<std::array<float,kFftLengthBy2Plus1>> R2_unbounded)174 void ResidualEchoEstimator::Estimate(
175     const AecState& aec_state,
176     const RenderBuffer& render_buffer,
177     rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear,
178     rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> Y2,
179     bool dominant_nearend,
180     rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2,
181     rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2_unbounded) {
182   RTC_DCHECK_EQ(R2.size(), Y2.size());
183   RTC_DCHECK_EQ(R2.size(), S2_linear.size());
184 
185   const size_t num_capture_channels = R2.size();
186 
187   // Estimate the power of the stationary noise in the render signal.
188   UpdateRenderNoisePower(render_buffer);
189 
190   // Estimate the residual echo power.
191   if (aec_state.UsableLinearEstimate()) {
192     // When there is saturated echo, assume the same spectral content as is
193     // present in the microphone signal.
194     if (aec_state.SaturatedEcho()) {
195       for (size_t ch = 0; ch < num_capture_channels; ++ch) {
196         std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin());
197         std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin());
198       }
199     } else {
200       const bool onset_compensated =
201           erle_onset_compensation_in_dominant_nearend_ || !dominant_nearend;
202       LinearEstimate(S2_linear, aec_state.Erle(onset_compensated), R2);
203       LinearEstimate(S2_linear, aec_state.ErleUnbounded(), R2_unbounded);
204     }
205 
206     UpdateReverb(ReverbType::kLinear, aec_state, render_buffer,
207                  dominant_nearend);
208     AddReverb(R2);
209     AddReverb(R2_unbounded);
210   } else {
211     const float echo_path_gain =
212         GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/true);
213 
214     // When there is saturated echo, assume the same spectral content as is
215     // present in the microphone signal.
216     if (aec_state.SaturatedEcho()) {
217       for (size_t ch = 0; ch < num_capture_channels; ++ch) {
218         std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin());
219         std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin());
220       }
221     } else {
222       // Estimate the echo generating signal power.
223       std::array<float, kFftLengthBy2Plus1> X2;
224       EchoGeneratingPower(num_render_channels_,
225                           render_buffer.GetSpectrumBuffer(), config_.echo_model,
226                           aec_state.MinDirectPathFilterDelay(), X2);
227       if (!aec_state.UseStationarityProperties()) {
228         ApplyNoiseGate(config_.echo_model, X2);
229       }
230 
231       // Subtract the stationary noise power to avoid stationary noise causing
232       // excessive echo suppression.
233       for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
234         X2[k] -= config_.echo_model.stationary_gate_slope * X2_noise_floor_[k];
235         X2[k] = std::max(0.f, X2[k]);
236       }
237 
238       NonLinearEstimate(echo_path_gain, X2, R2);
239       NonLinearEstimate(echo_path_gain, X2, R2_unbounded);
240     }
241 
242     if (config_.echo_model.model_reverb_in_nonlinear_mode &&
243         !aec_state.TransparentModeActive()) {
244       UpdateReverb(ReverbType::kNonLinear, aec_state, render_buffer,
245                    dominant_nearend);
246       AddReverb(R2);
247       AddReverb(R2_unbounded);
248     }
249   }
250 
251   if (aec_state.UseStationarityProperties()) {
252     // Scale the echo according to echo audibility.
253     std::array<float, kFftLengthBy2Plus1> residual_scaling;
254     aec_state.GetResidualEchoScaling(residual_scaling);
255     for (size_t ch = 0; ch < num_capture_channels; ++ch) {
256       for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
257         R2[ch][k] *= residual_scaling[k];
258         R2_unbounded[ch][k] *= residual_scaling[k];
259       }
260     }
261   }
262 }
263 
Reset()264 void ResidualEchoEstimator::Reset() {
265   echo_reverb_.Reset();
266   X2_noise_floor_counter_.fill(config_.echo_model.noise_floor_hold);
267   X2_noise_floor_.fill(config_.echo_model.min_noise_floor_power);
268 }
269 
UpdateRenderNoisePower(const RenderBuffer & render_buffer)270 void ResidualEchoEstimator::UpdateRenderNoisePower(
271     const RenderBuffer& render_buffer) {
272   std::array<float, kFftLengthBy2Plus1> render_power_data;
273   rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> X2 =
274       render_buffer.Spectrum(0);
275   rtc::ArrayView<const float, kFftLengthBy2Plus1> render_power =
276       X2[/*channel=*/0];
277   if (num_render_channels_ > 1) {
278     render_power_data.fill(0.f);
279     for (size_t ch = 0; ch < num_render_channels_; ++ch) {
280       const auto& channel_power = X2[ch];
281       for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
282         render_power_data[k] += channel_power[k];
283       }
284     }
285     render_power = render_power_data;
286   }
287 
288   // Estimate the stationary noise power in a minimum statistics manner.
289   for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
290     // Decrease rapidly.
291     if (render_power[k] < X2_noise_floor_[k]) {
292       X2_noise_floor_[k] = render_power[k];
293       X2_noise_floor_counter_[k] = 0;
294     } else {
295       // Increase in a delayed, leaky manner.
296       if (X2_noise_floor_counter_[k] >=
297           static_cast<int>(config_.echo_model.noise_floor_hold)) {
298         X2_noise_floor_[k] = std::max(X2_noise_floor_[k] * 1.1f,
299                                       config_.echo_model.min_noise_floor_power);
300       } else {
301         ++X2_noise_floor_counter_[k];
302       }
303     }
304   }
305 }
306 
307 // Updates the reverb estimation.
UpdateReverb(ReverbType reverb_type,const AecState & aec_state,const RenderBuffer & render_buffer,bool dominant_nearend)308 void ResidualEchoEstimator::UpdateReverb(ReverbType reverb_type,
309                                          const AecState& aec_state,
310                                          const RenderBuffer& render_buffer,
311                                          bool dominant_nearend) {
312   // Choose reverb partition based on what type of echo power model is used.
313   const size_t first_reverb_partition =
314       reverb_type == ReverbType::kLinear
315           ? aec_state.FilterLengthBlocks() + 1
316           : aec_state.MinDirectPathFilterDelay() + 1;
317 
318   // Compute render power for the reverb.
319   std::array<float, kFftLengthBy2Plus1> render_power_data;
320   rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> X2 =
321       render_buffer.Spectrum(first_reverb_partition);
322   rtc::ArrayView<const float, kFftLengthBy2Plus1> render_power =
323       X2[/*channel=*/0];
324   if (num_render_channels_ > 1) {
325     render_power_data.fill(0.f);
326     for (size_t ch = 0; ch < num_render_channels_; ++ch) {
327       const auto& channel_power = X2[ch];
328       for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
329         render_power_data[k] += channel_power[k];
330       }
331     }
332     render_power = render_power_data;
333   }
334 
335   // Update the reverb estimate.
336   float reverb_decay = aec_state.ReverbDecay(/*mild=*/dominant_nearend);
337   if (reverb_type == ReverbType::kLinear) {
338     echo_reverb_.UpdateReverb(
339         render_power, aec_state.GetReverbFrequencyResponse(), reverb_decay);
340   } else {
341     const float echo_path_gain =
342         GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/false);
343     echo_reverb_.UpdateReverbNoFreqShaping(render_power, echo_path_gain,
344                                            reverb_decay);
345   }
346 }
347 // Adds the estimated power of the reverb to the residual echo power.
AddReverb(rtc::ArrayView<std::array<float,kFftLengthBy2Plus1>> R2) const348 void ResidualEchoEstimator::AddReverb(
349     rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) const {
350   const size_t num_capture_channels = R2.size();
351 
352   // Add the reverb power.
353   rtc::ArrayView<const float, kFftLengthBy2Plus1> reverb_power =
354       echo_reverb_.reverb();
355   for (size_t ch = 0; ch < num_capture_channels; ++ch) {
356     for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
357       R2[ch][k] += reverb_power[k];
358     }
359   }
360 }
361 
362 // Chooses the echo path gain to use.
GetEchoPathGain(const AecState & aec_state,bool gain_for_early_reflections) const363 float ResidualEchoEstimator::GetEchoPathGain(
364     const AecState& aec_state,
365     bool gain_for_early_reflections) const {
366   float gain_amplitude;
367   if (aec_state.TransparentModeActive()) {
368     gain_amplitude = gain_for_early_reflections
369                          ? early_reflections_transparent_mode_gain_
370                          : late_reflections_transparent_mode_gain_;
371   } else {
372     gain_amplitude = gain_for_early_reflections
373                          ? early_reflections_general_gain_
374                          : late_reflections_general_gain_;
375   }
376   return gain_amplitude * gain_amplitude;
377 }
378 
379 }  // namespace webrtc
380