1 /* 2 * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_ 12 #define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_ 13 14 #include <stddef.h> 15 16 #include <array> 17 #include <utility> 18 19 #include "api/array_view.h" 20 #include "modules/audio_processing/agc2/cpu_features.h" 21 #include "modules/audio_processing/agc2/rnn_vad/common.h" 22 23 namespace webrtc { 24 namespace rnn_vad { 25 26 // Performs 2x decimation without any anti-aliasing filter. 27 void Decimate2x(rtc::ArrayView<const float, kBufSize24kHz> src, 28 rtc::ArrayView<float, kBufSize12kHz> dst); 29 30 // Key concepts and keywords used below in this file. 31 // 32 // The pitch estimation relies on a pitch buffer, which is an array-like data 33 // structured designed as follows: 34 // 35 // |....A....|.....B.....| 36 // 37 // The part on the left, named `A` contains the oldest samples, whereas `B` 38 // contains the most recent ones. The size of `A` corresponds to the maximum 39 // pitch period, that of `B` to the analysis frame size (e.g., 16 ms and 20 ms 40 // respectively). 41 // 42 // Pitch estimation is essentially based on the analysis of two 20 ms frames 43 // extracted from the pitch buffer. One frame, called `x`, is kept fixed and 44 // corresponds to `B` - i.e., the most recent 20 ms. The other frame, called 45 // `y`, is extracted from different parts of the buffer instead. 46 // 47 // The offset between `x` and `y` corresponds to a specific pitch period. 48 // For instance, if `y` is positioned at the beginning of the pitch buffer, then 49 // the cross-correlation between `x` and `y` can be used as an indication of the 50 // strength for the maximum pitch. 51 // 52 // Such an offset can be encoded in two ways: 53 // - As a lag, which is the index in the pitch buffer for the first item in `y` 54 // - As an inverted lag, which is the number of samples from the beginning of 55 // `x` and the end of `y` 56 // 57 // |---->| lag 58 // |....A....|.....B.....| 59 // |<--| inverted lag 60 // |.....y.....| `y` 20 ms frame 61 // 62 // The inverted lag has the advantage of being directly proportional to the 63 // corresponding pitch period. 64 65 // Computes the sum of squared samples for every sliding frame `y` in the pitch 66 // buffer. The indexes of `y_energy` are inverted lags. 67 void ComputeSlidingFrameSquareEnergies24kHz( 68 rtc::ArrayView<const float, kBufSize24kHz> pitch_buffer, 69 rtc::ArrayView<float, kRefineNumLags24kHz> y_energy, 70 AvailableCpuFeatures cpu_features); 71 72 // Top-2 pitch period candidates. Unit: number of samples - i.e., inverted lags. 73 struct CandidatePitchPeriods { 74 int best; 75 int second_best; 76 }; 77 78 // Computes the candidate pitch periods at 12 kHz given a view on the 12 kHz 79 // pitch buffer and the auto-correlation values (having inverted lags as 80 // indexes). 81 CandidatePitchPeriods ComputePitchPeriod12kHz( 82 rtc::ArrayView<const float, kBufSize12kHz> pitch_buffer, 83 rtc::ArrayView<const float, kNumLags12kHz> auto_correlation, 84 AvailableCpuFeatures cpu_features); 85 86 // Computes the pitch period at 48 kHz given a view on the 24 kHz pitch buffer, 87 // the energies for the sliding frames `y` at 24 kHz and the pitch period 88 // candidates at 24 kHz (encoded as inverted lag). 89 int ComputePitchPeriod48kHz( 90 rtc::ArrayView<const float, kBufSize24kHz> pitch_buffer, 91 rtc::ArrayView<const float, kRefineNumLags24kHz> y_energy, 92 CandidatePitchPeriods pitch_candidates_24kHz, 93 AvailableCpuFeatures cpu_features); 94 95 struct PitchInfo { 96 int period; 97 float strength; 98 }; 99 100 // Computes the pitch period at 48 kHz searching in an extended pitch range 101 // given a view on the 24 kHz pitch buffer, the energies for the sliding frames 102 // `y` at 24 kHz, the initial 48 kHz estimation (computed by 103 // `ComputePitchPeriod48kHz()`) and the last estimated pitch. 104 PitchInfo ComputeExtendedPitchPeriod48kHz( 105 rtc::ArrayView<const float, kBufSize24kHz> pitch_buffer, 106 rtc::ArrayView<const float, kRefineNumLags24kHz> y_energy, 107 int initial_pitch_period_48kHz, 108 PitchInfo last_pitch_48kHz, 109 AvailableCpuFeatures cpu_features); 110 111 } // namespace rnn_vad 112 } // namespace webrtc 113 114 #endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_ 115