xref: /aosp_15_r20/external/webrtc/modules/audio_processing/agc2/rnn_vad/pitch_search_internal.h (revision d9f758449e529ab9291ac668be2861e7a55c2422)
1 /*
2  *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_
12 #define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_
13 
14 #include <stddef.h>
15 
16 #include <array>
17 #include <utility>
18 
19 #include "api/array_view.h"
20 #include "modules/audio_processing/agc2/cpu_features.h"
21 #include "modules/audio_processing/agc2/rnn_vad/common.h"
22 
23 namespace webrtc {
24 namespace rnn_vad {
25 
26 // Performs 2x decimation without any anti-aliasing filter.
// `src` is a read-only view of `kBufSize24kHz` samples; `dst` is a writable
// view of `kBufSize12kHz` samples (the output).
// NOTE(review): which sample of each input pair is kept is not visible from
// this declaration - check the definition.
27 void Decimate2x(rtc::ArrayView<const float, kBufSize24kHz> src,
28                 rtc::ArrayView<float, kBufSize12kHz> dst);
29 
30 // Key concepts and keywords used below in this file.
31 //
32 // The pitch estimation relies on a pitch buffer, which is an array-like data
33 // structure designed as follows:
34 //
35 // |....A....|.....B.....|
36 //
37 // The part on the left, named `A` contains the oldest samples, whereas `B`
38 // contains the most recent ones. The size of `A` corresponds to the maximum
39 // pitch period, that of `B` to the analysis frame size (e.g., 16 ms and 20 ms
40 // respectively).
41 //
42 // Pitch estimation is essentially based on the analysis of two 20 ms frames
43 // extracted from the pitch buffer. One frame, called `x`, is kept fixed and
44 // corresponds to `B` - i.e., the most recent 20 ms. The other frame, called
45 // `y`, is extracted from different parts of the buffer instead.
46 //
47 // The offset between `x` and `y` corresponds to a specific pitch period.
48 // For instance, if `y` is positioned at the beginning of the pitch buffer, then
49 // the cross-correlation between `x` and `y` can be used as an indication of the
50 // strength for the maximum pitch.
51 //
52 // Such an offset can be encoded in two ways:
53 // - As a lag, which is the index in the pitch buffer for the first item in `y`
54 // - As an inverted lag, which is the number of samples between the beginning
55 //   of `y` and the beginning of `x` (see the diagram below)
56 //
57 // |---->| lag
58 // |....A....|.....B.....|
59 //       |<--| inverted lag
60 //       |.....y.....| `y` 20 ms frame
61 //
62 // The inverted lag has the advantage of being directly proportional to the
63 // corresponding pitch period.
64 
65 // Computes the sum of squared samples for every sliding frame `y` in the pitch
66 // buffer. The indexes of `y_energy` are inverted lags.
// `pitch_buffer` is a read-only view of `kBufSize24kHz` samples and `y_energy`
// is a writable view of `kRefineNumLags24kHz` values (the output).
// NOTE(review): `cpu_features` presumably selects an optimized implementation
// - confirm against the definition.
67 void ComputeSlidingFrameSquareEnergies24kHz(
68     rtc::ArrayView<const float, kBufSize24kHz> pitch_buffer,
69     rtc::ArrayView<float, kRefineNumLags24kHz> y_energy,
70     AvailableCpuFeatures cpu_features);
71 
// Top-2 pitch period candidates. Unit: number of samples - i.e., inverted lags.
// Both members default to zero so that a default-constructed instance never
// holds indeterminate values. The struct remains an aggregate, hence
// `CandidatePitchPeriods{best, second_best}` keeps working.
struct CandidatePitchPeriods {
  int best = 0;         // Strongest candidate.
  int second_best = 0;  // Runner-up candidate.
};
77 
78 // Computes the candidate pitch periods at 12 kHz given a view on the 12 kHz
79 // pitch buffer and the auto-correlation values (having inverted lags as
80 // indexes).
// `pitch_buffer` holds `kBufSize12kHz` samples and `auto_correlation` holds
// `kNumLags12kHz` values; both are read-only. The two candidates are returned
// as a `CandidatePitchPeriods` (`best` and `second_best`).
// NOTE(review): `cpu_features` presumably selects an optimized implementation
// - confirm against the definition.
81 CandidatePitchPeriods ComputePitchPeriod12kHz(
82     rtc::ArrayView<const float, kBufSize12kHz> pitch_buffer,
83     rtc::ArrayView<const float, kNumLags12kHz> auto_correlation,
84     AvailableCpuFeatures cpu_features);
85 
86 // Computes the pitch period at 48 kHz given a view on the 24 kHz pitch buffer,
87 // the energies for the sliding frames `y` at 24 kHz and the pitch period
88 // candidates at 24 kHz (encoded as inverted lag).
// `pitch_buffer` holds `kBufSize24kHz` samples and `y_energy` holds
// `kRefineNumLags24kHz` values; both are read-only. The result is returned as
// an `int`.
// NOTE(review): `y_energy` presumably comes from
// `ComputeSlidingFrameSquareEnergies24kHz()` and `cpu_features` presumably
// selects an optimized implementation - confirm against the definition.
89 int ComputePitchPeriod48kHz(
90     rtc::ArrayView<const float, kBufSize24kHz> pitch_buffer,
91     rtc::ArrayView<const float, kRefineNumLags24kHz> y_energy,
92     CandidatePitchPeriods pitch_candidates_24kHz,
93     AvailableCpuFeatures cpu_features);
94 
// Pitch estimate made of a period and its associated strength.
// NOTE(review): `period` is presumably expressed in samples at 48 kHz, since
// this type is exchanged with `ComputeExtendedPitchPeriod48kHz()` - confirm
// against the definition.
// Both members default to zero so that a default-constructed instance never
// holds indeterminate values. The struct remains an aggregate, hence
// `PitchInfo{period, strength}` keeps working.
struct PitchInfo {
  int period = 0;
  float strength = 0.f;
};
99 
100 // Computes the pitch period at 48 kHz searching in an extended pitch range
101 // given a view on the 24 kHz pitch buffer, the energies for the sliding frames
102 // `y` at 24 kHz, the initial 48 kHz estimation (computed by
103 // `ComputePitchPeriod48kHz()`) and the last estimated pitch.
// `pitch_buffer` holds `kBufSize24kHz` samples and `y_energy` holds
// `kRefineNumLags24kHz` values; both are read-only. The result is returned as
// a `PitchInfo`, i.e., a period together with a strength.
// NOTE(review): `cpu_features` presumably selects an optimized implementation
// - confirm against the definition.
104 PitchInfo ComputeExtendedPitchPeriod48kHz(
105     rtc::ArrayView<const float, kBufSize24kHz> pitch_buffer,
106     rtc::ArrayView<const float, kRefineNumLags24kHz> y_energy,
107     int initial_pitch_period_48kHz,
108     PitchInfo last_pitch_48kHz,
109     AvailableCpuFeatures cpu_features);
110 
111 }  // namespace rnn_vad
112 }  // namespace webrtc
113 
114 #endif  // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_
115