xref: /aosp_15_r20/external/flac/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c (revision 600f14f40d737144c998e2ec7a483122d3776fbc)
// Autocorrelation kernel written with NEON double-precision (float64x2_t) intrinsics.
// Computes autoc[k] = sum over i of data[i] * data[i + k]; terms with i + k >= data_len
// contribute zero because the window vectors d0..d6 start out as zero.
// This text has no function header: data, data_len, lag, autoc and MAX_LAG are not defined
// here and must be supplied by the surrounding context (not visible in this fragment).
// Each accumulator sumN holds two adjacent lags: lane 0 is lag 2N, lane 1 is lag 2N + 1.
// The number of lags computed is fixed at compile time by MAX_LAG (8, 10 or 14 values are
// written to autoc), independent of the runtime value of lag.
1 	int i;
2 	float64x2_t sum0 = vdupq_n_f64(0.0f);
3 	float64x2_t sum1 = vdupq_n_f64(0.0f);
4 	float64x2_t sum2 = vdupq_n_f64(0.0f);
5 	float64x2_t sum3 = vdupq_n_f64(0.0f);
6 	float64x2_t d0 = vdupq_n_f64(0.0f);
7 	float64x2_t d1 = vdupq_n_f64(0.0f);
8 	float64x2_t d2 = vdupq_n_f64(0.0f);
9 	float64x2_t d3 = vdupq_n_f64(0.0f);
10 #if MAX_LAG > 8
11 	float64x2_t sum4 = vdupq_n_f64(0.0f);
12 	float64x2_t d4 = vdupq_n_f64(0.0f);
13 #endif
14 #if MAX_LAG > 10
15 	float64x2_t sum5 = vdupq_n_f64(0.0f);
16 	float64x2_t sum6 = vdupq_n_f64(0.0f);
17 	float64x2_t d5 = vdupq_n_f64(0.0f);
18 	float64x2_t d6 = vdupq_n_f64(0.0f);
19 #endif
20 	float64x2_t d;
21 
22 	(void)lag; // lag is otherwise only referenced by the assertion below; presumably silences an unused warning when assertions are compiled out
23 	FLAC__ASSERT(lag <= MAX_LAG);
24 
25 	// Loop backwards through the samples, from data_len - 1 down to 0
26 	for (i = data_len - 1; i >= 0; i--)
27 	{
28 		d = vdupq_n_f64(data[i]); // Broadcast data[i] into both lanes
29 
30 		// Shift the sample window held in d0..d3 (d0..d4 or d0..d6 for larger MAX_LAG) by one
31 		// element, highest vector first so that every vext still reads its neighbour's old value.
32 		// The last line pushes data[i] into lane 0 of d0; the oldest element of the window falls off
33 #if MAX_LAG > 10
34 		d6 = vextq_f64(d5,d6,1);
35 		d5 = vextq_f64(d4,d5,1);
36 #endif
37 #if MAX_LAG > 8
38 		d4 = vextq_f64(d3,d4,1);
39 #endif
40 		d3 = vextq_f64(d2,d3,1);
41 		d2 = vextq_f64(d1,d2,1);
42 		d1 = vextq_f64(d0,d1,1);
43 		d0 = vextq_f64(d,d0,1); // d0 is now { data[i], data[i+1] } (zero where past the end)
44 
45 		// Fused multiply-add: sumN += d * dN, so the lanes of sumN accumulate lags 2N and 2N + 1
46 		sum0 = vfmaq_f64(sum0, d, d0);
47 		sum1 = vfmaq_f64(sum1, d, d1);
48 		sum2 = vfmaq_f64(sum2, d, d2);
49 		sum3 = vfmaq_f64(sum3, d, d3);
50 #if MAX_LAG > 8
51 		sum4 = vfmaq_f64(sum4, d, d4);
52 #endif
53 #if MAX_LAG > 10
54 		sum5 = vfmaq_f64(sum5, d, d5);
55 		sum6 = vfmaq_f64(sum6, d, d6);
56 #endif
57 	}
58 
59     // Store the accumulators: 8, 10 or 14 lag values depending on MAX_LAG (autoc[0..13] at most)
60     vst1q_f64(autoc, sum0);
61     vst1q_f64(autoc + 2, sum1);
62     vst1q_f64(autoc + 4, sum2);
63     vst1q_f64(autoc + 6, sum3);
64 #if MAX_LAG > 8
65     vst1q_f64(autoc + 8, sum4);
66 #endif
67 #if MAX_LAG > 10
68     vst1q_f64(autoc + 10, sum5);
69     vst1q_f64(autoc + 12, sum6);
70 #endif
71