xref: /aosp_15_r20/external/flac/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_sse2.c (revision 600f14f40d737144c998e2ec7a483122d3776fbc)
1 /* This code is imported several times in lpc_intrin_sse2.c with different
2  * values for MAX_LAG. Comments are for MAX_LAG == 14.
 *
 * The fragment accumulates autoc[k] = sum over i of data[i] * data[i+k],
 * keeping two consecutive lags in each __m128d accumulator. Terms whose
 * second index would lie past the last sample contribute zero, because the
 * delay registers start out as zero.
 *
 * The number of lags computed and stored is fixed at compile time by
 * MAX_LAG, not by the runtime value of lag: 8 when MAX_LAG <= 8, 10 when
 * MAX_LAG is 9 or 10, and 14 when MAX_LAG > 10.
 *
 * data, data_len, lag and autoc are used but not declared here; presumably
 * they are parameters of the including function (confirm at the include
 * site). */
3 	int i;
4 	__m128d sum0, sum1, sum2, sum3; /* accumulators: sumn holds lags 2n and 2n+1 */
5 	__m128d d0, d1, d2, d3; /* delay registers holding the most recently visited samples */
6 #if MAX_LAG > 8
7 	__m128d d4;
8 	__m128d sum4;
9 #endif
10 #if MAX_LAG > 10
11 	__m128d d5, d6;
12 	__m128d sum5, sum6;
13 #endif
14 
	/* lag is only referenced by the assertion below; the cast presumably
	 * keeps compilers from warning about an unused variable when
	 * FLAC__ASSERT expands to nothing (verify against its definition). */
15 	(void) lag;
16 	FLAC__ASSERT(lag <= MAX_LAG);
17 
18 	/* Initialize all sum vectors and all delay registers with zero */
19 	sum0 = _mm_setzero_pd();
20 	sum1 = _mm_setzero_pd();
21 	sum2 = _mm_setzero_pd();
22 	sum3 = _mm_setzero_pd();
23 	d0 = _mm_setzero_pd();
24 	d1 = _mm_setzero_pd();
25 	d2 = _mm_setzero_pd();
26 	d3 = _mm_setzero_pd();
27 #if MAX_LAG > 8
28 	sum4 = _mm_setzero_pd();
29 	d4 = _mm_setzero_pd();
30 #endif
31 #if MAX_LAG > 10
32 	sum5 = _mm_setzero_pd();
33 	sum6 = _mm_setzero_pd();
34 	d5 = _mm_setzero_pd();
35 	d6 = _mm_setzero_pd();
36 #endif
37 
38 	/* Loop backwards through the samples, from index data_len-1 down to 0 */
39 	for(i = data_len-1; i >= 0; i--) {
		/* Broadcast the current sample, as a double, into both lanes */
40 		__m128d d = _mm_set1_pd(data[i]);
41 
42 		/* The next lines of code work like a queue. For more
43 		 * information see the lag8 version of this function.
		 *
		 * Each shuffle (immediate value 1) builds a vector whose lower
		 * lane is the upper lane of the first operand and whose upper
		 * lane is the lower lane of the second operand, i.e. the whole
		 * chain d0..d6 moves along by one sample and data[i] enters at
		 * the lower lane of d0. Afterwards dn holds
		 * { data[i+2n], data[i+2n+1] }, with zero in place of samples
		 * past the end of the data.
		 *
		 * The registers must be updated from the highest to the lowest
		 * so that each one reads its predecessor before that
		 * predecessor is overwritten. */
44 #if MAX_LAG > 10
45 		d6 = _mm_shuffle_pd(d5, d6, _MM_SHUFFLE(0,0,0,1));
46 		d5 = _mm_shuffle_pd(d4, d5, _MM_SHUFFLE(0,0,0,1));
47 #endif
48 #if MAX_LAG > 8
49 		d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1));
50 #endif
51 		d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1));
52 		d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1));
53 		d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1));
54 		d0 = _mm_shuffle_pd(d,  d0, _MM_SHUFFLE(0,0,0,1));
55 
56 		/* sumn += d*dn, so the lower lane of sumn accumulates
		 * data[i]*data[i+2n] and the upper lane data[i]*data[i+2n+1] */
57 		sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
58 		sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
59 		sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
60 		sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
61 #if MAX_LAG > 8
62 		sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4));
63 #endif
64 #if MAX_LAG > 10
65 		sum5 = _mm_add_pd(sum5, _mm_mul_pd(d, d5));
66 		sum6 = _mm_add_pd(sum6, _mm_mul_pd(d, d6));
67 #endif
68 	}
69 
70 	/* Store sum0..sum6 in autoc[0..13] using unaligned stores. Every
	 * accumulator compiled in for this MAX_LAG is written, whatever the
	 * value of lag, so autoc needs room for 8, 10 or 14 doubles. */
71 	_mm_storeu_pd(autoc,   sum0);
72 	_mm_storeu_pd(autoc+2, sum1);
73 	_mm_storeu_pd(autoc+4, sum2);
74 	_mm_storeu_pd(autoc+6 ,sum3);
75 #if MAX_LAG > 8
76 	_mm_storeu_pd(autoc+8, sum4);
77 #endif
78 #if MAX_LAG > 10
79 	_mm_storeu_pd(autoc+10,sum5);
80 	_mm_storeu_pd(autoc+12,sum6);
81 #endif
82