xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/sad4d_avx2.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 #include <immintrin.h>  // AVX2
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx/vpx_integer.h"
13 
14 // Note with sums[4] some versions of Visual Studio may fail due to parameter
15 // alignment, though the functions should be equivalent:
16 // error C2719: 'sums': formal parameter with requested alignment of 32 won't be
17 // aligned
calc_final_4(const __m256i * const sums,uint32_t sad_array[4])18 static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
19                                 uint32_t sad_array[4]) {
20   const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
21   const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
22   const __m256i t2 = _mm256_hadd_epi32(t0, t1);
23   const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
24                                     _mm256_extractf128_si256(t2, 1));
25   _mm_storeu_si128((__m128i *)sad_array, sum);
26 }
27 
sad32xhx4d_avx2(const uint8_t * src_ptr,int src_stride,const uint8_t * const ref_array[4],int ref_stride,int h,uint32_t sad_array[4])28 static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
29                                    const uint8_t *const ref_array[4],
30                                    int ref_stride, int h,
31                                    uint32_t sad_array[4]) {
32   int i;
33   const uint8_t *refs[4];
34   __m256i sums[4];
35 
36   refs[0] = ref_array[0];
37   refs[1] = ref_array[1];
38   refs[2] = ref_array[2];
39   refs[3] = ref_array[3];
40   sums[0] = _mm256_setzero_si256();
41   sums[1] = _mm256_setzero_si256();
42   sums[2] = _mm256_setzero_si256();
43   sums[3] = _mm256_setzero_si256();
44 
45   for (i = 0; i < h; i++) {
46     __m256i r[4];
47 
48     // load src and all ref[]
49     const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
50     r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
51     r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
52     r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
53     r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
54 
55     // sum of the absolute differences between every ref[] to src
56     r[0] = _mm256_sad_epu8(r[0], s);
57     r[1] = _mm256_sad_epu8(r[1], s);
58     r[2] = _mm256_sad_epu8(r[2], s);
59     r[3] = _mm256_sad_epu8(r[3], s);
60 
61     // sum every ref[]
62     sums[0] = _mm256_add_epi32(sums[0], r[0]);
63     sums[1] = _mm256_add_epi32(sums[1], r[1]);
64     sums[2] = _mm256_add_epi32(sums[2], r[2]);
65     sums[3] = _mm256_add_epi32(sums[3], r[3]);
66 
67     src_ptr += src_stride;
68     refs[0] += ref_stride;
69     refs[1] += ref_stride;
70     refs[2] += ref_stride;
71     refs[3] += ref_stride;
72   }
73 
74   calc_final_4(sums, sad_array);
75 }
76 
sad64xhx4d_avx2(const uint8_t * src_ptr,int src_stride,const uint8_t * const ref_array[4],int ref_stride,int h,uint32_t sad_array[4])77 static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
78                                    const uint8_t *const ref_array[4],
79                                    int ref_stride, int h,
80                                    uint32_t sad_array[4]) {
81   __m256i sums[4];
82   int i;
83   const uint8_t *refs[4];
84 
85   refs[0] = ref_array[0];
86   refs[1] = ref_array[1];
87   refs[2] = ref_array[2];
88   refs[3] = ref_array[3];
89   sums[0] = _mm256_setzero_si256();
90   sums[1] = _mm256_setzero_si256();
91   sums[2] = _mm256_setzero_si256();
92   sums[3] = _mm256_setzero_si256();
93 
94   for (i = 0; i < h; i++) {
95     __m256i r_lo[4], r_hi[4];
96     // load 64 bytes from src and all ref[]
97     const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
98     const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
99     r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
100     r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
101     r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
102     r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32));
103     r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
104     r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32));
105     r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
106     r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));
107 
108     // sum of the absolute differences between every ref[] to src
109     r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
110     r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
111     r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
112     r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo);
113     r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi);
114     r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi);
115     r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
116     r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);
117 
118     // sum every ref[]
119     sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
120     sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
121     sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
122     sums[3] = _mm256_add_epi32(sums[3], r_lo[3]);
123     sums[0] = _mm256_add_epi32(sums[0], r_hi[0]);
124     sums[1] = _mm256_add_epi32(sums[1], r_hi[1]);
125     sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
126     sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);
127 
128     src_ptr += src_stride;
129     refs[0] += ref_stride;
130     refs[1] += ref_stride;
131     refs[2] += ref_stride;
132     refs[3] += ref_stride;
133   }
134 
135   calc_final_4(sums, sad_array);
136 }
137 
// Defines vpx_sad64x<height>x4d_avx2(): forwards to sad64xhx4d_avx2() with the
// row count fixed to <height>, writing the four SAD totals to sad_array.
#define SAD64_H(height)                                                  \
  void vpx_sad64x##height##x4d_avx2(                                     \
      const uint8_t *src, int src_stride,                                \
      const uint8_t *const ref_array[4], int ref_stride,                 \
      uint32_t sad_array[4]) {                                           \
    sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, (height),    \
                    sad_array);                                          \
  }
144 
// Defines vpx_sad32x<height>x4d_avx2(): forwards to sad32xhx4d_avx2() with the
// row count fixed to <height>, writing the four SAD totals to sad_array.
#define SAD32_H(height)                                                  \
  void vpx_sad32x##height##x4d_avx2(                                     \
      const uint8_t *src, int src_stride,                                \
      const uint8_t *const ref_array[4], int ref_stride,                 \
      uint32_t sad_array[4]) {                                           \
    sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, (height),    \
                    sad_array);                                          \
  }
151 
152 SAD64_H(64)
153 SAD32_H(32)
154 
// Defines vpx_sad_skip_64x<height>x4d_avx2(): runs sad64xhx4d_avx2() with both
// strides doubled and the row count halved, so only every other row is
// visited, then doubles each of the four totals in sad_array.
#define SADS64_H(height)                                                   \
  void vpx_sad_skip_64x##height##x4d_avx2(                                 \
      const uint8_t *src, int src_stride,                                  \
      const uint8_t *const ref_array[4], int ref_stride,                   \
      uint32_t sad_array[4]) {                                             \
    const int skip_src_stride = 2 * src_stride;                            \
    const int skip_ref_stride = 2 * ref_stride;                            \
    int k;                                                                 \
    sad64xhx4d_avx2(src, skip_src_stride, ref_array, skip_ref_stride,      \
                    ((height) >> 1), sad_array);                           \
    for (k = 0; k < 4; ++k) {                                              \
      sad_array[k] <<= 1;                                                  \
    }                                                                      \
  }
166 
// Defines vpx_sad_skip_32x<height>x4d_avx2(): runs sad32xhx4d_avx2() with both
// strides doubled and the row count halved, so only every other row is
// visited, then doubles each of the four totals in sad_array.
#define SADS32_H(height)                                                   \
  void vpx_sad_skip_32x##height##x4d_avx2(                                 \
      const uint8_t *src, int src_stride,                                  \
      const uint8_t *const ref_array[4], int ref_stride,                   \
      uint32_t sad_array[4]) {                                             \
    const int skip_src_stride = 2 * src_stride;                            \
    const int skip_ref_stride = 2 * ref_stride;                            \
    int k;                                                                 \
    sad32xhx4d_avx2(src, skip_src_stride, ref_array, skip_ref_stride,      \
                    ((height) >> 1), sad_array);                           \
    for (k = 0; k < 4; ++k) {                                              \
      sad_array[k] <<= 1;                                                  \
    }                                                                      \
  }
178 
179 SADS64_H(64)
180 SADS64_H(32)
181 
182 SADS32_H(64)
183 SADS32_H(32)
184 SADS32_H(16)
185