/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>  // AVX2
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

// Note with sums[4] some versions of Visual Studio may fail due to parameter
// alignment, though the functions should be equivalent:
// error C2719: 'sums': formal parameter with requested alignment of 32 won't be
// aligned
static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
                                uint32_t sad_array[4]) {
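  // Each sums[i] holds four 64-bit partial SADs from vpsadbw (one per 8-byte
  // group). The hadd/extract/add sequence below folds each register down to a
  // single 32-bit total and packs the four totals into sad_array[0..3].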
  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
                                    _mm256_extractf128_si256(t2, 1));
  _mm_storeu_si128((__m128i *)sad_array, sum);
}
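
// Computes the SADs of a 32-wide block of height h against the four reference
// blocks in ref_array[], writing one result per reference to sad_array[].
// src rows are read with aligned 32-byte loads, so src_ptr and src_stride must
// keep every row 32-byte aligned; the references use unaligned loads.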
static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *const ref_array[4],
                                   int ref_stride, int h,
                                   uint32_t sad_array[4]) {
  int i;
  const uint8_t *refs[4];
  __m256i sums[4];

  refs[0] = ref_array[0];
  refs[1] = ref_array[1];
  refs[2] = ref_array[2];
  refs[3] = ref_array[3];
  sums[0] = _mm256_setzero_si256();
  sums[1] = _mm256_setzero_si256();
  sums[2] = _mm256_setzero_si256();
  sums[3] = _mm256_setzero_si256();

  for (i = 0; i < h; i++) {
    __m256i r[4];

    // load one 32-byte row of src and of each ref[]
    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);

    // sum of the absolute differences between each ref[] row and the src row
    r[0] = _mm256_sad_epu8(r[0], s);
    r[1] = _mm256_sad_epu8(r[1], s);
    r[2] = _mm256_sad_epu8(r[2], s);
    r[3] = _mm256_sad_epu8(r[3], s);

    // accumulate the per-row SADs for each ref[]; each vpsadbw result fits in
    // 16 bits, so these 32-bit adds never overflow for any supported height
    sums[0] = _mm256_add_epi32(sums[0], r[0]);
    sums[1] = _mm256_add_epi32(sums[1], r[1]);
    sums[2] = _mm256_add_epi32(sums[2], r[2]);
    sums[3] = _mm256_add_epi32(sums[3], r[3]);

    src_ptr += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }

  calc_final_4(sums, sad_array);
}
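
// Computes the SADs of a 64-wide block of height h against the four reference
// blocks in ref_array[], writing one result per reference to sad_array[].
// Each 64-byte row is handled as two 32-byte halves; src rows use aligned
// loads, the references use unaligned loads.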
static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *const ref_array[4],
                                   int ref_stride, int h,
                                   uint32_t sad_array[4]) {
  __m256i sums[4];
  int i;
  const uint8_t *refs[4];

  refs[0] = ref_array[0];
  refs[1] = ref_array[1];
  refs[2] = ref_array[2];
  refs[3] = ref_array[3];
  sums[0] = _mm256_setzero_si256();
  sums[1] = _mm256_setzero_si256();
  sums[2] = _mm256_setzero_si256();
  sums[3] = _mm256_setzero_si256();

  for (i = 0; i < h; i++) {
    __m256i r_lo[4], r_hi[4];
    // load one 64-byte row of src and of each ref[] as two 32-byte halves
    const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
    r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
    r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32));
    r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32));
    r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
    r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));

    // sum of the absolute differences between each ref[] row and the src row
    r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
    r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
    r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
    r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo);
    r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi);
    r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi);
    r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
    r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);

    // accumulate the low and high halves of the per-row SADs for each ref[]
    sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
    sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
    sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
    sums[3] = _mm256_add_epi32(sums[3], r_lo[3]);
    sums[0] = _mm256_add_epi32(sums[0], r_hi[0]);
    sums[1] = _mm256_add_epi32(sums[1], r_hi[1]);
    sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
    sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);

    src_ptr += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }

  calc_final_4(sums, sad_array);
}
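
// The macros below expand to the vpx_sad<W>x<H>x4d_avx2() entry points that
// the rest of the library reaches through vpx_dsp_rtcd.h, forwarding to the
// width-specific helpers above with the block height baked in.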
#define SAD64_H(h)                                                          \
  void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride,         \
                               const uint8_t *const ref_array[4],          \
                               int ref_stride, uint32_t sad_array[4]) {    \
    sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
  }

#define SAD32_H(h)                                                          \
  void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride,         \
                               const uint8_t *const ref_array[4],          \
                               int ref_stride, uint32_t sad_array[4]) {    \
    sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
  }

SAD64_H(64)
SAD32_H(32)
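
// Usage sketch (illustrative only; src, src_stride, ref0..ref3, and
// ref_stride are placeholders, not names defined in this file):
//   const uint8_t *refs[4] = { ref0, ref1, ref2, ref3 };
//   uint32_t sads[4];
//   vpx_sad64x64x4d_avx2(src, src_stride, refs, ref_stride, sads);
//   // sads[i] now holds the 64x64 SAD between src and refs[i].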
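
// The _skip_ variants below approximate the full SAD by sampling every other
// row: the strides are doubled, the height is halved, and the result is
// doubled to keep it on the same scale as the full-height SAD.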
#define SADS64_H(h)                                                           \
  void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride,     \
                                     const uint8_t *const ref_array[4],      \
                                     int ref_stride, uint32_t sad_array[4]) { \
    sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,          \
                    ((h) >> 1), sad_array);                                   \
    sad_array[0] <<= 1;                                                       \
    sad_array[1] <<= 1;                                                       \
    sad_array[2] <<= 1;                                                       \
    sad_array[3] <<= 1;                                                       \
  }

#define SADS32_H(h)                                                           \
  void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride,     \
                                     const uint8_t *const ref_array[4],      \
                                     int ref_stride, uint32_t sad_array[4]) { \
    sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,          \
                    ((h) >> 1), sad_array);                                   \
    sad_array[0] <<= 1;                                                       \
    sad_array[1] <<= 1;                                                       \
    sad_array[2] <<= 1;                                                       \
    sad_array[3] <<= 1;                                                       \
  }

SADS64_H(64)
SADS64_H(32)

SADS32_H(64)
SADS32_H(32)
SADS32_H(16)