/*
 * Copyright (c) 2021 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

// Filter values always sum to 128.
#define FILTER_SUM 128

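// dot_prod_permute_tbl: each 16-byte row below gathers four overlapping
// 4-byte windows (starting at offsets n, n+1, n+2, n+3 with n = 0, 4, 8) from
// a single 16-byte load, so that one vdotq_lane_s32 (SDOT) accumulates four
// filter taps for four neighbouring output pixels at once. The three rows
// provide the windows needed for taps 0-3 and 4-7 of up to eight adjacent
// outputs.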
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
  4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};

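// dot_prod_merge_block_tbl: indices 0-15 select bytes from the first vector
// and indices 16-31 from the second vector of the int8x16x2_t passed to
// vqtbl2q_s8. The vertical kernels below use this to slide a transposed 4x4
// sample block left by one, two or three columns and append columns taken
// from the newly loaded rows, avoiding a fresh transpose every iteration.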
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  // Shift left and insert two new columns in transposed 4x4 block.
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  // Shift left and insert three new columns in transposed 4x4 block.
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};

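// Each of the convolve helpers below computes, per output pixel x,
//
//   dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum_k src[x + k] * filter[k],
//                                          FILTER_BITS))
//
// using the Arm dot-product (SDOT) instruction via vdotq_lane_s32. SDOT only
// accepts signed 8-bit inputs, so the unsigned samples are first biased down
// by 128. Since the filter taps sum to FILTER_SUM:
//
//   sum_k (src[x + k] - 128) * filter[k] =
//       sum_k src[x + k] * filter[k] - 128 * FILTER_SUM
//
// so seeding the accumulator with 128 * FILTER_SUM restores the correct
// total. The 4-tap paths additionally halve the (always even) filter values,
// which halves both the correction term and the final rounding shift
// (FILTER_BITS - 1 instead of FILTER_BITS).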
static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);

  // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
  // by 2 since we halved the filter values.)
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
  int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x2_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };

  // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
  // by 2 since we halved the filter values.)
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
  // First 4 output values.
  int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
  // Second 4 output values.
  int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x2_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };

  // Accumulate into 128 * FILTER_SUM to account for range transform.
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
  int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
  sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1);

  // Further narrowing and packing is performed by the caller.
  return vshrn_n_s32(sum, 1);
}

static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x3_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };

  // Accumulate into 128 * FILTER_SUM to account for range transform.
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
  // First 4 output values.
  int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
  sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1);
  // Second 4 output values.
  int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
  sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static INLINE void convolve_4tap_horiz_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
      int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
      int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
      int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
        uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
        uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
        uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

static INLINE void convolve_8tap_horiz_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
  if (w == 4) {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
      int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
      int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
      int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *filter, int x0_q4,
                                      int x_step_q4, int y0_q4, int y_step_q4,
                                      int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
    // Load 4-tap filter into first 4 elements of the vector.
    // All 4-tap and bilinear filter values are even, so halve them to reduce
    // intermediate precision requirements.
    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
    const int8x8_t x_filter_4tap =
        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);

    convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, w, h,
                                     x_filter_4tap);

  } else {
    const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));

    convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, w, h,
                                     x_filter_8tap);
  }
}

void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
                                          ptrdiff_t src_stride, uint8_t *dst,
                                          ptrdiff_t dst_stride,
                                          const InterpKernel *filter, int x0_q4,
                                          int x_step_q4, int y0_q4,
                                          int y_step_q4, int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (w == 4) {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
      int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
      int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
      int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);

        uint8x8_t dd0, dd1, dd2, dd3;
        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        d0 = vrhadd_u8(d0, dd0);
        d1 = vrhadd_u8(d1, dd1);
        d2 = vrhadd_u8(d2, dd2);
        d3 = vrhadd_u8(d3, dd3);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                        int8x8_t a3, int8x16_t *b) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, XX, XX, XX, XX
  // a1: 10, 11, 12, 13, XX, XX, XX, XX
  // a2: 20, 21, 22, 23, XX, XX, XX, XX
  // a3: 30, 31, 32, 33, XX, XX, XX, XX
  //
  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33

  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));

  int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
  int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];

  int16x8_t a0123 =
      vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0];

  *b = vreinterpretq_s8_s16(a0123);
}

static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                        int8x8_t a3, int8x16_t *b0,
                                        int8x16_t *b1) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, 04, 05, 06, 07
  // a1: 10, 11, 12, 13, 14, 15, 16, 17
  // a2: 20, 21, 22, 23, 24, 25, 26, 27
  // a3: 30, 31, 32, 33, 34, 35, 36, 37
  //
  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37

  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));

  int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
  int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];

  int16x8x2_t a0123 =
      vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23));

  *b0 = vreinterpretq_s8_s16(a0123.val[0]);
  *b1 = vreinterpretq_s8_s16(a0123.val[1]);
}

static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo,
                                      const int8x16_t samples_hi,
                                      const int8x8_t filters) {
  // The sample range transform and permutation are performed by the caller.

  // Accumulate into 128 * FILTER_SUM to account for range transform.
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
  int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0);
  sum = vdotq_lane_s32(sum, samples_hi, filters, 1);

  // Further narrowing and packing is performed by the caller.
  return vshrn_n_s32(sum, 1);
}

static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
                                      const int8x16_t samples0_hi,
                                      const int8x16_t samples1_lo,
                                      const int8x16_t samples1_hi,
                                      const int8x8_t filters) {
  // The sample range transform and permutation are performed by the caller.

  // Accumulate into 128 * FILTER_SUM to account for range transform.
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
  // First 4 output values.
  int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0);
  sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
  // Second 4 output values.
  int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0);
  sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

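// Vertical 8-tap filtering. The first seven rows are loaded and passed
// through transpose_concat_4x4/_8x4 so that each 16-byte vector (e.g. s0123)
// holds four consecutive rows of the same four columns, ready for SDOT. Each
// loop iteration then loads only four new rows; dot_prod_merge_block_tbl
// shifts the previous block left and merges in the new columns to form s4567,
// s5678 and s6789 without redoing the transpose.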
static INLINE void convolve_8tap_vert_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);

  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    src += 7 * src_stride;

    // Transform sample range to [-128, 127] for 8-bit signed dot product.
    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

    // This operation combines a conventional transpose and the sample permute
    // (see horizontal case) required before computing the dot product.
    int8x16_t s0123, s1234, s2345, s3456;
    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_4x4(s3, s4, s5, s6, &s3456);

    do {
      uint8x8_t t7, t8, t9, t10;
      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
      int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));

      int8x16_t s78910;
      transpose_concat_4x4(s7, s8, s9, s10, &s78910);

      // Merge new data into block from previous iteration.
      int8x16x2_t samples_LUT = { { s3456, s78910 } };
      int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      // Transform sample range to [-128, 127] for 8-bit signed dot product.
      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

      // This operation combines a conventional transpose and the sample
      // permute (see horizontal case) required before computing the dot
      // product.
      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        uint8x8_t t7, t8, t9, t10;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
        int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));

        int8x16_t s78910_lo, s78910_hi;
        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);

        // Merge new data into block from previous iteration.
        int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
        int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
        uint8x8_t d1 =
            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
        uint8x8_t d2 =
            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
        uint8x8_t d3 =
            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);

    convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
                            y_filter);
  } else {
    const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));

    convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
                                    dst_stride, w, h, y_filter);
  }
}

void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
                                         ptrdiff_t src_stride, uint8_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    src += 7 * src_stride;

    // Transform sample range to [-128, 127] for 8-bit signed dot product.
    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

    // This operation combines a conventional transpose and the sample permute
    // (see horizontal case) required before computing the dot product.
    int8x16_t s0123, s1234, s2345, s3456;
    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_4x4(s3, s4, s5, s6, &s3456);

    do {
      uint8x8_t t7, t8, t9, t10;
      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
      int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));

      int8x16_t s78910;
      transpose_concat_4x4(s7, s8, s9, s10, &s78910);

      // Merge new data into block from previous iteration.
      int8x16x2_t samples_LUT = { { s3456, s78910 } };
      int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
      int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
      int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
      int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      // Transform sample range to [-128, 127] for 8-bit signed dot product.
      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

      // This operation combines a conventional transpose and the sample
      // permute (see horizontal case) required before computing the dot
      // product.
      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        uint8x8_t t7, t8, t9, t10;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
        int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));

        int8x16_t s78910_lo, s78910_hi;
        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);

        // Merge new data into block from previous iteration.
        int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
        int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
        uint8x8_t d1 =
            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
        uint8x8_t d2 =
            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
        uint8x8_t d3 =
            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);

        uint8x8_t dd0, dd1, dd2, dd3;
        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        d0 = vrhadd_u8(d0, dd0);
        d1 = vrhadd_u8(d1, dd1);
        d2 = vrhadd_u8(d2, dd2);
        d3 = vrhadd_u8(d3, dd3);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

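// 2-D 4-tap filtering, fused into a single pass: the horizontal dot-product
// results are rounded back to 8 bits and kept in registers (v_s*), then fed
// straight into the vertical convolve4_8() helper, so no intermediate buffer
// is needed. The 8-tap 2-D path below instead writes an intermediate block of
// h + 7 horizontally filtered rows to im_block and filters it vertically in a
// second pass.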
static INLINE void convolve_4tap_2d_neon_dotprod(const uint8_t *src,
                                                 ptrdiff_t src_stride,
                                                 uint8_t *dst,
                                                 ptrdiff_t dst_stride, int w,
                                                 int h, const int8x8_t x_filter,
                                                 const uint8x8_t y_filter) {
  // Neon does not have lane-referencing multiply or multiply-accumulate
  // instructions that operate on vectors of 8-bit elements. This means we have
  // to duplicate filter taps into a whole vector and use standard multiply /
  // multiply-accumulate instructions.
  const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
                                       vdup_lane_u8(y_filter, 3),
                                       vdup_lane_u8(y_filter, 4),
                                       vdup_lane_u8(y_filter, 5) };

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);

    uint8x16_t h_s0, h_s1, h_s2;
    load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);

    int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
    int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
    int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
    // We halved the filter values so -1 from right shift.
    uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
    uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);

    src += 3 * src_stride;

    do {
      uint8x16_t h_s3, h_s4, h_s5, h_s6;
      load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);

      int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
      int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
      int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
      int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
      // We halved the filter values so -1 from right shift.
      uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
      uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
      uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
      uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);

      uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
      uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);

      store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);

      v_s01 = v_s45;
      v_s12 = v_s56;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int height = h;

      uint8x16_t h_s0, h_s1, h_s2;
      load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);

      uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
      uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
      uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);

      s += 3 * src_stride;

      do {
        uint8x16_t h_s3, h_s4, h_s5, h_s6;
        load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);

        uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
        uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
        uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
        uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);

        uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
        uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
        uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
        uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        v_s0 = v_s4;
        v_s1 = v_s5;
        v_s2 = v_s6;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

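// Horizontal pass of the 8-tap 2-D filter. This writes horizontally filtered
// rows into the intermediate buffer; h here is the intermediate height (block
// height + 7), which is never a multiple of 4, so the final three rows are
// handled separately after the main loop.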
static INLINE void convolve_8tap_2d_horiz_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
  if (w == 4) {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
      int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
      int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
      int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 3);

    // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod()
    // below for further details on possible values of block height.
    uint8x16_t s0, s1, s2;
    load_u8_16x3(src, src_stride, &s0, &s1, &s2);

    int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
    int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
    int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
    uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
    uint8x8_t d23 =
        vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);

    store_u8(dst + 0 * dst_stride, dst_stride, d01);
    store_u8_4x1(dst + 2 * dst_stride, d23);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 3);

    // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod()
    // below for further details on possible values of block height.
    const uint8_t *s = src;
    uint8_t *d = dst;
    int width = w;

    do {
      uint8x16_t s0, s1, s2;
      load_u8_16x3(s, src_stride, &s0, &s1, &s2);

      uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
      uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
      uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);

      store_u8_8x3(d, dst_stride, d0, d1, d2);

      s += 8;
      d += 8;
      width -= 8;
    } while (width > 0);
  }
}

void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h) {
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;

  const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
  const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
  // lines post both horizontally and vertically.
  const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
  const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;

  if (x_filter_taps == 4 && y_filter_taps == 4) {
    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);

    // 4-tap and bilinear filter values are even, so halve them to reduce
    // intermediate precision requirements.
    const int8x8_t x_filter_4tap =
        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
    const uint8x8_t y_filter_4tap =
        vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);

    convolve_4tap_2d_neon_dotprod(src - horiz_offset - vert_offset, src_stride,
                                  dst, dst_stride, w, h, x_filter_4tap,
                                  y_filter_4tap);
    return;
  }

  // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
  // maximum buffer size to 64 * (64 + 7).
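  // The intermediate block is h + SUBPEL_TAPS - 1 = h + 7 rows tall. Block
  // heights are multiples of 4, so the horizontal pass above always ends with
  // a tail of exactly three rows (see convolve_8tap_2d_horiz_neon_dotprod).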
  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
  const int im_stride = 64;
  const int im_height = h + SUBPEL_TAPS - 1;

  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));

  convolve_8tap_2d_horiz_neon_dotprod(src - horiz_offset - vert_offset,
                                      src_stride, im_block, im_stride, w,
                                      im_height, x_filter_8tap);

  convolve_8tap_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, w, h,
                                  y_filter_8tap);
}

void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h) {
  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
  const int im_stride = 64;

  // Averaging convolution always uses an 8-tap filter.
  // Account for the vertical phase needing 3 lines prior and 4 lines post.
  const int im_height = h + SUBPEL_TAPS - 1;
  const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;

  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);

  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));

  convolve_8tap_2d_horiz_neon_dotprod(src - offset - offset * src_stride,
                                      src_stride, im_block, im_stride, w,
                                      im_height, x_filter_8tap);

  vpx_convolve8_avg_vert_neon_dotprod(im_block + offset * im_stride, im_stride,
                                      dst, dst_stride, filter, x0_q4, x_step_q4,
                                      y0_q4, y_step_q4, w, h);
}