1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <string.h>
13 #include "./vpx_config.h"
14 #include "./vp8_rtcd.h"
15 #include "vpx_dsp/arm/mem_neon.h"
16 #include "vpx_ports/mem.h"
17
18 static const int8_t vp8_sub_pel_filters[8][8] = {
19 { 0, 0, -128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
20 { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */
21 { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
22 { 0, -9, 93, 50, -6, 0, 0, 0 },
23 { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
24 { 0, -6, 50, 93, -9, 0, 0, 0 },
25 { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
26 { 0, -1, 12, 123, -6, 0, 0, 0 },
27 };
28
29 // This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
30 // Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
31 // Elements 1 and 4 are either 0 or negative. The code accounts for this with
32 // multiply/accumulates which either add or subtract as needed. The other
33 // functions will be updated to use this table later.
34 // It is also expanded to 8 elements to allow loading into 64-bit NEON
35 // registers.
36 static const uint8_t abs_filters[8][8] = {
37 { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 },
38 { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
39 { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
40 { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
41 };
42
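// Load 8 bytes and shift the first four pixels into the upper half of the
// 64-bit lane so vext can pair them with the first four pixels of the next
// row.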
43 static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
44 return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
45 }
46
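// a and b each hold two rows in one 128-bit register. Zip the 32-bit lanes
// so the first four pixels of both rows land in a single 64-bit vector, then
// multiply by 'filter' and accumulate into *c and *d (vmlal here, vmlsl in
// the _sub_ variant below).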
47 static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
48 const uint8x8_t filter, uint16x8_t *c,
49 uint16x8_t *d) {
50 const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
51 vreinterpret_u32_u8(vget_high_u8(a)));
52 const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
53 vreinterpret_u32_u8(vget_high_u8(b)));
54 *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
55 *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
56 }
57
58 static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
59 const uint8x8_t filter, uint16x8_t *c,
60 uint16x8_t *d) {
61 const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
62 vreinterpret_u32_u8(vget_high_u8(a)));
63 const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
64 vreinterpret_u32_u8(vget_high_u8(b)));
65 *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
66 *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
67 }
68
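// Second-pass-only path for the 4x4 predictor: applies the vertical 6-tap
// filter to 9 input rows (2 above, 4 in the block, 3 below) and writes 4
// output rows.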
69 static INLINE void yonly4x4(const unsigned char *src, int src_stride,
70 int filter_offset, unsigned char *dst,
71 int dst_stride) {
72 uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
73 uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
74 uint16x8_t c0, c1, c2, c3;
75 int16x8_t d0, d1;
76 uint8x8_t e0, e1;
77
78 const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
79 const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
80 const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
81 const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
82 const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
83 const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
84 const uint8x8_t filter5 = vdup_lane_u8(filter, 5);
85
86 src -= src_stride * 2;
87 // Shift the even rows to allow using 'vext' to combine the vectors. armv8
88 // has vcopy_lane which would be interesting. This started as just a
89 // horrible workaround for clang adding alignment hints to 32-bit loads:
90 // https://llvm.org/bugs/show_bug.cgi?id=24421
91 // But it turns out it is almost identical to casting the loads.
92 a0 = load_and_shift(src);
93 src += src_stride;
94 a1 = vld1_u8(src);
95 src += src_stride;
96 a2 = load_and_shift(src);
97 src += src_stride;
98 a3 = vld1_u8(src);
99 src += src_stride;
100 a4 = load_and_shift(src);
101 src += src_stride;
102 a5 = vld1_u8(src);
103 src += src_stride;
104 a6 = load_and_shift(src);
105 src += src_stride;
106 a7 = vld1_u8(src);
107 src += src_stride;
108 a8 = vld1_u8(src);
109
110 // Combine the rows so we can operate on 8 at a time.
111 b0 = vext_u8(a0, a1, 4);
112 b2 = vext_u8(a2, a3, 4);
113 b4 = vext_u8(a4, a5, 4);
114 b6 = vext_u8(a6, a7, 4);
115 b8 = a8;
116
117 // To keep with the 8-at-a-time theme, also build the staggered pairs: each
118 // combines the odd row of one pair with the even row of the next.
119 b1 = vext_u8(b0, b2, 4);
120 b3 = vext_u8(b2, b4, 4);
121 b5 = vext_u8(b4, b6, 4);
122 b7 = vext_u8(b6, b8, 4);
123
124 // Multiply and expand to 16 bits.
125 c0 = vmull_u8(b0, filter0);
126 c1 = vmull_u8(b2, filter0);
127 c2 = vmull_u8(b5, filter5);
128 c3 = vmull_u8(b7, filter5);
129
130 // Multiply, subtract and accumulate for filters 1 and 4 (the negative
131 // ones).
132 c0 = vmlsl_u8(c0, b4, filter4);
133 c1 = vmlsl_u8(c1, b6, filter4);
134 c2 = vmlsl_u8(c2, b1, filter1);
135 c3 = vmlsl_u8(c3, b3, filter1);
136
137 // Add more positive ones. vmlal should really return a signed type.
138 // It's doing signed math internally, as evidenced by the fact we can do
139 // subtractions followed by more additions. Ideally we could use
140 // vqmlal/sl but that instruction doesn't exist. Might be able to
141 // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
142 c0 = vmlal_u8(c0, b2, filter2);
143 c1 = vmlal_u8(c1, b4, filter2);
144 c2 = vmlal_u8(c2, b3, filter3);
145 c3 = vmlal_u8(c3, b5, filter3);
146
147 // Use signed saturation math because vmlsl may have left some negative
148 // numbers in there.
149 d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
150 d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
151
152 // Use signed again because numbers like -200 need to be saturated to 0.
153 e0 = vqrshrun_n_s16(d0, 7);
154 e1 = vqrshrun_n_s16(d1, 7);
155
156 store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
157 }
158
159 void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
160 int xoffset, int yoffset,
161 unsigned char *dst_ptr, int dst_pitch) {
162 uint8x16_t s0, s1, s2, s3, s4;
163 uint64x2_t s01, s23;
164 // Variables to hold src[] elements for the given filter[]
165 uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
166 uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
167 uint8x16_t s01_f0, s23_f0;
168 uint64x2_t s01_f3, s23_f3;
169 uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
170 // Accumulator variables.
171 uint16x8_t d0123, d4567, d89;
172 uint16x8_t d0123_a, d4567_a, d89_a;
173 int16x8_t e0123, e4567, e89;
174 // Second pass intermediates.
175 uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
176 uint16x8_t c0, c1, c2, c3;
177 int16x8_t d0, d1;
178 uint8x8_t e0, e1;
179 uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
180
181 if (xoffset == 0) { // Second pass only.
182 yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
183 return;
184 }
185
186 if (yoffset == 0) { // First pass only.
187 src_ptr -= 2;
188 } else { // Add context for the second pass. 2 extra lines on top.
189 src_ptr -= 2 + (src_pixels_per_line * 2);
190 }
191
192 filter = vld1_u8(abs_filters[xoffset]);
193 filter0 = vdup_lane_u8(filter, 0);
194 filter1 = vdup_lane_u8(filter, 1);
195 filter2 = vdup_lane_u8(filter, 2);
196 filter3 = vdup_lane_u8(filter, 3);
197 filter4 = vdup_lane_u8(filter, 4);
198 filter5 = vdup_lane_u8(filter, 5);
199
200 // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
201 // garbage. So much effort for that last single bit.
202 // The low values of each pair are for filter0.
203 s0 = vld1q_u8(src_ptr);
204 src_ptr += src_pixels_per_line;
205 s1 = vld1q_u8(src_ptr);
206 src_ptr += src_pixels_per_line;
207 s2 = vld1q_u8(src_ptr);
208 src_ptr += src_pixels_per_line;
209 s3 = vld1q_u8(src_ptr);
210 src_ptr += src_pixels_per_line;
211
212 // Shift to extract values for filter[5]
213 // If src[] is 0, this puts:
214 // 3 4 5 6 7 8 9 10 in s0_f5
215 // Can't use vshr.u64 because it crosses the double word boundary.
216 s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
217 s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
218 s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
219 s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
220
221 s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
222 s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
223
224 s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
225 s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
226 d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
227 d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
228
229 // Keep original src data as 64 bits to simplify shifting and extracting.
230 s01 = vreinterpretq_u64_u8(s01_f0);
231 s23 = vreinterpretq_u64_u8(s23_f0);
232
233 // 3 4 5 6 * filter0
234 filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
235
236 // Shift over one to use -1, 0, 1, 2 for filter1
237 // -1 0 1 2 * filter1
238 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
239 vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
240 &d0123, &d4567);
241
242 // 2 3 4 5 * filter4
243 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
244 vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
245 &d0123, &d4567);
246
247 // 0 1 2 3 * filter2
248 filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
249 vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
250 &d0123, &d4567);
251
252 // 1 2 3 4 * filter3
253 s01_f3 = vshrq_n_u64(s01, 24);
254 s23_f3 = vshrq_n_u64(s23, 24);
255 s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
256 vreinterpret_u32_u64(vget_high_u64(s01_f3)));
257 s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
258 vreinterpret_u32_u64(vget_high_u64(s23_f3)));
259 // Accumulate into different registers so it can use saturated addition.
260 d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
261 d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
262
263 e0123 =
264 vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
265 e4567 =
266 vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
267
268 // Shift and narrow.
269 b0 = vqrshrun_n_s16(e0123, 7);
270 b2 = vqrshrun_n_s16(e4567, 7);
271
272 if (yoffset == 0) { // firstpass_filter4x4_only
273 store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
274 return;
275 }
276
277 // Load additional context when doing both filters.
278 s0 = vld1q_u8(src_ptr);
279 src_ptr += src_pixels_per_line;
280 s1 = vld1q_u8(src_ptr);
281 src_ptr += src_pixels_per_line;
282 s2 = vld1q_u8(src_ptr);
283 src_ptr += src_pixels_per_line;
284 s3 = vld1q_u8(src_ptr);
285 src_ptr += src_pixels_per_line;
286 s4 = vld1q_u8(src_ptr);
287
288 s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
289 s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
290 s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
291 s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
292 s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
293
294 // 3 4 5 6 * filter0
295 s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
296 s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
297
298 s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
299 s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
300 // But this time there are 20 pixels to filter instead of 16, so an extra
301 // pass through a doubleword register is needed.
302 d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
303 d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
304 d89 = vmull_u8(s4_f5, filter5);
305
306 // Save a copy as u64 for shifting.
307 s01 = vreinterpretq_u64_u8(s01_f0);
308 s23 = vreinterpretq_u64_u8(s23_f0);
309
310 filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
311 d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
312
313 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
314 vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
315 &d0123, &d4567);
316 s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
317 d89 = vmlsl_u8(d89, s4_f1, filter1);
318
319 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
320 vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
321 &d0123, &d4567);
322 s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
323 d89 = vmlsl_u8(d89, s4_f4, filter4);
324
325 filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
326 vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
327 &d0123, &d4567);
328 s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
329 d89 = vmlal_u8(d89, s4_f2, filter2);
330
331 s01_f3 = vshrq_n_u64(s01, 24);
332 s23_f3 = vshrq_n_u64(s23, 24);
333 s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
334 vreinterpret_u32_u64(vget_high_u64(s01_f3)));
335 s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
336 vreinterpret_u32_u64(vget_high_u64(s23_f3)));
337 s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
338 d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
339 d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
340 d89_a = vmull_u8(s4_f3, filter3);
341
342 e0123 =
343 vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
344 e4567 =
345 vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
346 e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
347
348 b4 = vqrshrun_n_s16(e0123, 7);
349 b6 = vqrshrun_n_s16(e4567, 7);
350 b8 = vqrshrun_n_s16(e89, 7);
351
352 // Second pass: 4x4
353 filter = vld1_u8(abs_filters[yoffset]);
354 filter0 = vdup_lane_u8(filter, 0);
355 filter1 = vdup_lane_u8(filter, 1);
356 filter2 = vdup_lane_u8(filter, 2);
357 filter3 = vdup_lane_u8(filter, 3);
358 filter4 = vdup_lane_u8(filter, 4);
359 filter5 = vdup_lane_u8(filter, 5);
360
361 b1 = vext_u8(b0, b2, 4);
362 b3 = vext_u8(b2, b4, 4);
363 b5 = vext_u8(b4, b6, 4);
364 b7 = vext_u8(b6, b8, 4);
365
366 c0 = vmull_u8(b0, filter0);
367 c1 = vmull_u8(b2, filter0);
368 c2 = vmull_u8(b5, filter5);
369 c3 = vmull_u8(b7, filter5);
370
371 c0 = vmlsl_u8(c0, b4, filter4);
372 c1 = vmlsl_u8(c1, b6, filter4);
373 c2 = vmlsl_u8(c2, b1, filter1);
374 c3 = vmlsl_u8(c3, b3, filter1);
375
376 c0 = vmlal_u8(c0, b2, filter2);
377 c1 = vmlal_u8(c1, b4, filter2);
378 c2 = vmlal_u8(c2, b3, filter3);
379 c3 = vmlal_u8(c3, b5, filter3);
380
381 d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
382 d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
383
384 e0 = vqrshrun_n_s16(d0, 7);
385 e1 = vqrshrun_n_s16(d1, 7);
386
387 store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
388 }
389
390 void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
391 int xoffset, int yoffset,
392 unsigned char *dst_ptr, int dst_pitch) {
393 unsigned char *src;
394 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
395 uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
396 uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
397 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
398 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
399 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
400 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
401 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
402 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
403
404 if (xoffset == 0) { // secondpass_filter8x4_only
405 // load second_pass filter
406 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
407 d0s8 = vdup_lane_s8(dtmps8, 0);
408 d1s8 = vdup_lane_s8(dtmps8, 1);
409 d2s8 = vdup_lane_s8(dtmps8, 2);
410 d3s8 = vdup_lane_s8(dtmps8, 3);
411 d4s8 = vdup_lane_s8(dtmps8, 4);
412 d5s8 = vdup_lane_s8(dtmps8, 5);
413 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
414 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
415 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
416 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
417 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
418 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
419
420 // load src data
421 src = src_ptr - src_pixels_per_line * 2;
422 d22u8 = vld1_u8(src);
423 src += src_pixels_per_line;
424 d23u8 = vld1_u8(src);
425 src += src_pixels_per_line;
426 d24u8 = vld1_u8(src);
427 src += src_pixels_per_line;
428 d25u8 = vld1_u8(src);
429 src += src_pixels_per_line;
430 d26u8 = vld1_u8(src);
431 src += src_pixels_per_line;
432 d27u8 = vld1_u8(src);
433 src += src_pixels_per_line;
434 d28u8 = vld1_u8(src);
435 src += src_pixels_per_line;
436 d29u8 = vld1_u8(src);
437 src += src_pixels_per_line;
438 d30u8 = vld1_u8(src);
439
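// Vertical 6-tap filter. The taps were made positive with vabs above, so
// taps 1 and 4 (negative in vp8_sub_pel_filters) are applied with vmlsl;
// tap 3 uses separate accumulators so the final combine can saturate.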
440 q3u16 = vmull_u8(d22u8, d0u8);
441 q4u16 = vmull_u8(d23u8, d0u8);
442 q5u16 = vmull_u8(d24u8, d0u8);
443 q6u16 = vmull_u8(d25u8, d0u8);
444
445 q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
446 q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
447 q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
448 q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
449
450 q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
451 q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
452 q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
453 q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
454
455 q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
456 q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
457 q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
458 q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
459
460 q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
461 q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
462 q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
463 q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
464
465 q7u16 = vmull_u8(d25u8, d3u8);
466 q8u16 = vmull_u8(d26u8, d3u8);
467 q9u16 = vmull_u8(d27u8, d3u8);
468 q10u16 = vmull_u8(d28u8, d3u8);
469
470 q3s16 = vreinterpretq_s16_u16(q3u16);
471 q4s16 = vreinterpretq_s16_u16(q4u16);
472 q5s16 = vreinterpretq_s16_u16(q5u16);
473 q6s16 = vreinterpretq_s16_u16(q6u16);
474 q7s16 = vreinterpretq_s16_u16(q7u16);
475 q8s16 = vreinterpretq_s16_u16(q8u16);
476 q9s16 = vreinterpretq_s16_u16(q9u16);
477 q10s16 = vreinterpretq_s16_u16(q10u16);
478
479 q7s16 = vqaddq_s16(q7s16, q3s16);
480 q8s16 = vqaddq_s16(q8s16, q4s16);
481 q9s16 = vqaddq_s16(q9s16, q5s16);
482 q10s16 = vqaddq_s16(q10s16, q6s16);
483
484 d6u8 = vqrshrun_n_s16(q7s16, 7);
485 d7u8 = vqrshrun_n_s16(q8s16, 7);
486 d8u8 = vqrshrun_n_s16(q9s16, 7);
487 d9u8 = vqrshrun_n_s16(q10s16, 7);
488
489 vst1_u8(dst_ptr, d6u8);
490 dst_ptr += dst_pitch;
491 vst1_u8(dst_ptr, d7u8);
492 dst_ptr += dst_pitch;
493 vst1_u8(dst_ptr, d8u8);
494 dst_ptr += dst_pitch;
495 vst1_u8(dst_ptr, d9u8);
496 return;
497 }
498
499 // load first_pass filter
500 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
501 d0s8 = vdup_lane_s8(dtmps8, 0);
502 d1s8 = vdup_lane_s8(dtmps8, 1);
503 d2s8 = vdup_lane_s8(dtmps8, 2);
504 d3s8 = vdup_lane_s8(dtmps8, 3);
505 d4s8 = vdup_lane_s8(dtmps8, 4);
506 d5s8 = vdup_lane_s8(dtmps8, 5);
507 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
508 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
509 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
510 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
511 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
512 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
513
514 // First pass: output_height lines x output_width columns (9x8)
515 if (yoffset == 0) // firstpass_filter8x4_only
516 src = src_ptr - 2;
517 else
518 src = src_ptr - 2 - (src_pixels_per_line * 2);
519 q3u8 = vld1q_u8(src);
520 src += src_pixels_per_line;
521 q4u8 = vld1q_u8(src);
522 src += src_pixels_per_line;
523 q5u8 = vld1q_u8(src);
524 src += src_pixels_per_line;
525 q6u8 = vld1q_u8(src);
526
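// Horizontal 6-tap filter: vext extracts the pixels at each tap offset from
// the 16-byte loads (tap 0 uses the low half directly). Taps 1 and 4
// subtract; tap 3 is accumulated separately so the final combine can use
// saturating addition.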
527 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
528 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
529 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
530 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
531
532 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
533 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
534 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
535 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
536
537 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
538 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
539 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
540 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
541
542 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
543 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
544 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
545 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
546
547 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
548 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
549 q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
550 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
551
552 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
553 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
554 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
555 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
556
557 q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
558 q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
559 q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
560 q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
561
562 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
563 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
564 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
565 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
566
567 q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
568 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
569 q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
570 q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
571
572 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
573 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
574 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
575 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
576
577 q3u16 = vmull_u8(d28u8, d3u8);
578 q4u16 = vmull_u8(d29u8, d3u8);
579 q5u16 = vmull_u8(d30u8, d3u8);
580 q6u16 = vmull_u8(d31u8, d3u8);
581
582 q3s16 = vreinterpretq_s16_u16(q3u16);
583 q4s16 = vreinterpretq_s16_u16(q4u16);
584 q5s16 = vreinterpretq_s16_u16(q5u16);
585 q6s16 = vreinterpretq_s16_u16(q6u16);
586 q7s16 = vreinterpretq_s16_u16(q7u16);
587 q8s16 = vreinterpretq_s16_u16(q8u16);
588 q9s16 = vreinterpretq_s16_u16(q9u16);
589 q10s16 = vreinterpretq_s16_u16(q10u16);
590
591 q7s16 = vqaddq_s16(q7s16, q3s16);
592 q8s16 = vqaddq_s16(q8s16, q4s16);
593 q9s16 = vqaddq_s16(q9s16, q5s16);
594 q10s16 = vqaddq_s16(q10s16, q6s16);
595
596 d22u8 = vqrshrun_n_s16(q7s16, 7);
597 d23u8 = vqrshrun_n_s16(q8s16, 7);
598 d24u8 = vqrshrun_n_s16(q9s16, 7);
599 d25u8 = vqrshrun_n_s16(q10s16, 7);
600
601 if (yoffset == 0) { // firstpass_filter8x4_only
602 vst1_u8(dst_ptr, d22u8);
603 dst_ptr += dst_pitch;
604 vst1_u8(dst_ptr, d23u8);
605 dst_ptr += dst_pitch;
606 vst1_u8(dst_ptr, d24u8);
607 dst_ptr += dst_pitch;
608 vst1_u8(dst_ptr, d25u8);
609 return;
610 }
611
612 // First pass on the remaining 5 lines of data
613 src += src_pixels_per_line;
614 q3u8 = vld1q_u8(src);
615 src += src_pixels_per_line;
616 q4u8 = vld1q_u8(src);
617 src += src_pixels_per_line;
618 q5u8 = vld1q_u8(src);
619 src += src_pixels_per_line;
620 q6u8 = vld1q_u8(src);
621 src += src_pixels_per_line;
622 q7u8 = vld1q_u8(src);
623
624 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
625 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
626 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
627 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
628 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
629
630 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
631 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
632 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
633 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
634 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
635
636 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
637 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
638 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
639 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
640 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
641
642 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
643 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
644 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
645 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
646 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
647
648 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
649 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
650 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
651 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
652 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
653
654 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
655 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
656 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
657 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
658 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
659
660 q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
661 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
662 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
663 q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
664 q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
665
666 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
667 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
668 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
669 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
670 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
671
672 q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
673 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
674 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
675 q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
676 q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
677
678 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
679 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
680 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
681 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
682 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
683
684 q3u16 = vmull_u8(d27u8, d3u8);
685 q4u16 = vmull_u8(d28u8, d3u8);
686 q5u16 = vmull_u8(d29u8, d3u8);
687 q6u16 = vmull_u8(d30u8, d3u8);
688 q7u16 = vmull_u8(d31u8, d3u8);
689
690 q3s16 = vreinterpretq_s16_u16(q3u16);
691 q4s16 = vreinterpretq_s16_u16(q4u16);
692 q5s16 = vreinterpretq_s16_u16(q5u16);
693 q6s16 = vreinterpretq_s16_u16(q6u16);
694 q7s16 = vreinterpretq_s16_u16(q7u16);
695 q8s16 = vreinterpretq_s16_u16(q8u16);
696 q9s16 = vreinterpretq_s16_u16(q9u16);
697 q10s16 = vreinterpretq_s16_u16(q10u16);
698 q11s16 = vreinterpretq_s16_u16(q11u16);
699 q12s16 = vreinterpretq_s16_u16(q12u16);
700
701 q8s16 = vqaddq_s16(q8s16, q3s16);
702 q9s16 = vqaddq_s16(q9s16, q4s16);
703 q10s16 = vqaddq_s16(q10s16, q5s16);
704 q11s16 = vqaddq_s16(q11s16, q6s16);
705 q12s16 = vqaddq_s16(q12s16, q7s16);
706
707 d26u8 = vqrshrun_n_s16(q8s16, 7);
708 d27u8 = vqrshrun_n_s16(q9s16, 7);
709 d28u8 = vqrshrun_n_s16(q10s16, 7);
710 d29u8 = vqrshrun_n_s16(q11s16, 7);
711 d30u8 = vqrshrun_n_s16(q12s16, 7);
712
713 // Second pass: 8x4
714 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
715 d0s8 = vdup_lane_s8(dtmps8, 0);
716 d1s8 = vdup_lane_s8(dtmps8, 1);
717 d2s8 = vdup_lane_s8(dtmps8, 2);
718 d3s8 = vdup_lane_s8(dtmps8, 3);
719 d4s8 = vdup_lane_s8(dtmps8, 4);
720 d5s8 = vdup_lane_s8(dtmps8, 5);
721 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
722 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
723 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
724 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
725 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
726 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
727
728 q3u16 = vmull_u8(d22u8, d0u8);
729 q4u16 = vmull_u8(d23u8, d0u8);
730 q5u16 = vmull_u8(d24u8, d0u8);
731 q6u16 = vmull_u8(d25u8, d0u8);
732
733 q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
734 q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
735 q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
736 q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
737
738 q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
739 q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
740 q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
741 q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
742
743 q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
744 q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
745 q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
746 q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
747
748 q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
749 q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
750 q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
751 q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
752
753 q7u16 = vmull_u8(d25u8, d3u8);
754 q8u16 = vmull_u8(d26u8, d3u8);
755 q9u16 = vmull_u8(d27u8, d3u8);
756 q10u16 = vmull_u8(d28u8, d3u8);
757
758 q3s16 = vreinterpretq_s16_u16(q3u16);
759 q4s16 = vreinterpretq_s16_u16(q4u16);
760 q5s16 = vreinterpretq_s16_u16(q5u16);
761 q6s16 = vreinterpretq_s16_u16(q6u16);
762 q7s16 = vreinterpretq_s16_u16(q7u16);
763 q8s16 = vreinterpretq_s16_u16(q8u16);
764 q9s16 = vreinterpretq_s16_u16(q9u16);
765 q10s16 = vreinterpretq_s16_u16(q10u16);
766
767 q7s16 = vqaddq_s16(q7s16, q3s16);
768 q8s16 = vqaddq_s16(q8s16, q4s16);
769 q9s16 = vqaddq_s16(q9s16, q5s16);
770 q10s16 = vqaddq_s16(q10s16, q6s16);
771
772 d6u8 = vqrshrun_n_s16(q7s16, 7);
773 d7u8 = vqrshrun_n_s16(q8s16, 7);
774 d8u8 = vqrshrun_n_s16(q9s16, 7);
775 d9u8 = vqrshrun_n_s16(q10s16, 7);
776
777 vst1_u8(dst_ptr, d6u8);
778 dst_ptr += dst_pitch;
779 vst1_u8(dst_ptr, d7u8);
780 dst_ptr += dst_pitch;
781 vst1_u8(dst_ptr, d8u8);
782 dst_ptr += dst_pitch;
783 vst1_u8(dst_ptr, d9u8);
784 }
785
786 void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
787 int xoffset, int yoffset,
788 unsigned char *dst_ptr, int dst_pitch) {
789 unsigned char *src, *tmpp;
790 unsigned char tmp[64];
791 int i;
792 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
793 uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
794 uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
795 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
796 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
797 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
798 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
799 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
800 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
801
802 if (xoffset == 0) { // secondpass_filter8x8_only
803 // load second_pass filter
804 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
805 d0s8 = vdup_lane_s8(dtmps8, 0);
806 d1s8 = vdup_lane_s8(dtmps8, 1);
807 d2s8 = vdup_lane_s8(dtmps8, 2);
808 d3s8 = vdup_lane_s8(dtmps8, 3);
809 d4s8 = vdup_lane_s8(dtmps8, 4);
810 d5s8 = vdup_lane_s8(dtmps8, 5);
811 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
812 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
813 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
814 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
815 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
816 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
817
818 // load src data
819 src = src_ptr - src_pixels_per_line * 2;
820 d18u8 = vld1_u8(src);
821 src += src_pixels_per_line;
822 d19u8 = vld1_u8(src);
823 src += src_pixels_per_line;
824 d20u8 = vld1_u8(src);
825 src += src_pixels_per_line;
826 d21u8 = vld1_u8(src);
827 src += src_pixels_per_line;
828 d22u8 = vld1_u8(src);
829 src += src_pixels_per_line;
830 d23u8 = vld1_u8(src);
831 src += src_pixels_per_line;
832 d24u8 = vld1_u8(src);
833 src += src_pixels_per_line;
834 d25u8 = vld1_u8(src);
835 src += src_pixels_per_line;
836 d26u8 = vld1_u8(src);
837 src += src_pixels_per_line;
838 d27u8 = vld1_u8(src);
839 src += src_pixels_per_line;
840 d28u8 = vld1_u8(src);
841 src += src_pixels_per_line;
842 d29u8 = vld1_u8(src);
843 src += src_pixels_per_line;
844 d30u8 = vld1_u8(src);
845
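// Each iteration filters 4 output rows; the window of source rows
// (d18u8-d26u8) then slides down by 4 for the next iteration.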
846 for (i = 2; i > 0; i--) {
847 q3u16 = vmull_u8(d18u8, d0u8);
848 q4u16 = vmull_u8(d19u8, d0u8);
849 q5u16 = vmull_u8(d20u8, d0u8);
850 q6u16 = vmull_u8(d21u8, d0u8);
851
852 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
853 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
854 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
855 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
856
857 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
858 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
859 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
860 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
861
862 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
863 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
864 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
865 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
866
867 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
868 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
869 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
870 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
871
872 q7u16 = vmull_u8(d21u8, d3u8);
873 q8u16 = vmull_u8(d22u8, d3u8);
874 q9u16 = vmull_u8(d23u8, d3u8);
875 q10u16 = vmull_u8(d24u8, d3u8);
876
877 q3s16 = vreinterpretq_s16_u16(q3u16);
878 q4s16 = vreinterpretq_s16_u16(q4u16);
879 q5s16 = vreinterpretq_s16_u16(q5u16);
880 q6s16 = vreinterpretq_s16_u16(q6u16);
881 q7s16 = vreinterpretq_s16_u16(q7u16);
882 q8s16 = vreinterpretq_s16_u16(q8u16);
883 q9s16 = vreinterpretq_s16_u16(q9u16);
884 q10s16 = vreinterpretq_s16_u16(q10u16);
885
886 q7s16 = vqaddq_s16(q7s16, q3s16);
887 q8s16 = vqaddq_s16(q8s16, q4s16);
888 q9s16 = vqaddq_s16(q9s16, q5s16);
889 q10s16 = vqaddq_s16(q10s16, q6s16);
890
891 d6u8 = vqrshrun_n_s16(q7s16, 7);
892 d7u8 = vqrshrun_n_s16(q8s16, 7);
893 d8u8 = vqrshrun_n_s16(q9s16, 7);
894 d9u8 = vqrshrun_n_s16(q10s16, 7);
895
896 d18u8 = d22u8;
897 d19u8 = d23u8;
898 d20u8 = d24u8;
899 d21u8 = d25u8;
900 d22u8 = d26u8;
901 d23u8 = d27u8;
902 d24u8 = d28u8;
903 d25u8 = d29u8;
904 d26u8 = d30u8;
905
906 vst1_u8(dst_ptr, d6u8);
907 dst_ptr += dst_pitch;
908 vst1_u8(dst_ptr, d7u8);
909 dst_ptr += dst_pitch;
910 vst1_u8(dst_ptr, d8u8);
911 dst_ptr += dst_pitch;
912 vst1_u8(dst_ptr, d9u8);
913 dst_ptr += dst_pitch;
914 }
915 return;
916 }
917
918 // load first_pass filter
919 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
920 d0s8 = vdup_lane_s8(dtmps8, 0);
921 d1s8 = vdup_lane_s8(dtmps8, 1);
922 d2s8 = vdup_lane_s8(dtmps8, 2);
923 d3s8 = vdup_lane_s8(dtmps8, 3);
924 d4s8 = vdup_lane_s8(dtmps8, 4);
925 d5s8 = vdup_lane_s8(dtmps8, 5);
926 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
927 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
928 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
929 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
930 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
931 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
932
933 // First pass: output_height lines x output_width columns (13x8)
934 if (yoffset == 0) // firstpass_filter8x8_only
935 src = src_ptr - 2;
936 else
937 src = src_ptr - 2 - (src_pixels_per_line * 2);
938
939 tmpp = tmp;
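// First pass: each iteration filters 4 rows horizontally and either stores
// them straight to dst_ptr (yoffset == 0) or stashes them in tmp for the
// second pass.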
940 for (i = 2; i > 0; i--) {
941 q3u8 = vld1q_u8(src);
942 src += src_pixels_per_line;
943 q4u8 = vld1q_u8(src);
944 src += src_pixels_per_line;
945 q5u8 = vld1q_u8(src);
946 src += src_pixels_per_line;
947 q6u8 = vld1q_u8(src);
948 src += src_pixels_per_line;
949
950 __builtin_prefetch(src);
951 __builtin_prefetch(src + src_pixels_per_line);
952 __builtin_prefetch(src + src_pixels_per_line * 2);
953
954 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
955 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
956 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
957 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
958
959 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
960 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
961 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
962 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
963
964 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
965 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
966 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
967 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
968
969 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
970 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
971 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
972 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
973
974 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
975 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
976 q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
977 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
978
979 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
980 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
981 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
982 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
983
984 q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
985 q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
986 q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
987 q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
988
989 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
990 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
991 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
992 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
993
994 q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
995 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
996 q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
997 q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
998
999 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1000 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1001 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1002 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1003
1004 q3u16 = vmull_u8(d28u8, d3u8);
1005 q4u16 = vmull_u8(d29u8, d3u8);
1006 q5u16 = vmull_u8(d30u8, d3u8);
1007 q6u16 = vmull_u8(d31u8, d3u8);
1008
1009 q3s16 = vreinterpretq_s16_u16(q3u16);
1010 q4s16 = vreinterpretq_s16_u16(q4u16);
1011 q5s16 = vreinterpretq_s16_u16(q5u16);
1012 q6s16 = vreinterpretq_s16_u16(q6u16);
1013 q7s16 = vreinterpretq_s16_u16(q7u16);
1014 q8s16 = vreinterpretq_s16_u16(q8u16);
1015 q9s16 = vreinterpretq_s16_u16(q9u16);
1016 q10s16 = vreinterpretq_s16_u16(q10u16);
1017
1018 q7s16 = vqaddq_s16(q7s16, q3s16);
1019 q8s16 = vqaddq_s16(q8s16, q4s16);
1020 q9s16 = vqaddq_s16(q9s16, q5s16);
1021 q10s16 = vqaddq_s16(q10s16, q6s16);
1022
1023 d22u8 = vqrshrun_n_s16(q7s16, 7);
1024 d23u8 = vqrshrun_n_s16(q8s16, 7);
1025 d24u8 = vqrshrun_n_s16(q9s16, 7);
1026 d25u8 = vqrshrun_n_s16(q10s16, 7);
1027
1028 if (yoffset == 0) { // firstpass_filter8x8_only
1029 vst1_u8(dst_ptr, d22u8);
1030 dst_ptr += dst_pitch;
1031 vst1_u8(dst_ptr, d23u8);
1032 dst_ptr += dst_pitch;
1033 vst1_u8(dst_ptr, d24u8);
1034 dst_ptr += dst_pitch;
1035 vst1_u8(dst_ptr, d25u8);
1036 dst_ptr += dst_pitch;
1037 } else {
1038 vst1_u8(tmpp, d22u8);
1039 tmpp += 8;
1040 vst1_u8(tmpp, d23u8);
1041 tmpp += 8;
1042 vst1_u8(tmpp, d24u8);
1043 tmpp += 8;
1044 vst1_u8(tmpp, d25u8);
1045 tmpp += 8;
1046 }
1047 }
1048 if (yoffset == 0) return;
1049
1050 // First pass on the remaining 5 lines of data
1051 q3u8 = vld1q_u8(src);
1052 src += src_pixels_per_line;
1053 q4u8 = vld1q_u8(src);
1054 src += src_pixels_per_line;
1055 q5u8 = vld1q_u8(src);
1056 src += src_pixels_per_line;
1057 q6u8 = vld1q_u8(src);
1058 src += src_pixels_per_line;
1059 q7u8 = vld1q_u8(src);
1060
1061 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
1062 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
1063 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
1064 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
1065 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
1066
1067 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
1068 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
1069 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
1070 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
1071 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
1072
1073 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
1074 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1075 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1076 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
1077 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
1078
1079 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
1080 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
1081 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
1082 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
1083 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
1084
1085 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
1086 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1087 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1088 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
1089 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
1090
1091 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
1092 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
1093 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
1094 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
1095 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
1096
1097 q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
1098 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1099 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1100 q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
1101 q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
1102
1103 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
1104 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
1105 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
1106 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
1107 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
1108
1109 q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
1110 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1111 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1112 q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
1113 q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
1114
1115 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1116 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1117 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1118 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1119 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
1120
1121 q3u16 = vmull_u8(d27u8, d3u8);
1122 q4u16 = vmull_u8(d28u8, d3u8);
1123 q5u16 = vmull_u8(d29u8, d3u8);
1124 q6u16 = vmull_u8(d30u8, d3u8);
1125 q7u16 = vmull_u8(d31u8, d3u8);
1126
1127 q3s16 = vreinterpretq_s16_u16(q3u16);
1128 q4s16 = vreinterpretq_s16_u16(q4u16);
1129 q5s16 = vreinterpretq_s16_u16(q5u16);
1130 q6s16 = vreinterpretq_s16_u16(q6u16);
1131 q7s16 = vreinterpretq_s16_u16(q7u16);
1132 q8s16 = vreinterpretq_s16_u16(q8u16);
1133 q9s16 = vreinterpretq_s16_u16(q9u16);
1134 q10s16 = vreinterpretq_s16_u16(q10u16);
1135 q11s16 = vreinterpretq_s16_u16(q11u16);
1136 q12s16 = vreinterpretq_s16_u16(q12u16);
1137
1138 q8s16 = vqaddq_s16(q8s16, q3s16);
1139 q9s16 = vqaddq_s16(q9s16, q4s16);
1140 q10s16 = vqaddq_s16(q10s16, q5s16);
1141 q11s16 = vqaddq_s16(q11s16, q6s16);
1142 q12s16 = vqaddq_s16(q12s16, q7s16);
1143
1144 d26u8 = vqrshrun_n_s16(q8s16, 7);
1145 d27u8 = vqrshrun_n_s16(q9s16, 7);
1146 d28u8 = vqrshrun_n_s16(q10s16, 7);
1147 d29u8 = vqrshrun_n_s16(q11s16, 7);
1148 d30u8 = vqrshrun_n_s16(q12s16, 7);
1149
1150 // Second pass: 8x8
1151 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1152 d0s8 = vdup_lane_s8(dtmps8, 0);
1153 d1s8 = vdup_lane_s8(dtmps8, 1);
1154 d2s8 = vdup_lane_s8(dtmps8, 2);
1155 d3s8 = vdup_lane_s8(dtmps8, 3);
1156 d4s8 = vdup_lane_s8(dtmps8, 4);
1157 d5s8 = vdup_lane_s8(dtmps8, 5);
1158 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1159 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1160 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1161 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1162 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1163 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1164
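// Reload the first 8 intermediate rows from tmp; rows 8-12 are still live
// in d26u8-d30u8 from the 5-row tail of the first pass.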
1165 tmpp = tmp;
1166 q9u8 = vld1q_u8(tmpp);
1167 tmpp += 16;
1168 q10u8 = vld1q_u8(tmpp);
1169 tmpp += 16;
1170 q11u8 = vld1q_u8(tmpp);
1171 tmpp += 16;
1172 q12u8 = vld1q_u8(tmpp);
1173
1174 d18u8 = vget_low_u8(q9u8);
1175 d19u8 = vget_high_u8(q9u8);
1176 d20u8 = vget_low_u8(q10u8);
1177 d21u8 = vget_high_u8(q10u8);
1178 d22u8 = vget_low_u8(q11u8);
1179 d23u8 = vget_high_u8(q11u8);
1180 d24u8 = vget_low_u8(q12u8);
1181 d25u8 = vget_high_u8(q12u8);
1182
1183 for (i = 2; i > 0; i--) {
1184 q3u16 = vmull_u8(d18u8, d0u8);
1185 q4u16 = vmull_u8(d19u8, d0u8);
1186 q5u16 = vmull_u8(d20u8, d0u8);
1187 q6u16 = vmull_u8(d21u8, d0u8);
1188
1189 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1190 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1191 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1192 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1193
1194 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1195 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1196 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1197 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1198
1199 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1200 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1201 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1202 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1203
1204 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1205 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1206 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1207 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1208
1209 q7u16 = vmull_u8(d21u8, d3u8);
1210 q8u16 = vmull_u8(d22u8, d3u8);
1211 q9u16 = vmull_u8(d23u8, d3u8);
1212 q10u16 = vmull_u8(d24u8, d3u8);
1213
1214 q3s16 = vreinterpretq_s16_u16(q3u16);
1215 q4s16 = vreinterpretq_s16_u16(q4u16);
1216 q5s16 = vreinterpretq_s16_u16(q5u16);
1217 q6s16 = vreinterpretq_s16_u16(q6u16);
1218 q7s16 = vreinterpretq_s16_u16(q7u16);
1219 q8s16 = vreinterpretq_s16_u16(q8u16);
1220 q9s16 = vreinterpretq_s16_u16(q9u16);
1221 q10s16 = vreinterpretq_s16_u16(q10u16);
1222
1223 q7s16 = vqaddq_s16(q7s16, q3s16);
1224 q8s16 = vqaddq_s16(q8s16, q4s16);
1225 q9s16 = vqaddq_s16(q9s16, q5s16);
1226 q10s16 = vqaddq_s16(q10s16, q6s16);
1227
1228 d6u8 = vqrshrun_n_s16(q7s16, 7);
1229 d7u8 = vqrshrun_n_s16(q8s16, 7);
1230 d8u8 = vqrshrun_n_s16(q9s16, 7);
1231 d9u8 = vqrshrun_n_s16(q10s16, 7);
1232
1233 d18u8 = d22u8;
1234 d19u8 = d23u8;
1235 d20u8 = d24u8;
1236 d21u8 = d25u8;
1237 d22u8 = d26u8;
1238 d23u8 = d27u8;
1239 d24u8 = d28u8;
1240 d25u8 = d29u8;
1241 d26u8 = d30u8;
1242
1243 vst1_u8(dst_ptr, d6u8);
1244 dst_ptr += dst_pitch;
1245 vst1_u8(dst_ptr, d7u8);
1246 dst_ptr += dst_pitch;
1247 vst1_u8(dst_ptr, d8u8);
1248 dst_ptr += dst_pitch;
1249 vst1_u8(dst_ptr, d9u8);
1250 dst_ptr += dst_pitch;
1251 }
1252 }
1253
1254 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
1255 int src_pixels_per_line, int xoffset,
1256 int yoffset, unsigned char *dst_ptr,
1257 int dst_pitch) {
1258 unsigned char *src, *src_tmp, *dst, *tmpp;
1259 unsigned char tmp[336];
1260 int i, j;
1261 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
1262 uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
1263 uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
1264 uint8x8_t d28u8, d29u8, d30u8, d31u8;
1265 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
1266 uint8x16_t q3u8, q4u8;
1267 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
1268 uint16x8_t q11u16, q12u16, q13u16, q15u16;
1269 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
1270 int16x8_t q11s16, q12s16, q13s16, q15s16;
1271
1272 if (xoffset == 0) { // secondpass_filter16x16_only
1273 // load second_pass filter
1274 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1275 d0s8 = vdup_lane_s8(dtmps8, 0);
1276 d1s8 = vdup_lane_s8(dtmps8, 1);
1277 d2s8 = vdup_lane_s8(dtmps8, 2);
1278 d3s8 = vdup_lane_s8(dtmps8, 3);
1279 d4s8 = vdup_lane_s8(dtmps8, 4);
1280 d5s8 = vdup_lane_s8(dtmps8, 5);
1281 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1282 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1283 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1284 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1285 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1286 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1287
1288 // load src data
1289 src_tmp = src_ptr - src_pixels_per_line * 2;
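// Process the 16-wide block as two 8-wide halves; each inner loop iteration
// filters 4 rows using a sliding window of source rows.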
1290 for (i = 0; i < 2; ++i) {
1291 src = src_tmp + i * 8;
1292 dst = dst_ptr + i * 8;
1293 d18u8 = vld1_u8(src);
1294 src += src_pixels_per_line;
1295 d19u8 = vld1_u8(src);
1296 src += src_pixels_per_line;
1297 d20u8 = vld1_u8(src);
1298 src += src_pixels_per_line;
1299 d21u8 = vld1_u8(src);
1300 src += src_pixels_per_line;
1301 d22u8 = vld1_u8(src);
1302 src += src_pixels_per_line;
1303 for (j = 0; j < 4; ++j) {
1304 d23u8 = vld1_u8(src);
1305 src += src_pixels_per_line;
1306 d24u8 = vld1_u8(src);
1307 src += src_pixels_per_line;
1308 d25u8 = vld1_u8(src);
1309 src += src_pixels_per_line;
1310 d26u8 = vld1_u8(src);
1311 src += src_pixels_per_line;
1312
1313 q3u16 = vmull_u8(d18u8, d0u8);
1314 q4u16 = vmull_u8(d19u8, d0u8);
1315 q5u16 = vmull_u8(d20u8, d0u8);
1316 q6u16 = vmull_u8(d21u8, d0u8);
1317
1318 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1319 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1320 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1321 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1322
1323 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1324 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1325 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1326 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1327
1328 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1329 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1330 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1331 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1332
1333 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1334 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1335 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1336 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1337
1338 q7u16 = vmull_u8(d21u8, d3u8);
1339 q8u16 = vmull_u8(d22u8, d3u8);
1340 q9u16 = vmull_u8(d23u8, d3u8);
1341 q10u16 = vmull_u8(d24u8, d3u8);
1342
1343 q3s16 = vreinterpretq_s16_u16(q3u16);
1344 q4s16 = vreinterpretq_s16_u16(q4u16);
1345 q5s16 = vreinterpretq_s16_u16(q5u16);
1346 q6s16 = vreinterpretq_s16_u16(q6u16);
1347 q7s16 = vreinterpretq_s16_u16(q7u16);
1348 q8s16 = vreinterpretq_s16_u16(q8u16);
1349 q9s16 = vreinterpretq_s16_u16(q9u16);
1350 q10s16 = vreinterpretq_s16_u16(q10u16);
1351
1352 q7s16 = vqaddq_s16(q7s16, q3s16);
1353 q8s16 = vqaddq_s16(q8s16, q4s16);
1354 q9s16 = vqaddq_s16(q9s16, q5s16);
1355 q10s16 = vqaddq_s16(q10s16, q6s16);
1356
1357 d6u8 = vqrshrun_n_s16(q7s16, 7);
1358 d7u8 = vqrshrun_n_s16(q8s16, 7);
1359 d8u8 = vqrshrun_n_s16(q9s16, 7);
1360 d9u8 = vqrshrun_n_s16(q10s16, 7);
1361
1362 d18u8 = d22u8;
1363 d19u8 = d23u8;
1364 d20u8 = d24u8;
1365 d21u8 = d25u8;
1366 d22u8 = d26u8;
1367
1368 vst1_u8(dst, d6u8);
1369 dst += dst_pitch;
1370 vst1_u8(dst, d7u8);
1371 dst += dst_pitch;
1372 vst1_u8(dst, d8u8);
1373 dst += dst_pitch;
1374 vst1_u8(dst, d9u8);
1375 dst += dst_pitch;
1376 }
1377 }
1378 return;
1379 }
1380
1381 // load first_pass filter
1382 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
1383 d0s8 = vdup_lane_s8(dtmps8, 0);
1384 d1s8 = vdup_lane_s8(dtmps8, 1);
1385 d2s8 = vdup_lane_s8(dtmps8, 2);
1386 d3s8 = vdup_lane_s8(dtmps8, 3);
1387 d4s8 = vdup_lane_s8(dtmps8, 4);
1388 d5s8 = vdup_lane_s8(dtmps8, 5);
1389 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1390 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1391 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1392 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1393 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1394 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1395
1396 // First pass: output_height lines x output_width columns (21x16)
1397 if (yoffset == 0) { // firstpass_filter16x16_only
1398 src = src_ptr - 2;
1399 dst = dst_ptr;
1400 for (i = 0; i < 8; ++i) {
1401 d6u8 = vld1_u8(src);
1402 d7u8 = vld1_u8(src + 8);
1403 d8u8 = vld1_u8(src + 16);
1404 src += src_pixels_per_line;
1405 d9u8 = vld1_u8(src);
1406 d10u8 = vld1_u8(src + 8);
1407 d11u8 = vld1_u8(src + 16);
1408 src += src_pixels_per_line;
1409
1410 __builtin_prefetch(src);
1411 __builtin_prefetch(src + src_pixels_per_line);
1412
1413 q6u16 = vmull_u8(d6u8, d0u8);
1414 q7u16 = vmull_u8(d7u8, d0u8);
1415 q8u16 = vmull_u8(d9u8, d0u8);
1416 q9u16 = vmull_u8(d10u8, d0u8);
1417
1418 d20u8 = vext_u8(d6u8, d7u8, 1);
1419 d21u8 = vext_u8(d9u8, d10u8, 1);
1420 d22u8 = vext_u8(d7u8, d8u8, 1);
1421 d23u8 = vext_u8(d10u8, d11u8, 1);
1422 d24u8 = vext_u8(d6u8, d7u8, 4);
1423 d25u8 = vext_u8(d9u8, d10u8, 4);
1424 d26u8 = vext_u8(d7u8, d8u8, 4);
1425 d27u8 = vext_u8(d10u8, d11u8, 4);
1426 d28u8 = vext_u8(d6u8, d7u8, 5);
1427 d29u8 = vext_u8(d9u8, d10u8, 5);
1428
1429 q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
1430 q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
1431 q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
1432 q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
1433 q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
1434 q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
1435 q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
1436 q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
1437 q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
1438 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
1439
1440 d20u8 = vext_u8(d7u8, d8u8, 5);
1441 d21u8 = vext_u8(d10u8, d11u8, 5);
1442 d22u8 = vext_u8(d6u8, d7u8, 2);
1443 d23u8 = vext_u8(d9u8, d10u8, 2);
1444 d24u8 = vext_u8(d7u8, d8u8, 2);
1445 d25u8 = vext_u8(d10u8, d11u8, 2);
1446 d26u8 = vext_u8(d6u8, d7u8, 3);
1447 d27u8 = vext_u8(d9u8, d10u8, 3);
1448 d28u8 = vext_u8(d7u8, d8u8, 3);
1449 d29u8 = vext_u8(d10u8, d11u8, 3);
1450
1451 q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
1452 q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
1453 q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
1454 q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
1455 q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
1456 q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
1457
1458 q10u16 = vmull_u8(d26u8, d3u8);
1459 q11u16 = vmull_u8(d27u8, d3u8);
1460 q12u16 = vmull_u8(d28u8, d3u8);
1461 q15u16 = vmull_u8(d29u8, d3u8);
1462
1463 q6s16 = vreinterpretq_s16_u16(q6u16);
1464 q7s16 = vreinterpretq_s16_u16(q7u16);
1465 q8s16 = vreinterpretq_s16_u16(q8u16);
1466 q9s16 = vreinterpretq_s16_u16(q9u16);
1467 q10s16 = vreinterpretq_s16_u16(q10u16);
1468 q11s16 = vreinterpretq_s16_u16(q11u16);
1469 q12s16 = vreinterpretq_s16_u16(q12u16);
1470 q15s16 = vreinterpretq_s16_u16(q15u16);
1471
1472 q6s16 = vqaddq_s16(q6s16, q10s16);
1473 q8s16 = vqaddq_s16(q8s16, q11s16);
1474 q7s16 = vqaddq_s16(q7s16, q12s16);
1475 q9s16 = vqaddq_s16(q9s16, q15s16);
1476
1477 d6u8 = vqrshrun_n_s16(q6s16, 7);
1478 d7u8 = vqrshrun_n_s16(q7s16, 7);
1479 d8u8 = vqrshrun_n_s16(q8s16, 7);
1480 d9u8 = vqrshrun_n_s16(q9s16, 7);
1481
1482 q3u8 = vcombine_u8(d6u8, d7u8);
1483 q4u8 = vcombine_u8(d8u8, d9u8);
1484 vst1q_u8(dst, q3u8);
1485 dst += dst_pitch;
1486 vst1q_u8(dst, q4u8);
1487 dst += dst_pitch;
1488 }
1489 return;
1490 }
1491
1492 src = src_ptr - 2 - src_pixels_per_line * 2;
1493 tmpp = tmp;
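// First pass: 7 iterations x 3 rows = 21 intermediate rows of 16 pixels
// (16 output rows plus 5 rows of vertical context) are written to tmp.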
1494 for (i = 0; i < 7; ++i) {
1495 d6u8 = vld1_u8(src);
1496 d7u8 = vld1_u8(src + 8);
1497 d8u8 = vld1_u8(src + 16);
1498 src += src_pixels_per_line;
1499 d9u8 = vld1_u8(src);
1500 d10u8 = vld1_u8(src + 8);
1501 d11u8 = vld1_u8(src + 16);
1502 src += src_pixels_per_line;
1503 d12u8 = vld1_u8(src);
1504 d13u8 = vld1_u8(src + 8);
1505 // Only 5 pixels are needed; avoid a potential out-of-bounds read.
1506 d14u8 = vld1_u8(src + 13);
1507 d14u8 = vext_u8(d14u8, d14u8, 3);
1508 src += src_pixels_per_line;
1509
1510 __builtin_prefetch(src);
1511 __builtin_prefetch(src + src_pixels_per_line);
1512 __builtin_prefetch(src + src_pixels_per_line * 2);
1513
1514 q8u16 = vmull_u8(d6u8, d0u8);
1515 q9u16 = vmull_u8(d7u8, d0u8);
1516 q10u16 = vmull_u8(d9u8, d0u8);
1517 q11u16 = vmull_u8(d10u8, d0u8);
1518 q12u16 = vmull_u8(d12u8, d0u8);
1519 q13u16 = vmull_u8(d13u8, d0u8);
1520
1521 d28u8 = vext_u8(d6u8, d7u8, 1);
1522 d29u8 = vext_u8(d9u8, d10u8, 1);
1523 d30u8 = vext_u8(d12u8, d13u8, 1);
1524 q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
1525 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1526 q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
1527 d28u8 = vext_u8(d7u8, d8u8, 1);
1528 d29u8 = vext_u8(d10u8, d11u8, 1);
1529 d30u8 = vext_u8(d13u8, d14u8, 1);
1530 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1531 q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
1532 q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
1533
1534 d28u8 = vext_u8(d6u8, d7u8, 4);
1535 d29u8 = vext_u8(d9u8, d10u8, 4);
1536 d30u8 = vext_u8(d12u8, d13u8, 4);
1537 q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
1538 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1539 q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
1540 d28u8 = vext_u8(d7u8, d8u8, 4);
1541 d29u8 = vext_u8(d10u8, d11u8, 4);
1542 d30u8 = vext_u8(d13u8, d14u8, 4);
1543 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1544 q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
1545 q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
1546
1547 d28u8 = vext_u8(d6u8, d7u8, 5);
1548 d29u8 = vext_u8(d9u8, d10u8, 5);
1549 d30u8 = vext_u8(d12u8, d13u8, 5);
1550 q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
1551 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1552 q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
1553 d28u8 = vext_u8(d7u8, d8u8, 5);
1554 d29u8 = vext_u8(d10u8, d11u8, 5);
1555 d30u8 = vext_u8(d13u8, d14u8, 5);
1556 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1557 q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
1558 q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
1559
1560 d28u8 = vext_u8(d6u8, d7u8, 2);
1561 d29u8 = vext_u8(d9u8, d10u8, 2);
1562 d30u8 = vext_u8(d12u8, d13u8, 2);
1563 q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
1564 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1565 q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
1566 d28u8 = vext_u8(d7u8, d8u8, 2);
1567 d29u8 = vext_u8(d10u8, d11u8, 2);
1568 d30u8 = vext_u8(d13u8, d14u8, 2);
1569 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1570 q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
1571 q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
1572
1573 d28u8 = vext_u8(d6u8, d7u8, 3);
1574 d29u8 = vext_u8(d9u8, d10u8, 3);
1575 d30u8 = vext_u8(d12u8, d13u8, 3);
1576 d15u8 = vext_u8(d7u8, d8u8, 3);
1577 d31u8 = vext_u8(d10u8, d11u8, 3);
1578 d6u8 = vext_u8(d13u8, d14u8, 3);
1579 q4u16 = vmull_u8(d28u8, d3u8);
1580 q5u16 = vmull_u8(d29u8, d3u8);
1581 q6u16 = vmull_u8(d30u8, d3u8);
1582 q4s16 = vreinterpretq_s16_u16(q4u16);
1583 q5s16 = vreinterpretq_s16_u16(q5u16);
1584 q6s16 = vreinterpretq_s16_u16(q6u16);
1585 q8s16 = vreinterpretq_s16_u16(q8u16);
1586 q10s16 = vreinterpretq_s16_u16(q10u16);
1587 q12s16 = vreinterpretq_s16_u16(q12u16);
1588 q8s16 = vqaddq_s16(q8s16, q4s16);
1589 q10s16 = vqaddq_s16(q10s16, q5s16);
1590 q12s16 = vqaddq_s16(q12s16, q6s16);
1591
1592 q6u16 = vmull_u8(d15u8, d3u8);
1593 q7u16 = vmull_u8(d31u8, d3u8);
1594 q3u16 = vmull_u8(d6u8, d3u8);
1595 q3s16 = vreinterpretq_s16_u16(q3u16);
1596 q6s16 = vreinterpretq_s16_u16(q6u16);
1597 q7s16 = vreinterpretq_s16_u16(q7u16);
1598 q9s16 = vreinterpretq_s16_u16(q9u16);
1599 q11s16 = vreinterpretq_s16_u16(q11u16);
1600 q13s16 = vreinterpretq_s16_u16(q13u16);
1601 q9s16 = vqaddq_s16(q9s16, q6s16);
1602 q11s16 = vqaddq_s16(q11s16, q7s16);
1603 q13s16 = vqaddq_s16(q13s16, q3s16);
1604
1605 d6u8 = vqrshrun_n_s16(q8s16, 7);
1606 d7u8 = vqrshrun_n_s16(q9s16, 7);
1607 d8u8 = vqrshrun_n_s16(q10s16, 7);
1608 d9u8 = vqrshrun_n_s16(q11s16, 7);
1609 d10u8 = vqrshrun_n_s16(q12s16, 7);
1610 d11u8 = vqrshrun_n_s16(q13s16, 7);
1611
1612 vst1_u8(tmpp, d6u8);
1613 tmpp += 8;
1614 vst1_u8(tmpp, d7u8);
1615 tmpp += 8;
1616 vst1_u8(tmpp, d8u8);
1617 tmpp += 8;
1618 vst1_u8(tmpp, d9u8);
1619 tmpp += 8;
1620 vst1_u8(tmpp, d10u8);
1621 tmpp += 8;
1622 vst1_u8(tmpp, d11u8);
1623 tmpp += 8;
1624 }
1625
1626 // Second pass: 16x16
1627 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1628 d0s8 = vdup_lane_s8(dtmps8, 0);
1629 d1s8 = vdup_lane_s8(dtmps8, 1);
1630 d2s8 = vdup_lane_s8(dtmps8, 2);
1631 d3s8 = vdup_lane_s8(dtmps8, 3);
1632 d4s8 = vdup_lane_s8(dtmps8, 4);
1633 d5s8 = vdup_lane_s8(dtmps8, 5);
1634 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1635 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1636 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1637 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1638 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1639 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1640
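// Second pass: vertical filter over the 21 intermediate rows, again as two
// 8-wide halves; tmp is stepped with a 16-byte stride (one intermediate row).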
1641 for (i = 0; i < 2; ++i) {
1642 dst = dst_ptr + 8 * i;
1643 tmpp = tmp + 8 * i;
1644 d18u8 = vld1_u8(tmpp);
1645 tmpp += 16;
1646 d19u8 = vld1_u8(tmpp);
1647 tmpp += 16;
1648 d20u8 = vld1_u8(tmpp);
1649 tmpp += 16;
1650 d21u8 = vld1_u8(tmpp);
1651 tmpp += 16;
1652 d22u8 = vld1_u8(tmpp);
1653 tmpp += 16;
1654 for (j = 0; j < 4; ++j) {
1655 d23u8 = vld1_u8(tmpp);
1656 tmpp += 16;
1657 d24u8 = vld1_u8(tmpp);
1658 tmpp += 16;
1659 d25u8 = vld1_u8(tmpp);
1660 tmpp += 16;
1661 d26u8 = vld1_u8(tmpp);
1662 tmpp += 16;
1663
1664 q3u16 = vmull_u8(d18u8, d0u8);
1665 q4u16 = vmull_u8(d19u8, d0u8);
1666 q5u16 = vmull_u8(d20u8, d0u8);
1667 q6u16 = vmull_u8(d21u8, d0u8);
1668
1669 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1670 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1671 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1672 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1673
1674 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1675 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1676 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1677 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1678
1679 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1680 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1681 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1682 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1683
1684 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1685 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1686 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1687 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1688
1689 q7u16 = vmull_u8(d21u8, d3u8);
1690 q8u16 = vmull_u8(d22u8, d3u8);
1691 q9u16 = vmull_u8(d23u8, d3u8);
1692 q10u16 = vmull_u8(d24u8, d3u8);
1693
1694 q3s16 = vreinterpretq_s16_u16(q3u16);
1695 q4s16 = vreinterpretq_s16_u16(q4u16);
1696 q5s16 = vreinterpretq_s16_u16(q5u16);
1697 q6s16 = vreinterpretq_s16_u16(q6u16);
1698 q7s16 = vreinterpretq_s16_u16(q7u16);
1699 q8s16 = vreinterpretq_s16_u16(q8u16);
1700 q9s16 = vreinterpretq_s16_u16(q9u16);
1701 q10s16 = vreinterpretq_s16_u16(q10u16);
1702
1703 q7s16 = vqaddq_s16(q7s16, q3s16);
1704 q8s16 = vqaddq_s16(q8s16, q4s16);
1705 q9s16 = vqaddq_s16(q9s16, q5s16);
1706 q10s16 = vqaddq_s16(q10s16, q6s16);
1707
1708 d6u8 = vqrshrun_n_s16(q7s16, 7);
1709 d7u8 = vqrshrun_n_s16(q8s16, 7);
1710 d8u8 = vqrshrun_n_s16(q9s16, 7);
1711 d9u8 = vqrshrun_n_s16(q10s16, 7);
1712
1713 d18u8 = d22u8;
1714 d19u8 = d23u8;
1715 d20u8 = d24u8;
1716 d21u8 = d25u8;
1717 d22u8 = d26u8;
1718
1719 vst1_u8(dst, d6u8);
1720 dst += dst_pitch;
1721 vst1_u8(dst, d7u8);
1722 dst += dst_pitch;
1723 vst1_u8(dst, d8u8);
1724 dst += dst_pitch;
1725 vst1_u8(dst, d9u8);
1726 dst += dst_pitch;
1727 }
1728 }
1729 }
1730