// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (https://www.webmproject.org/code/)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// that here the prediction is read from *ref and the reconstruction is
// written to *dst. See the comments there for algorithmic explanations.

static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
static const int16_t kC2 =
    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.
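// (Assuming the usual libwebp values WEBP_TRANSFORM_AC3_C1 == 20091 and
// WEBP_TRANSFORM_AC3_C2 == 35468, these are the VP8 fixed-point
// approximations of sqrt(2)*cos(pi/8) - 1 and sqrt(2)*sin(pi/8), scaled by
// 1 << 16. kC2 is stored pre-halved because vqdmulh.s16 computes
// (2 * a * b) >> 16 and would otherwise double it.)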

// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional on
// the WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inlined assembly.
#if defined(WEBP_USE_INTRINSICS)

// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    const uint8_t* const ref,
                                    uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}

static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

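// Each call to TransformPass_NEON below performs one pass of the inverse
// transform on the two rows held in 'rows', followed by a transpose (so two
// calls give the full 2-D transform). Per column, a scalar sketch of the
// pass is (MUL1/MUL2 standing for the kC1/kC2 fixed-point multiplies):
//   a = in0 + in8                      b = in0 - in8
//   c = MUL2(in4) - MUL1(in12)         d = MUL1(in4) + MUL2(in12)
//   out0 = a + d    out1 = b + c    out2 = b - c    out3 = a - d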
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);       // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);       // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);       // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);    // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}

static void ITransformOne_NEON(const uint8_t* ref,
                               const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}

#else

static void ITransformOne_NEON(const uint8_t* ref,
                               const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  __asm__ volatile (
    "vld1.16 {q1, q2}, [%[in]]           \n"
    "vld1.16 {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp d3, d4                         \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16 q8, q2, d0[0]           \n"
    "vqdmulh.s16 q9, q2, d0[1]           \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16 d22, d2, d3               \n"
    "vqsub.s16 d23, d2, d3               \n"

    // q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16 q8, q8, #1                 \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16 q8, q2, q8                \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16 d20, d18, d17             \n"
    "vqadd.s16 d21, d19, d16             \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16 d2, d22, d21              \n"
    "vqadd.s16 d3, d23, d20              \n"
    "vqsub.s16 d4, d23, d20              \n"
    "vqsub.s16 d5, d22, d21              \n"

    "vzip.16 q1, q2                      \n"
    "vzip.16 q1, q2                      \n"

    "vswp d3, d4                         \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16 q8, q2, d0[0]           \n"
    "vqdmulh.s16 q9, q2, d0[1]           \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16 d22, d2, d3               \n"
    "vqsub.s16 d23, d2, d3               \n"

    "vshr.s16 q8, q8, #1                 \n"
    "vqadd.s16 q8, q2, q8                \n"

    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
199 "vqsub.s16 d20, d18, d17 \n"
200 "vqadd.s16 d21, d19, d16 \n"
201
202 // d2 = tmp[0] = a + d
203 // d3 = tmp[1] = b + c
204 // d4 = tmp[2] = b - c
205 // d5 = tmp[3] = a - d
206 "vqadd.s16 d2, d22, d21 \n"
207 "vqadd.s16 d3, d23, d20 \n"
208 "vqsub.s16 d4, d23, d20 \n"
209 "vqsub.s16 d5, d22, d21 \n"
210
211 "vld1.32 d6[0], [%[ref]], %[kBPS] \n"
212 "vld1.32 d6[1], [%[ref]], %[kBPS] \n"
213 "vld1.32 d7[0], [%[ref]], %[kBPS] \n"
214 "vld1.32 d7[1], [%[ref]], %[kBPS] \n"
215
216 "sub %[ref], %[ref], %[kBPS], lsl #2 \n"
217
    // ((val) + 4) >> 3
219 "vrshr.s16 d2, d2, #3 \n"
220 "vrshr.s16 d3, d3, #3 \n"
221 "vrshr.s16 d4, d4, #3 \n"
222 "vrshr.s16 d5, d5, #3 \n"
223
224 "vzip.16 q1, q2 \n"
225 "vzip.16 q1, q2 \n"
226
227 // Must accumulate before saturating
228 "vmovl.u8 q8, d6 \n"
229 "vmovl.u8 q9, d7 \n"
230
231 "vqadd.s16 q1, q1, q8 \n"
232 "vqadd.s16 q2, q2, q9 \n"
233
234 "vqmovun.s16 d0, q1 \n"
235 "vqmovun.s16 d1, q2 \n"
236
237 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
238 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
239 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
240 "vst1.32 d1[1], [%[dst]] \n"
241
242 : [in] "+r"(in), [dst] "+r"(dst) // modified registers
243 : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
244 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
245 );
246 }

#endif  // WEBP_USE_INTRINSICS

static void ITransform_NEON(const uint8_t* ref,
                            const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
  }
}

// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
  return vreinterpretq_u8_u32(out);
}

// Forward transform.

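// A scalar sketch of the forward transform computed below (inferred from the
// constants and comments in both implementations; d0..d3 are src - ref):
//   horizontal pass, per row i:
//     a0 = d0 + d3;  a1 = d1 + d2;  a2 = d1 - d2;  a3 = d0 - d3
//     tmp[0 + i * 4] = (a0 + a1) << 3
//     tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9
//     tmp[2 + i * 4] = (a0 - a1) << 3
//     tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9
//   vertical pass, per column i, with a0..a3 formed the same way from
//   tmp[0 + i], tmp[4 + i], tmp[8 + i], tmp[12 + i]:
//     out[0 + i]  = (a0 + a1 + 7) >> 4
//     out[4 + i]  = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0)
//     out[8 + i]  = (a0 - a1 + 7) >> 4
//     out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16
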
#if defined(WEBP_USE_INTRINSICS)

static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
                                              const int16x4_t B,
                                              const int16x4_t C,
                                              const int16x4_t D,
                                              int16x8_t* const out01,
                                              int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}

static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
                                              const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
}

static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4_NEON(src);
    const uint8x16_t R0 = Load4x4_NEON(ref);
    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0),
                                            vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {   // first pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {   // second pass
    // The (1 << 16) addition folds the "+ (a3 != 0)" term into the rounding
    // constant, using the identity (a3 != 0) == 1 - (a3 == 0).
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}

#else

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
  5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS] \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS] \n"
    "vld1.8 {d11}, [%[src_ptr]]          \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
    "vld1.8 {d15}, [%[ref_ptr]]          \n"

    // Pack the high values into q4 and q6
389 "vtrn.32 q4, q5 \n"
390 "vtrn.32 q6, q7 \n"
391
392 // d[0-3] = src - ref
393 "vsubl.u8 q0, d8, d12 \n"
394 "vsubl.u8 q1, d9, d13 \n"
395
396 // load coeff16 into q8(d16=5352, d17=2217)
397 "vld1.16 {q8}, [%[coeff16]] \n"
398
399 // load coeff32 high half into q9 = 1812, q10 = 937
400 "vld1.32 {q9, q10}, [%[coeff32]]! \n"
401
402 // load coeff32 low half into q11=12000, q12=51000
403 "vld1.32 {q11,q12}, [%[coeff32]] \n"
404
405 // part 1
406 // Transpose. Register dN is the same as dN in C
407 "vtrn.32 d0, d2 \n"
408 "vtrn.32 d1, d3 \n"
409 "vtrn.16 d0, d1 \n"
410 "vtrn.16 d2, d3 \n"
411
412 "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
413 "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
414 "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
415 "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
416
417 "vadd.s16 d0, d4, d5 \n" // a0 + a1
418 "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3
419 "vsub.s16 d2, d4, d5 \n" // a0 - a1
420 "vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16 q9, d7, d16               \n"   // a3*5352 + 1812
    "vmlal.s16 q10, d7, d17              \n"   // a3*2217 + 937
    "vmlal.s16 q9, d6, d17               \n"   // a2*2217 + a3*5352 + 1812
    "vmlsl.s16 q10, d6, d16              \n"   // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (a2*2217 + a3*5352 + 1812) >> 9
    // temp[3+i*4] = (a3*2217 + 937 - a2*5352) >> 9
429 "vshrn.s32 d1, q9, #9 \n"
430 "vshrn.s32 d3, q10, #9 \n"
431
432 // part 2
433 // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
434 "vtrn.32 d0, d2 \n"
435 "vtrn.32 d1, d3 \n"
436 "vtrn.16 d0, d1 \n"
437 "vtrn.16 d2, d3 \n"
438
439 "vmov.s16 d26, #7 \n"
440
441 "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
442 "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
443 "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
444 "vadd.s16 d4, d4, d26 \n" // a1 + 7
445 "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
446
447 "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
448 "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
449
450 "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000
451 "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000
452
453 "vceq.s16 d4, d7, #0 \n"
454
455 "vshr.s16 d0, d0, #4 \n"
456 "vshr.s16 d2, d2, #4 \n"
457
458 "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000
459 "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000
460
461 "vmvn d4, d4 \n" // !(d1 == 0)
462 // op[4] = (c1*2217 + d1*5352 + 12000)>>16
463 "vshrn.s32 d1, q11, #16 \n"
464 // op[4] += (d1!=0)
465 "vsub.s16 d1, d1, d4 \n"
466 // op[12]= (d1*2217 - c1*5352 + 51000)>>16
467 "vshrn.s32 d3, q12, #16 \n"
468
469 // set result to out array
470 "vst1.16 {q0, q1}, [%[out]] \n"
471 : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
472 [coeff32] "+r"(coeff32) // modified registers
473 : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
474 [out] "r"(out) // constants
475 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
476 "q10", "q11", "q12", "q13" // clobbered
477 );
478 }

#endif

#define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
} while (0)

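// Forward Walsh-Hadamard transform. The input samples are presumably the DC
// coefficients of the sixteen 4x4 sub-blocks, which is why consecutive
// samples are read 16 int16_t apart (the 'stride' below).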
static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);   // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);   // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);   // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);   // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
#undef LOAD_LANE_16b

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}

static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
    const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  int16x8x4_t q4_out;
  // tmp[0] = a0 + a1
  // tmp[1] = a3 + a2
  // tmp[2] = a3 - a2
  // tmp[3] = a0 - a1
  INIT_VECTOR4(q4_out,
               vabsq_s16(vaddq_s16(q_a0, q_a1)),
               vabsq_s16(vaddq_s16(q_a3, q_a2)),
               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  int16x8x4_t q4_out;

  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
  INIT_VECTOR4(d4_w,
               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
  return d4_w;
}

static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
                                           const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
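  // Note: the low half of each q4_in.val[] holds values from block 'a' and
  // the high half values from block 'b' (see how Disto4x4_NEON loads them),
  // so the multiply-accumulates below followed by the multiply-subtracts
  // yield sum(w * |T(a)|) - sum(w * |T(b)|) in a single pass.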
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}

#define LOAD_LANE_32b(src, VALUE, LANE) \
  (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
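// As a scalar sketch of what the code below computes: both 4x4 blocks are
// put through a 2-D Walsh-Hadamard transform, and the returned score is
//   abs(sum_i w[i] * |WHT(a)[i]|  -  sum_i w[i] * |WHT(b)[i]|) >> 5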
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
                         const uint16_t* const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // Do the vertical pass first to avoid an extra transpose: the vertical
    // and horizontal passes commute because w/kWeightY is symmetric.
    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
    // horizontal pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
#undef LOAD_LANE_32b

static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
                           const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_NEON(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

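// Each forward-transformed residual coefficient is binned as
// min(abs(coeff) >> 3, MAX_COEFF_THRESH) before being counted into the
// histogram.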
static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
                                  int start_block, int end_block,
                                  VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}

//------------------------------------------------------------------------------

static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
                                             const uint8_t* const b,
                                             uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}

// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
#if WEBP_AARCH64
  return (int)vaddvq_u32(sum);
#else
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
                                   vreinterpret_u32_u64(vget_high_u64(sum2)));
  return (int)vget_lane_u32(sum3, 0);
#endif
}

static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
  return SumToInt_NEON(sum);
}

static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
  const uint8x16_t a0 = Load4x4_NEON(a);
  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  return SumToInt_NEON(vaddq_u32(sum1, sum2));
}

//------------------------------------------------------------------------------

// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)

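// Quantizes eight coefficients at once. Per coefficient, a scalar sketch of
// what Quantize_NEON computes is roughly:
//   level = ((abs(in) + sharpen) * iq + bias) >> QFIX;   // QFIX == 17
//   level = min(level, MAX_LEVEL);
//   out   = (in < 0) ? -level : level;                   // value returned
//   in    = out * q;                                     // dequantized, stored back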
static int16x8_t Quantize_NEON(int16_t* const in,
                               const VP8Matrix* const mtx, int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
}

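// Byte indices (each pair selects one int16 lane out of the two results of
// Quantize_NEON) implementing the VP8 zigzag scan order
// {0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15}.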
static const uint8_t kShuffles[4][8] = {
  {  0,  1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};

static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
                              const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
#if defined(__APPLE__) && WEBP_AARCH64 && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out,
               vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out +  0) != 0) return 1;
  if (*(uint64_t*)(out +  4) != 0) return 1;
  if (*(uint64_t*)(out +  8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}

static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
                                const VP8Matrix* const mtx) {
  int nz;
  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}

#endif   // !WORK_AROUND_GCC

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8ITransform = ITransform_NEON;
  VP8FTransform = FTransform_NEON;

  VP8FTransformWHT = FTransformWHT_NEON;

  VP8TDisto4x4 = Disto4x4_NEON;
  VP8TDisto16x16 = Disto16x16_NEON;
  VP8CollectHistogram = CollectHistogram_NEON;

  VP8SSE16x16 = SSE16x16_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
  VP8SSE8x8 = SSE8x8_NEON;
  VP8SSE4x4 = SSE4x4_NEON;

#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock_NEON;
  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
#endif
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)

#endif  // WEBP_USE_NEON