// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (https://www.webmproject.org/code/)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// that the result is added to 'ref' and stored in 'dst'. See the comments
// there for algorithmic explanations.

static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
static const int16_t kC2 =
    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.
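// Note (reading aid, not from the original source): vqdmulhq_n_s16(x, k)
// computes (2 * x * k) >> 16, hence the halved constant above. The two
// products formed in TransformPass_NEON below thus match the scalar
//   MUL1(a) = a + ((a * kC1) >> 16)   // vsraq_n_s16(x, vqdmulhq_n_s16(x, kC1), 1)
//   MUL2(a) = (a * 2 * kC2) >> 16     // vqdmulhq_n_s16(x, kC2)
// of the reference inverse transform (modulo the rounding of the halved
// product).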

// This code works but is *slower* than the inline-asm version below
// (with gcc-4.6). With gcc-4.8 it is slightly faster than the inline
// assembly, so it is kept and enabled when WEBP_USE_INTRINSICS is defined.
#if defined(WEBP_USE_INTRINSICS)

// Treats 'v' as a uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    const uint8_t* const ref,
                                    uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the reference pixels.
  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);

    // Descale with rounding: out = ref + ((row + 4) >> 3).
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform, saturate and store.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}

static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}
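
// For reference, a scalar sketch of one pass above (illustrative only; it
// mirrors the register comments of the inline-asm version further below):
//   a = in[0] + in[8];
//   b = in[0] - in[8];
//   c = MUL2(in[4]) - MUL1(in[12]);
//   d = MUL1(in[4]) + MUL2(in[12]);
//   tmp[0] = a + d;  tmp[1] = b + c;  tmp[2] = b - c;  tmp[3] = a - d;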

static void ITransformOne_NEON(const uint8_t* ref,
                               const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}

#else

static void ITransformOne_NEON(const uint8_t* ref,
                               const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp            d3, d4                      \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    //  q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16        q8, q8, #1                  \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"

    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"

    // (val + 4) >> 3
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    // Must accumulate before saturating
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
  );
}

#endif    // WEBP_USE_INTRINSICS

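// Note (descriptive comment added for clarity): 'do_two' processes two
// horizontally adjacent 4x4 blocks; the second block's coefficients follow
// the first (in + 16) and its pixels sit four columns over (ref/dst + 4).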
static void ITransform_NEON(const uint8_t* ref,
                            const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
  }
}

// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
  return vreinterpretq_u8_u32(out);
}

// Forward transform.

#if defined(WEBP_USE_INTRINSICS)

static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
                                              const int16x4_t B,
                                              const int16x4_t C,
                                              const int16x4_t D,
                                              int16x8_t* const out01,
                                              int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}

static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
                                              const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
}

static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4_NEON(src);
    const uint8x16_t R0 = Load4x4_NEON(ref);
    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // 1st pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // 2nd pass
    // the (1 << 16) addition implements the '+ (a3 != 0)' term as
    // '+ 1 - (a3 == 0)': the extra 1 is folded into the rounding constant,
    // and the comparison mask (0 or -1) is added back below.
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
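
// A scalar sketch of the forward transform above (a reading aid mirroring
// the reference C implementation; the NEON code computes the same values):
//   // pass 1: for each row i, with d[] = src - ref:
//   a0 = d0 + d3;  a1 = d1 + d2;  a2 = d1 - d2;  a3 = d0 - d3;
//   tmp[0 + i * 4] = (a0 + a1) * 8;
//   tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
//   tmp[2 + i * 4] = (a0 - a1) * 8;
//   tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
//   // pass 2: for each column i:
//   a0 = tmp[0 + i] + tmp[12 + i];  a1 = tmp[4 + i] + tmp[8 + i];
//   a2 = tmp[4 + i] - tmp[8 + i];   a3 = tmp[0 + i] - tmp[12 + i];
//   out[0 + i] = (a0 + a1 + 7) >> 4;
//   out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
//   out[8 + i] = (a0 - a1 + 7) >> 4;
//   out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;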

#else

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d11}, [%[src_ptr]]               \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d15}, [%[ref_ptr]]               \n"

    // Pack the high values into q4 and q6
    "vtrn.32     q4, q5                       \n"
    "vtrn.32     q6, q7                       \n"

    // d[0-3] = src - ref
    "vsubl.u8    q0, d8, d12                  \n"
    "vsubl.u8    q1, d9, d13                  \n"

    // load coeff16 into q8(d16=5352, d17=2217)
    "vld1.16     {q8}, [%[coeff16]]           \n"

    // load coeff32 high half into q9 = 1812, q10 = 937
    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"

    // load coeff32 low half into q11=12000, q12=51000
    "vld1.32     {q11,q12}, [%[coeff32]]      \n"

    // part 1
    // Transpose. Register dN is the same as dN in C
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3

    "vadd.s16        d0, d4, d5               \n" // a0 + a1
    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
    "vsub.s16        d2, d4, d5               \n" // a0 - a1
    "vshl.s16        d2, d2, #3               \n" // temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    "vshrn.s32       d1, q9, #9               \n"
    "vshrn.s32       d3, q10, #9              \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vmov.s16        d26, #7                  \n"

    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
    "vadd.s16        d4, d4, d26              \n" // a1 + 7
    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]

    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7

    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000

    "vceq.s16        d4, d7, #0               \n"

    "vshr.s16        d0, d0, #4               \n"
    "vshr.s16        d2, d2, #4               \n"

    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000

    "vmvn            d4, d4                   \n" // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1!=0)
    "vsub.s16        d1, d1, d4               \n"
    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
    "vshrn.s32       d3, q12, #16             \n"

    // set result to out array
    "vst1.16         {q0, q1}, [%[out]]   \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)          // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                   // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"       // clobbered
  );
}

#endif   // WEBP_USE_INTRINSICS

#define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
} while (0)

static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
#undef LOAD_LANE_16b
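
// Note (reading aid): 'src' holds the 16 DC coefficients of a macroblock's
// luma blocks, one every 16 values (hence the stride of 16 above). The final
// '>> 1' of the reference WHT is folded into the vhaddq/vhsubq butterfly.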

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}

static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
    const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  int16x8x4_t q4_out;
  // tmp[0] = a0 + a1
  // tmp[1] = a3 + a2
  // tmp[2] = a3 - a2
  // tmp[3] = a0 - a1
  INIT_VECTOR4(q4_out,
               vabsq_s16(vaddq_s16(q_a0, q_a1)),
               vabsq_s16(vaddq_s16(q_a3, q_a2)),
               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
  return q4_out;
}
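
// Note: unlike the vertical pass below, the horizontal pass above already
// returns absolute values (vabsq_s16/vabdq_s16), since DistoSum_NEON only
// needs |coeff| for its weighted sum.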

static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  int16x8x4_t q4_out;

  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
  INIT_VECTOR4(d4_w,
               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
  return d4_w;
}

static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
                                           const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
  // The 'a' block sits in the low halves and the 'b' block in the high
  // halves, so the vmull/vmlsl pairs accumulate (sum_a - sum_b) directly.
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}

#define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
                         const uint16_t* const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
    // horizontal pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
#undef LOAD_LANE_32b
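
// A scalar sketch of Disto4x4 (reading aid): with H(x)[i] the 4x4 Hadamard
// transform of block x,
//   sum1 = sum_i w[i] * |H(a)[i]|
//   sum2 = sum_i w[i] * |H(b)[i]|
//   return abs(sum2 - sum1) >> 5;
// DistoSum_NEON accumulates both sums jointly through the low ('a') and
// high ('b') vector halves.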

static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
                           const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_NEON(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
                                  int start_block, int end_block,
                                  VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}
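
// Note: the binning above computes bin = min(abs(coeff) >> 3, MAX_COEFF_THRESH)
// for each of the 16 transform coefficients, like the scalar version.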

//------------------------------------------------------------------------------

static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
                                             const uint8_t* const b,
                                             uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}
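
// Note: since (a - b)^2 == |a - b|^2, the SSE routines use the unsigned
// absolute difference (vabd) followed by a widening square (vmull_u8), which
// cannot overflow for 8-bit inputs.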

// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
#if WEBP_AARCH64
  return (int)vaddvq_u32(sum);
#else
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
                                   vreinterpret_u32_u64(vget_high_u64(sum2)));
  return (int)vget_lane_u32(sum3, 0);
#endif
}

static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
  return SumToInt_NEON(sum);
}

static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
  const uint8x16_t a0 = Load4x4_NEON(a);
  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  return SumToInt_NEON(vaddq_u32(sum1, sum2));
}

//------------------------------------------------------------------------------

// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)

static int16x8_t Quantize_NEON(int16_t* const in,
                               const VP8Matrix* const mtx, int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
}
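
// A scalar sketch of Quantize_NEON's per-coefficient work (reading aid; the
// vhaddq_u32 + vshrn_n_u32 pair implements the '>> QFIX' in two steps):
//   level = ((abs(in) + sharpen) * iq + bias) >> QFIX;   // QFIX == 17
//   level = min(level, MAX_LEVEL);
//   sign  = (in < 0);
//   in    = (sign ? -level : level) * q;   // dequantized, stored back to 'in'
//   return (sign ? -level : level);        // quantized level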

static const uint8_t kShuffles[4][8] = {
  { 0,   1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};
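
// Note: read as pairs of byte indices into the two int16x8 results, these
// tables apply the VP8 zigzag order
// {0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15}.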

static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
                              const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
#if defined(__APPLE__) && WEBP_AARCH64 && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out +  0) != 0) return 1;
  if (*(uint64_t*)(out +  4) != 0) return 1;
  if (*(uint64_t*)(out +  8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}

static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
                                const VP8Matrix* const mtx) {
  int nz;
  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}

#endif   // !WORK_AROUND_GCC

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8ITransform = ITransform_NEON;
  VP8FTransform = FTransform_NEON;

  VP8FTransformWHT = FTransformWHT_NEON;

  VP8TDisto4x4 = Disto4x4_NEON;
  VP8TDisto16x16 = Disto16x16_NEON;
  VP8CollectHistogram = CollectHistogram_NEON;

  VP8SSE16x16 = SSE16x16_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
  VP8SSE8x8 = SSE8x8_NEON;
  VP8SSE4x4 = SSE4x4_NEON;

#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock_NEON;
  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
#endif
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)

#endif  // WEBP_USE_NEON