/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "shift_neon.h"
#include "txfm_neon.h"

#define TXFM_COS_BIT_MAX 13

// A note on butterfly helper naming:
//
// butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon
// e.g. butterfly_s32_s32_x4_0231_neon
//                |   |   |  ^ Weights are applied as indices 0, 2, 3, 1
//                |   |   |    (see more detail below)
//                |   |   ^ (int32)x4 input/output parameters
//                |   ^ 32-bit accumulators internally
//                ^ 32-bit input/output parameters
//
// Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to
// avoid needing separate negation instructions. This is represented in the
// helper naming by referring to the lane index in the loaded tuple that each
// multiply is performed with:
//
//        in0  in1
//      /----------
// out0 |  w0   w1   ==>  out0 = in0 * w0 + in1 * w1
// out1 |  w2   w3   ==>  out1 = in0 * w2 + in1 * w3
//
// So for indices 0231 from the earlier example, we end up with:
//
//          in0       in1
//      /------------------
// out0 | (lane 0) (lane 2)   ==>  out0 = in0 *   w0   + in1 *  -w0
// out1 | (lane 3) (lane 1)   ==>  out1 = in0 * (w0-1) + in1 * (1-w0)
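//
// Likewise, applying the same lane rules to the 0112 pattern used by many of
// the helpers below gives:
//
//          in0       in1
//      /------------------
// out0 | (lane 0) (lane 1)   ==>  out0 = in0 *   w0   + in1 * (1-w0)
// out1 | (lane 1) (lane 2)   ==>  out1 = in0 * (1-w0) + in1 *  -w0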

static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
                                  out0, out1)                                 \
  do {                                                                        \
    int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0);                          \
    u0 = vmlal_lane_s16(u0, in1, wvec, lane1);                                \
    int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2);                          \
    v0 = vmlal_lane_s16(v0, in1, wvec, lane3);                                \
    *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX);                              \
    *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX);                              \
  } while (0)

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
}

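// The x8 macro below is the 128-bit counterpart of the x4 macro above: since
// vmull_lane_s16/vmlal_lane_s16 widen only 4 lanes at a time, each int16x8_t
// input is split with vget_low_s16/vget_high_s16, accumulated in 32 bits,
// then narrowed and recombined with vcombine_s16.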
#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
                                  out0, out1)                                 \
  do {                                                                        \
    int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0);            \
    u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1);                  \
    int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0);           \
    u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1);                 \
    int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2);            \
    v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3);                  \
    int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2);           \
    v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3);                 \
    const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX);                  \
    const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX);                  \
    const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX);                  \
    const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX);                  \
    *out0 = vcombine_s16(c0, c1);                                             \
    *out1 = vcombine_s16(d0, d1);                                             \
  } while (0)

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out,
                                             int size) {
  for (int i = 0; i < size; ++i) {
    out[size - i - 1] = in[i];
  }
}

static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out,
                                             int size) {
  for (int i = 0; i < size; ++i) {
    out[size - i - 1] = in[i];
  }
}

static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8(
    int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2,
    const int stride, const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + stride * i, in1[i]);
    vst1q_s32(out + stride * i + 4, in2[i]);
  }
}

static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in,
                                                const int stride,
                                                int16x4_t *const out,
                                                const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = vld1_s16(in);
    in += stride;
  }
}

static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
                                                int16x8_t *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = vld1q_s16(in + i * stride);
  }
}

static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in,
                                                 int32_t *const out,
                                                 const int stride,
                                                 const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + i * stride, vmovl_s16(in[i]));
  }
}

static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in,
                                                 int32_t *const out,
                                                 const int stride,
                                                 const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i])));
    vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i])));
  }
}

// A note on naming:
//   round_shift_[sqrt2]_s16_s32_4x1_neon(...)
//                |      |   |     ^ 1 => a single vector
//                |      |   |       n => an array of vectors
//                |      |   |   ^ input/output vector element count
//                |      |   ^ output type
//                |      ^ input type
//                ^ multiplicand and shift identifier
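//
// For example, round_shift_sqrt2_s16_s16_4x1_neon computes, per lane,
// round(a * NewSqrt2 / 2^NewSqrt2Bits), i.e. an approximate multiply by
// sqrt(2), NewSqrt2 being the fixed-point representation of sqrt(2) defined
// in av1/common/av1_txfm.h.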

static AOM_FORCE_INLINE int16x4_t
round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) {
  return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
}

static AOM_FORCE_INLINE int16x8_t
round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) {
  return vcombine_s16(round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
                      round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
}

static AOM_FORCE_INLINE int16x4_t
round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) {
  return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits);
}

static AOM_FORCE_INLINE int16x8_t
round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) {
  return vcombine_s16(round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
                      round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
}

static AOM_FORCE_INLINE int32x4_t
round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) {
  return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
}

static AOM_FORCE_INLINE int32x4_t
round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) {
  return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits);
}

#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn)                 \
  static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) { \
    for (int i = 0; i < size; ++i) {                                         \
      out[i] = fn(in[i]);                                                    \
    }                                                                        \
  }

ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t,
                             int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t,
                             int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t,
                             int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t,
                             int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
                             int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)

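// Rectangular (non-square) transform sizes carry an extra sqrt(2)
// normalization, which the "rect" stores below fold into the write-out via
// round_shift_sqrt2_s16_s32_4x1_neon.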
static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
                                                      int32_t *const out,
                                                      const int stride,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i]));
  }
}

static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
                                                      int32_t *const out,
                                                      const int stride,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + i * stride + 0,
              round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i])));
    vst1q_s32(out + i * stride + 4,
              round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i])));
  }
}

static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  int32x4_t u[6], v[6];
  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
  const int16x4_t u01 = vqadd_s16(input[0], input[1]);

  v[5] = vmull_lane_s16(input[2], sinpi, 2);
  v[0] = vmull_lane_s16(input[1], sinpi, 1);
  v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0);
  v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3);
  v[2] = vmull_lane_s16(u01, sinpi, 2);
  v[3] = vmull_lane_s16(input[0], sinpi, 3);
  v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0);
  v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1);

  u[0] = vaddq_s32(v[0], v[1]);
  u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2);
  u[2] = vsubq_s32(v[3], v[4]);
  u[3] = vsubq_s32(u[2], u[0]);
  u[3] = vmlaq_n_s32(u[3], v[5], 3);

  output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
  output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
  output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
  output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
}

static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 1-2
  int16x4_t x2[8];
  butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
  butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);

  // stage 3
  int16x4_t x3[8];
  x3[0] = vqadd_s16(input[0], x2[2]);
  x3[1] = vqsub_s16(x2[3], input[7]);
  x3[2] = vqsub_s16(input[0], x2[2]);
  x3[3] = vqadd_s16(input[7], x2[3]);
  x3[4] = vqsub_s16(x2[6], input[1]);
  x3[5] = vqadd_s16(input[6], x2[7]);
  x3[6] = vqadd_s16(input[1], x2[6]);
  x3[7] = vqsub_s16(input[6], x2[7]);

  // stage 4
  int16x4_t x4[8];
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]);

  // stage 5
  int16x4_t x5[8];
  x5[0] = vqadd_s16(x3[0], x4[4]);
  x5[1] = vqadd_s16(x3[1], x4[5]);
  x5[2] = vqadd_s16(x3[2], x4[6]);
  x5[3] = vqsub_s16(x4[7], x3[3]);
  x5[4] = vqsub_s16(x3[0], x4[4]);
  x5[5] = vqsub_s16(x3[1], x4[5]);
  x5[6] = vqsub_s16(x3[2], x4[6]);
  x5[7] = vqadd_s16(x3[3], x4[7]);

  // stage 6-7
  butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
  butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
  butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
  butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
}

static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  int32x4_t u_lo[4], u_hi[4];
  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
  const int16x8_t u01 = vqaddq_s16(input[0], input[1]);

  u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1);
  u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2);

  u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2);
  u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2);

  u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3);
  u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3);

  u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0);
  u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0);

  u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1);
  u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1);

  u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2);
  u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2);

  u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2);
  u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2);

  u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
  u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);

  const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3);
  u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2);
  u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2);

  output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX));
  output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX));
  output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX));
  output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX));
}

static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
                                          int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);
  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]);

  int16x4_t in12a = vadd_s16(input[1], input[2]);
  int16x4_t in12s = vsub_s16(input[1], input[2]);
  int16x4_t in03a = vadd_s16(input[0], input[3]);
  int16x4_t in03s = vsub_s16(input[0], input[3]);

  int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]);
  int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]);

  int32x4_t u[4];
  u[0] = vaddq_s32(u0ad1, u0ad2);
  u[1] = vsubq_s32(u0ad2, u0ad1);
  u[2] = vmull_lane_s16(in12s, cospi16, 1);
  u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0);
  u[3] = vmull_lane_s16(in03s, cospi16, 1);
  u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0);

  output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
  output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
  output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
  output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
}

// Butterfly pre-processing:
// e.g. n=4:
//   out[0] = in[0] + in[3]
//   out[1] = in[1] + in[2]
//   out[2] = in[1] - in[2]
//   out[3] = in[0] - in[3]
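//
// In scalar terms the helpers below are equivalent to (with saturation):
//   for (i = 0; i < n/2; ++i) out[i]       = in[i] + in[n - 1 - i];
//   for (i = 0; i < n/2; ++i) out[n/2 + i] = in[n/2 - 1 - i] - in[n/2 + i];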

static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input,
                                                      int16x4_t *output,
                                                      int n) {
  for (int i = 0; i < n / 2; ++i) {
    output[i] = vqadd_s16(input[i], input[n - i - 1]);
  }
  for (int i = 0; i < n / 2; ++i) {
    output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]);
  }
}

static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input,
                                                      int16x8_t *output,
                                                      int n) {
  for (int i = 0; i < n / 2; ++i) {
    output[i] = vqaddq_s16(input[i], input[n - i - 1]);
  }
  for (int i = 0; i < n / 2; ++i) {
    output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]);
  }
}

static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input,
                                                      int32x4_t *output,
                                                      int n) {
  for (int i = 0; i < n / 2; ++i) {
    output[i] = vqaddq_s32(input[i], input[n - i - 1]);
  }
  for (int i = 0; i < n / 2; ++i) {
    output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
  }
}

// Butterfly post-processing:
// e.g. n=8:
//   out[0] = in0[0] + in1[3];
//   out[1] = in0[1] + in1[2];
//   out[2] = in0[1] - in1[2];
//   out[3] = in0[0] - in1[3];
//   out[4] = in0[7] - in1[4];
//   out[5] = in0[6] - in1[5];
//   out[6] = in0[6] + in1[5];
//   out[7] = in0[7] + in1[4];
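//
// In scalar terms the helpers below are equivalent to (with saturation):
//   for (i = 0; i < n/4; ++i) out[i]           = in0[i] + in1[n/2 - 1 - i];
//   for (i = 0; i < n/4; ++i) out[n/4 + i]     = in0[n/4 - 1 - i] - in1[n/4 + i];
//   for (i = 0; i < n/4; ++i) out[n/2 + i]     = in0[n - 1 - i] - in1[n/2 + i];
//   for (i = 0; i < n/4; ++i) out[3*n/4 + i]   = in0[3*n/4 + i] + in1[3*n/4 - 1 - i];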

static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0,
                                                       const int16x4_t *in1,
                                                       int16x4_t *output,
                                                       int n) {
  for (int i = 0; i < n / 4; ++i) {
    output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[(3 * n) / 4 + i] =
        vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
  }
}

static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0,
                                                       const int16x8_t *in1,
                                                       int16x8_t *output,
                                                       int n) {
  for (int i = 0; i < n / 4; ++i) {
    output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[(3 * n) / 4 + i] =
        vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
  }
}

static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0,
                                                       const int32x4_t *in1,
                                                       int32x4_t *output,
                                                       int n) {
  for (int i = 0; i < n / 4; ++i) {
    output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[(3 * n) / 4 + i] =
        vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
  }
}

static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input,
                                          int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);

  // stage 1
  int16x8_t x1[4];
  butterfly_dct_pre_s16_x8(input, x1, 4);

  // stage 2
  int16x8_t x2[4];
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]);

  // stage 3
  output[0] = x2[0];
  output[1] = x2[2];
  output[2] = x2[1];
  output[3] = x2[3];
}

static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input,
                                          int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);

  // stage 1
  int16x4_t x1[8];
  butterfly_dct_pre_s16_x4(input, x1, 8);

  // stage 2
  int16x4_t x2[8];
  butterfly_dct_pre_s16_x4(x1, x2, 4);
  butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);

  // stage 3
  int16x4_t x3[8];
  butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
  butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4);

  // stage 4-5
  butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
}

static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input,
                                          int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);

  // stage 1
  int16x8_t x1[8];
  butterfly_dct_pre_s16_x8(input, x1, 8);

  // stage 2
  int16x8_t x2[8];
  butterfly_dct_pre_s16_x8(x1, x2, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);

  // stage 3
  int16x8_t x3[8];
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
  butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4);

  // stage 4-5
  butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
}

static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 1
  int16x4_t x1[16];
  butterfly_dct_pre_s16_x4(input, x1, 16);

  // stage 2
  int16x4_t x2[16];
  butterfly_dct_pre_s16_x4(x1, x2, 8);
  butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
  butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);

  // stage 3
  int16x4_t x3[16];
  butterfly_dct_pre_s16_x4(x2, x3, 4);
  butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
  butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8);

  // stage 4
  int16x4_t x4[16];
  butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4],
                                 &output[12]);
  butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
  butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);

  // stage 5
  int16x4_t x5[16];
  butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10],
                                 &output[6]);
  butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4);
  butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4);

  // stage 6-7
  butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1],
                                 &output[15]);
  butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9],
                                 &output[7]);
  butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5],
                                 &output[11]);
  butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13],
                                 &output[3]);
}

static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 1
  int16x8_t x1[16];
  butterfly_dct_pre_s16_x8(input, x1, 16);

  // stage 2
  int16x8_t x2[16];
  butterfly_dct_pre_s16_x8(x1, x2, 8);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);

  // stage 3
  int16x8_t x3[16];
  butterfly_dct_pre_s16_x8(x2, x3, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
  butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8);

  // stage 4
  int16x8_t x4[16];
  butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4],
                                 &output[12]);
  butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);

  // stage 5
  int16x8_t x5[16];
  butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10],
                                 &output[6]);
  butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4);
  butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4);

  // stage 6-7
  butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1],
                                 &output[15]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9],
                                 &output[7]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5],
                                 &output[11]);
  butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13],
                                 &output[3]);
}

static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // stage 1
  int16x8_t x1[32];
  butterfly_dct_pre_s16_x8(input, x1, 32);

  // stage 2
  int16x8_t x2[32];
  butterfly_dct_pre_s16_x8(x1, x2, 16);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]);

  // stage 3
  int16x8_t x3[32];
  butterfly_dct_pre_s16_x8(x2, x3, 8);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]);
  butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16);

  // stage 4
  int16x8_t x4[32];
  butterfly_dct_pre_s16_x8(x3, x4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]);
  butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]);

  // stage 5
  int16x8_t x5[32];
  butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0],
                                 &output[16]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8],
                                 &output[24]);
  butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]);
  butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8);
  butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8);

  // stage 6
  int16x8_t x6[32];
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20],
                                 &output[12]);
  butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4);
  butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4);
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]);
  butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]);
  butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]);

  // stage 7
  int16x8_t x7[32];
  butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2],
                                 &output[30]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18],
                                 &output[14]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10],
                                 &output[22]);
  butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26],
                                 &output[6]);
  butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4);
  butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4);
  butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4);
  butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4);

  butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1],
                                 &output[31]);
  butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17],
                                 &output[15]);
  butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9],
                                 &output[23]);
  butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25],
                                 &output[7]);
  butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5],
                                 &output[27]);
  butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21],
                                 &output[11]);
  butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13],
                                 &output[19]);
  butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29],
                                 &output[3]);
}
917 
fdct8x64_neon(const int16x8_t * input,int16x8_t * output,int cos_bit)918 static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input,
919                                            int16x8_t *output, int cos_bit) {
920   const int16_t *cospi = cospi_arr_q13(cos_bit);
921 
922   const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
923   const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
924   const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
925   const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
926   const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
927   const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
928   const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
929   const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
930   const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
931   const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
932   const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
933   const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
934   const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
935   const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
936   const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
937   const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
938 
939   const int16x4_t cospi32 = vget_low_s16(cospi32_16);
940   const int16x4_t cospi16 = vget_high_s16(cospi32_16);
941   const int16x4_t cospi8 = vget_low_s16(cospi8_24);
942   const int16x4_t cospi24 = vget_high_s16(cospi8_24);
943   const int16x4_t cospi4 = vget_low_s16(cospi4_12);
944   const int16x4_t cospi12 = vget_high_s16(cospi4_12);
945   const int16x4_t cospi20 = vget_low_s16(cospi20_28);
946   const int16x4_t cospi28 = vget_high_s16(cospi20_28);
947   const int16x4_t cospi2 = vget_low_s16(cospi2_6);
948   const int16x4_t cospi6 = vget_high_s16(cospi2_6);
949   const int16x4_t cospi10 = vget_low_s16(cospi10_14);
950   const int16x4_t cospi14 = vget_high_s16(cospi10_14);
951   const int16x4_t cospi18 = vget_low_s16(cospi18_22);
952   const int16x4_t cospi22 = vget_high_s16(cospi18_22);
953   const int16x4_t cospi26 = vget_low_s16(cospi26_30);
954   const int16x4_t cospi30 = vget_high_s16(cospi26_30);
955   const int16x4_t cospi1 = vget_low_s16(cospi1_3);
956   const int16x4_t cospi3 = vget_high_s16(cospi1_3);
957   const int16x4_t cospi5 = vget_low_s16(cospi5_7);
958   const int16x4_t cospi7 = vget_high_s16(cospi5_7);
959   const int16x4_t cospi9 = vget_low_s16(cospi9_11);
960   const int16x4_t cospi11 = vget_high_s16(cospi9_11);
961   const int16x4_t cospi13 = vget_low_s16(cospi13_15);
962   const int16x4_t cospi15 = vget_high_s16(cospi13_15);
963   const int16x4_t cospi17 = vget_low_s16(cospi17_19);
964   const int16x4_t cospi19 = vget_high_s16(cospi17_19);
965   const int16x4_t cospi21 = vget_low_s16(cospi21_23);
966   const int16x4_t cospi23 = vget_high_s16(cospi21_23);
967   const int16x4_t cospi25 = vget_low_s16(cospi25_27);
968   const int16x4_t cospi27 = vget_high_s16(cospi25_27);
969   const int16x4_t cospi29 = vget_low_s16(cospi29_31);
970   const int16x4_t cospi31 = vget_high_s16(cospi29_31);
971 
972   // stage 1
973   int16x8_t x1[64];
974   butterfly_dct_pre_s16_x8(input, x1, 64);
975 
976   // stage 2
977   int16x8_t x2[64];
978   butterfly_dct_pre_s16_x8(x1, x2, 32);
979   butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
980   butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
981   butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
982   butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
983   butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
984   butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
985   butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
986   butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
987 
988   // stage 3
989   int16x8_t x3[64];
990   butterfly_dct_pre_s16_x8(x2, x3, 16);
991   x3[16] = x2[16];
992   x3[17] = x2[17];
993   x3[18] = x2[18];
994   x3[19] = x2[19];
995   butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
996   butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
997   butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
998   butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
999   x3[28] = x2[28];
1000   x3[29] = x2[29];
1001   x3[30] = x2[30];
1002   x3[31] = x2[31];
1003   butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32);
1004 
1005   // stage 4
1006   int16x8_t x4[64];
1007   butterfly_dct_pre_s16_x8(x3, x4, 8);
1008   butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
1009   butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
1010   butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16);
1011   butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
1012   butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
1013   butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
1014   butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
1015   butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
1016   butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
1017   butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
1018   butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
1019 
1020   // stage 5
1021   int16x8_t x5[64];
1022   butterfly_dct_pre_s16_x8(x4, x5, 4);
1023   butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
1024   butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8);
1025   butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
1026   butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
1027   butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
1028   butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
1029   butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16);
1030   butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16);
1031 
1032   // stage 6
1033   int16x8_t x6[64];
1034   butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]);
1035   butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
1036   butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4);
1037   butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
1038   butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
1039   butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8);
1040   butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8);
1041   butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
1042   butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
1043   butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
  butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
  butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
  butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);

  // stage 7
  int16x8_t x7[64];
  butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
  butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4);
  butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4);
  butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
  butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
  butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
  butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8);
  butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8);
  butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8);
  butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8);

  // stage 8
  int16x8_t x8[64];
  butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
  butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
  butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4);
  butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4);
  butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4);
  butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4);
  butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
  butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
  butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
  butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
  butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
  butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);

  // stage 9
  int16x8_t x9[64];
  butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
  butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
  butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
  butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
  butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
  butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
  butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
  butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
  butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4);
  butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4);
  butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4);
  butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4);
  butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4);
  butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4);
  butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4);
  butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4);

  // stage 10
  butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1],
                                 &output[63]);
  butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33],
                                 &output[31]);
  butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17],
                                 &output[47]);
  butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49],
                                 &output[15]);
  butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9],
                                 &output[55]);
  butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41],
                                 &output[23]);
  butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25],
                                 &output[39]);
  butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57],
                                 &output[7]);
  butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5],
                                 &output[59]);
  butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37],
                                 &output[27]);
  butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21],
                                 &output[43]);
  butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53],
                                 &output[11]);
  butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13],
                                 &output[51]);
  butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45],
                                 &output[19]);
  butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29],
                                 &output[35]);
  butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61],
                                 &output[3]);

  // stage 11
  output[0] = x6[0];
  output[2] = x9[16];
  output[4] = x8[8];
  output[6] = x9[24];
  output[8] = x7[4];
  output[10] = x9[20];
  output[12] = x8[12];
  output[14] = x9[28];
  output[16] = x6[2];
  output[18] = x9[18];
  output[20] = x8[10];
  output[22] = x9[26];
  output[24] = x7[6];
  output[26] = x9[22];
  output[28] = x8[14];
  output[30] = x9[30];
  output[32] = x6[1];
  output[34] = x9[17];
  output[36] = x8[9];
  output[38] = x9[25];
  output[40] = x7[5];
  output[42] = x9[21];
  output[44] = x8[13];
  output[46] = x9[29];
  output[48] = x6[3];
  output[50] = x9[19];
  output[52] = x8[11];
  output[54] = x9[27];
  output[56] = x7[7];
  output[58] = x9[23];
  output[60] = x8[15];
  output[62] = x9[31];
}

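// 8-point forward ADST over eight lanes. The stage numbering follows the
// scalar av1_fadst8 lattice: the rotations use the Q2.13 butterfly helpers
// above, and the remaining stages are saturating adds/subtracts.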
static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 2
  int16x8_t x2[8];
  butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
  butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);

  // stage 3
  int16x8_t x3[8];
  x3[0] = vqaddq_s16(input[0], x2[2]);
  x3[1] = vqsubq_s16(x2[3], input[7]);
  x3[2] = vqsubq_s16(input[0], x2[2]);
  x3[3] = vqaddq_s16(input[7], x2[3]);
  x3[4] = vqsubq_s16(x2[6], input[1]);
  x3[5] = vqaddq_s16(input[6], x2[7]);
  x3[6] = vqaddq_s16(input[1], x2[6]);
  x3[7] = vqsubq_s16(input[6], x2[7]);

  // stage 4
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);

  // stage 5
  int16x8_t x5[8];
  x5[0] = vqaddq_s16(x3[0], x3[4]);
  x5[1] = vqaddq_s16(x3[1], x3[5]);
  x5[2] = vqaddq_s16(x3[2], x3[6]);
  x5[3] = vqsubq_s16(x3[7], x3[3]);
  x5[4] = vqsubq_s16(x3[0], x3[4]);
  x5[5] = vqsubq_s16(x3[1], x3[5]);
  x5[6] = vqsubq_s16(x3[2], x3[6]);
  x5[7] = vqaddq_s16(x3[3], x3[7]);

  // stage 6
  butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
  butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
}

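// 16-point forward ADST over four lanes, used for 4-wide blocks; the stage
// structure follows the scalar av1_fadst16 lattice.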
static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
                                            int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // stage 2
  int16x4_t x2[8];
  butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
  butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
  butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
  butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);

  // stage 3
  int16x4_t x3[16];
  x3[0] = vqadd_s16(input[0], x2[0]);
  x3[1] = vqsub_s16(x2[1], input[15]);
  x3[2] = vqsub_s16(input[0], x2[0]);
  x3[3] = vqadd_s16(input[15], x2[1]);
  x3[4] = vqsub_s16(x2[2], input[3]);
  x3[5] = vqadd_s16(input[12], x2[3]);
  x3[6] = vqadd_s16(input[3], x2[2]);
  x3[7] = vqsub_s16(input[12], x2[3]);
  x3[8] = vqsub_s16(x2[4], input[1]);
  x3[9] = vqadd_s16(input[14], x2[5]);
  x3[10] = vqadd_s16(input[1], x2[4]);
  x3[11] = vqsub_s16(input[14], x2[5]);
  x3[12] = vqadd_s16(input[2], x2[6]);
  x3[13] = vqsub_s16(x2[7], input[13]);
  x3[14] = vqsub_s16(input[2], x2[6]);
  x3[15] = vqadd_s16(input[13], x2[7]);

  // stage 4
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
  butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);

  // stage 5
  int16x4_t x5[16];
  x5[0] = vqadd_s16(x3[0], x3[4]);
  x5[1] = vqadd_s16(x3[1], x3[5]);
  x5[2] = vqadd_s16(x3[2], x3[6]);
  x5[3] = vqsub_s16(x3[7], x3[3]);
  x5[4] = vqsub_s16(x3[0], x3[4]);
  x5[5] = vqsub_s16(x3[1], x3[5]);
  x5[6] = vqsub_s16(x3[2], x3[6]);
  x5[7] = vqadd_s16(x3[3], x3[7]);
  x5[8] = vqadd_s16(x3[8], x3[12]);
  x5[9] = vqadd_s16(x3[9], x3[13]);
  x5[10] = vqsub_s16(x3[14], x3[10]);
  x5[11] = vqadd_s16(x3[11], x3[15]);
  x5[12] = vqsub_s16(x3[8], x3[12]);
  x5[13] = vqsub_s16(x3[9], x3[13]);
  x5[14] = vqadd_s16(x3[10], x3[14]);
  x5[15] = vqsub_s16(x3[11], x3[15]);

  // stage 6
  butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
  butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);

  // stage 7
  int16x4_t x7[16];
  x7[0] = vqadd_s16(x5[0], x5[8]);
  x7[1] = vqadd_s16(x5[1], x5[9]);
  x7[2] = vqadd_s16(x5[2], x5[10]);
  x7[3] = vqadd_s16(x5[3], x5[11]);
  x7[4] = vqadd_s16(x5[4], x5[12]);
  x7[5] = vqadd_s16(x5[5], x5[13]);
  x7[6] = vqadd_s16(x5[6], x5[14]);
  x7[7] = vqsub_s16(x5[15], x5[7]);
  x7[8] = vqsub_s16(x5[0], x5[8]);
  x7[9] = vqsub_s16(x5[1], x5[9]);
  x7[10] = vqsub_s16(x5[2], x5[10]);
  x7[11] = vqsub_s16(x5[3], x5[11]);
  x7[12] = vqsub_s16(x5[4], x5[12]);
  x7[13] = vqsub_s16(x5[5], x5[13]);
  x7[14] = vqsub_s16(x5[6], x5[14]);
  x7[15] = vqadd_s16(x5[7], x5[15]);

  // stage 8
  butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
  butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13],
                                 &output[2]);
  butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11],
                                 &output[4]);
  butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
  butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
  butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5],
                                 &output[10]);
  butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3],
                                 &output[12]);
  butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14],
                                 &output[1]);
}

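// Eight-lane variant of the 16-point forward ADST; same lattice as
// fadst4x16_neon, but operating on full q registers.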
static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
                                            int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // stage 2
  int16x8_t x2[8];
  butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
  butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
  butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
  butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);

  // stage 3
  int16x8_t x3[16];
  x3[0] = vqaddq_s16(input[0], x2[0]);
  x3[1] = vqsubq_s16(x2[1], input[15]);
  x3[2] = vqsubq_s16(input[0], x2[0]);
  x3[3] = vqaddq_s16(input[15], x2[1]);
  x3[4] = vqsubq_s16(x2[2], input[3]);
  x3[5] = vqaddq_s16(input[12], x2[3]);
  x3[6] = vqaddq_s16(input[3], x2[2]);
  x3[7] = vqsubq_s16(input[12], x2[3]);
  x3[8] = vqsubq_s16(x2[4], input[1]);
  x3[9] = vqaddq_s16(input[14], x2[5]);
  x3[10] = vqaddq_s16(input[1], x2[4]);
  x3[11] = vqsubq_s16(input[14], x2[5]);
  x3[12] = vqaddq_s16(input[2], x2[6]);
  x3[13] = vqsubq_s16(x2[7], input[13]);
  x3[14] = vqsubq_s16(input[2], x2[6]);
  x3[15] = vqaddq_s16(input[13], x2[7]);

  // stage 4
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
  butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);

  // stage 5
  int16x8_t x5[16];
  x5[0] = vqaddq_s16(x3[0], x3[4]);
  x5[1] = vqaddq_s16(x3[1], x3[5]);
  x5[2] = vqaddq_s16(x3[2], x3[6]);
  x5[3] = vqsubq_s16(x3[7], x3[3]);
  x5[4] = vqsubq_s16(x3[0], x3[4]);
  x5[5] = vqsubq_s16(x3[1], x3[5]);
  x5[6] = vqsubq_s16(x3[2], x3[6]);
  x5[7] = vqaddq_s16(x3[3], x3[7]);
  x5[8] = vqaddq_s16(x3[8], x3[12]);
  x5[9] = vqaddq_s16(x3[9], x3[13]);
  x5[10] = vqsubq_s16(x3[14], x3[10]);
  x5[11] = vqaddq_s16(x3[11], x3[15]);
  x5[12] = vqsubq_s16(x3[8], x3[12]);
  x5[13] = vqsubq_s16(x3[9], x3[13]);
  x5[14] = vqaddq_s16(x3[10], x3[14]);
  x5[15] = vqsubq_s16(x3[11], x3[15]);

  // stage 6
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
  butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);

  // stage 7
  int16x8_t x7[16];
  x7[0] = vqaddq_s16(x5[0], x5[8]);
  x7[1] = vqaddq_s16(x5[1], x5[9]);
  x7[2] = vqaddq_s16(x5[2], x5[10]);
  x7[3] = vqaddq_s16(x5[3], x5[11]);
  x7[4] = vqaddq_s16(x5[4], x5[12]);
  x7[5] = vqaddq_s16(x5[5], x5[13]);
  x7[6] = vqaddq_s16(x5[6], x5[14]);
  x7[7] = vqsubq_s16(x5[15], x5[7]);
  x7[8] = vqsubq_s16(x5[0], x5[8]);
  x7[9] = vqsubq_s16(x5[1], x5[9]);
  x7[10] = vqsubq_s16(x5[2], x5[10]);
  x7[11] = vqsubq_s16(x5[3], x5[11]);
  x7[12] = vqsubq_s16(x5[4], x5[12]);
  x7[13] = vqsubq_s16(x5[5], x5[13]);
  x7[14] = vqsubq_s16(x5[6], x5[14]);
  x7[15] = vqaddq_s16(x5[7], x5[15]);

  // stage 8
  butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
  butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13],
                                 &output[2]);
  butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11],
                                 &output[4]);
  butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
  butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
  butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5],
                                 &output[10]);
  butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3],
                                 &output[12]);
  butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14],
                                 &output[1]);
}

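// Identity "transforms" only apply the AV1 per-size scaling: sqrt(2) for
// 4-point, 2 for 8-point, 2*sqrt(2) for 16-point and 4 for 32-point. The
// irrational factors use the rounding-shift helpers; the powers of two
// reduce to plain left shifts.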
static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input,
                                               int16x4_t *const output,
                                               const int cos_bit) {
  (void)cos_bit;
  round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4);
}

static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input,
                                               int16x8_t *const output,
                                               const int cos_bit) {
  (void)cos_bit;
  round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4);
}

static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input,
                                               int16x4_t *output, int cos_bit) {
  (void)cos_bit;
  shift_left_1_s16_x4(input, output, 8);
}

static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input,
                                               int16x8_t *output, int cos_bit) {
  (void)cos_bit;
  shift_left_1_s16_x8(input, output, 8);
}

static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input,
                                                int16x4_t *output,
                                                int cos_bit) {
  (void)cos_bit;
  round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16);
}

static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input,
                                                int16x8_t *output,
                                                int cos_bit) {
  (void)cos_bit;
  round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16);
}

static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input,
                                                int16x8_t *output,
                                                int cos_bit) {
  (void)cos_bit;
  shift_left_2_s16_x8(input, output, 32);
}

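// Generate the column-pass wrappers: load an n-row block of tw-lane
// vectors, apply the stage-0 up-shift of 2 bits, then run the named 1D
// transform over the columns.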
#define TRANSFORM_COL(name, tw, n)                                          \
  static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \
                              int stride, int cos_bit) {                    \
    int16x##tw##_t buf0[n];                                                 \
    load_buffer_s16_x##tw(input, stride, buf0, n);                          \
    shift_left_2_s16_x##tw(buf0, buf0, n);                                  \
    name##_neon(buf0, output, cos_bit);                                     \
  }

TRANSFORM_COL(fadst4x4, 4, 4)
TRANSFORM_COL(fadst4x8, 4, 8)
TRANSFORM_COL(fadst4x16, 4, 16)
TRANSFORM_COL(fadst8x4, 8, 4)
TRANSFORM_COL(fadst8x8, 8, 8)
TRANSFORM_COL(fadst8x16, 8, 16)
TRANSFORM_COL(fdct4x4, 4, 4)
TRANSFORM_COL(fdct4x8, 4, 8)
TRANSFORM_COL(fdct4x16, 4, 16)
TRANSFORM_COL(fdct8x4, 8, 4)
TRANSFORM_COL(fdct8x8, 8, 8)
TRANSFORM_COL(fdct8x16, 8, 16)
TRANSFORM_COL(fdct8x32, 8, 32)
TRANSFORM_COL(fidentity4x4, 4, 4)
TRANSFORM_COL(fidentity4x8, 4, 8)
TRANSFORM_COL(fidentity4x16, 4, 16)
TRANSFORM_COL(fidentity8x4, 8, 4)
TRANSFORM_COL(fidentity8x8, 8, 8)
TRANSFORM_COL(fidentity8x16, 8, 16)
TRANSFORM_COL(fidentity8x32, 8, 32)

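// Generate the row-pass wrappers: run the named 1D transform, then widen
// and store the coefficients as int32. The _RECT variants use the
// rect-store helper, which is expected to fold in the NewSqrt2 scaling
// required for rectangular (non-square) transform blocks.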
#define TRANSFORM_ROW(name, tw, n)                                          \
  static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \
                              int stride, int cos_bit) {                    \
    int16x##tw##_t buf0[n];                                                 \
    name##_neon(input, buf0, cos_bit);                                      \
    store_buffer_s16_x##tw(buf0, output, stride, n);                        \
  }

#define TRANSFORM_ROW_RECT(name, tw, n)                                        \
  static void name##_row_rect_neon(const int16x##tw##_t *input,                \
                                   int32_t *output, int stride, int cos_bit) { \
    int16x##tw##_t buf0[n];                                                    \
    name##_neon(input, buf0, cos_bit);                                         \
    store_rect_buffer_s16_x##tw(buf0, output, stride, n);                      \
  }

TRANSFORM_ROW(fadst4x4, 4, 4)
TRANSFORM_ROW(fadst4x16, 4, 16)
TRANSFORM_ROW(fadst8x4, 8, 4)
TRANSFORM_ROW(fadst8x8, 8, 8)
TRANSFORM_ROW(fadst8x16, 8, 16)
TRANSFORM_ROW(fdct4x4, 4, 4)
TRANSFORM_ROW(fdct4x16, 4, 16)
TRANSFORM_ROW(fdct8x4, 8, 4)
TRANSFORM_ROW(fdct8x8, 8, 8)
TRANSFORM_ROW(fdct8x16, 8, 16)
TRANSFORM_ROW(fdct8x32, 8, 32)
TRANSFORM_ROW(fidentity4x4, 4, 4)
TRANSFORM_ROW(fidentity4x16, 4, 16)
TRANSFORM_ROW(fidentity8x4, 8, 4)
TRANSFORM_ROW(fidentity8x8, 8, 8)
TRANSFORM_ROW(fidentity8x16, 8, 16)
TRANSFORM_ROW(fidentity8x32, 8, 32)

TRANSFORM_ROW_RECT(fadst4x8, 4, 8)
TRANSFORM_ROW_RECT(fadst8x4, 8, 4)
TRANSFORM_ROW_RECT(fadst8x8, 8, 8)
TRANSFORM_ROW_RECT(fadst8x16, 8, 16)
TRANSFORM_ROW_RECT(fdct4x8, 4, 8)
TRANSFORM_ROW_RECT(fdct8x4, 8, 4)
TRANSFORM_ROW_RECT(fdct8x8, 8, 8)
TRANSFORM_ROW_RECT(fdct8x16, 8, 16)
TRANSFORM_ROW_RECT(fdct8x32, 8, 32)
TRANSFORM_ROW_RECT(fidentity4x8, 4, 8)
TRANSFORM_ROW_RECT(fidentity8x4, 8, 4)
TRANSFORM_ROW_RECT(fidentity8x8, 8, 8)
TRANSFORM_ROW_RECT(fidentity8x16, 8, 16)
TRANSFORM_ROW_RECT(fidentity8x32, 8, 32)

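// Function-pointer types for the 1D passes: column transforms read int16_t
// pixels from memory, row transforms write the final int32_t coefficients.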
typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input,
                                        int16x4_t *output, int cos_bit);
typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input,
                                        int16x8_t *output, int cos_bit);

typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input,
                                            int16x4_t *output, int stride,
                                            int cos_bit);
typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input,
                                            int16x8_t *output, int stride,
                                            int cos_bit);

typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input,
                                            int32_t *output, int stride,
                                            int cos_bit);
typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input,
                                            int32_t *output, int stride,
                                            int cos_bit);

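// TX_TYPE-indexed dispatch tables. The vertical component of the 2D type
// selects the column transform and the horizontal component selects the
// row transform; FLIPADST entries reuse the plain ADST kernels, with the
// actual flipping done by the 2D wrappers below.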
static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
  fdct4x8_col_neon,       // DCT_DCT
  fadst4x8_col_neon,      // ADST_DCT
  fdct4x8_col_neon,       // DCT_ADST
  fadst4x8_col_neon,      // ADST_ADST
  fadst4x8_col_neon,      // FLIPADST_DCT
  fdct4x8_col_neon,       // DCT_FLIPADST
  fadst4x8_col_neon,      // FLIPADST_FLIPADST
  fadst4x8_col_neon,      // ADST_FLIPADST
  fadst4x8_col_neon,      // FLIPADST_ADST
  fidentity4x8_col_neon,  // IDTX
  fdct4x8_col_neon,       // V_DCT
  fidentity4x8_col_neon,  // H_DCT
  fadst4x8_col_neon,      // V_ADST
  fidentity4x8_col_neon,  // H_ADST
  fadst4x8_col_neon,      // V_FLIPADST
  fidentity4x8_col_neon   // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_row_neon,       // DCT_DCT
  fdct8x4_row_neon,       // ADST_DCT
  fadst8x4_row_neon,      // DCT_ADST
  fadst8x4_row_neon,      // ADST_ADST
  fdct8x4_row_neon,       // FLIPADST_DCT
  fadst8x4_row_neon,      // DCT_FLIPADST
  fadst8x4_row_neon,      // FLIPADST_FLIPADST
  fadst8x4_row_neon,      // ADST_FLIPADST
  fadst8x4_row_neon,      // FLIPADST_ADST
  fidentity8x4_row_neon,  // IDTX
  fidentity8x4_row_neon,  // V_DCT
  fdct8x4_row_neon,       // H_DCT
  fidentity8x4_row_neon,  // V_ADST
  fadst8x4_row_neon,      // H_ADST
  fidentity8x4_row_neon,  // V_FLIPADST
  fadst8x4_row_neon       // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_row_rect_neon,       // DCT_DCT
  fdct8x4_row_rect_neon,       // ADST_DCT
  fadst8x4_row_rect_neon,      // DCT_ADST
  fadst8x4_row_rect_neon,      // ADST_ADST
  fdct8x4_row_rect_neon,       // FLIPADST_DCT
  fadst8x4_row_rect_neon,      // DCT_FLIPADST
  fadst8x4_row_rect_neon,      // FLIPADST_FLIPADST
  fadst8x4_row_rect_neon,      // ADST_FLIPADST
  fadst8x4_row_rect_neon,      // FLIPADST_ADST
  fidentity8x4_row_rect_neon,  // IDTX
  fidentity8x4_row_rect_neon,  // V_DCT
  fdct8x4_row_rect_neon,       // H_DCT
  fidentity8x4_row_rect_neon,  // V_ADST
  fadst8x4_row_rect_neon,      // H_ADST
  fidentity8x4_row_rect_neon,  // V_FLIPADST
  fadst8x4_row_rect_neon       // H_FLIPADST
};

static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_col_neon,       // DCT_DCT
  fadst8x4_col_neon,      // ADST_DCT
  fdct8x4_col_neon,       // DCT_ADST
  fadst8x4_col_neon,      // ADST_ADST
  fadst8x4_col_neon,      // FLIPADST_DCT
  fdct8x4_col_neon,       // DCT_FLIPADST
  fadst8x4_col_neon,      // FLIPADST_FLIPADST
  fadst8x4_col_neon,      // ADST_FLIPADST
  fadst8x4_col_neon,      // FLIPADST_ADST
  fidentity8x4_col_neon,  // IDTX
  fdct8x4_col_neon,       // V_DCT
  fidentity8x4_col_neon,  // H_DCT
  fadst8x4_col_neon,      // V_ADST
  fidentity8x4_col_neon,  // H_ADST
  fadst8x4_col_neon,      // V_FLIPADST
  fidentity8x4_col_neon   // H_FLIPADST
};

static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = {
  fdct4x8_row_rect_neon,       // DCT_DCT
  fdct4x8_row_rect_neon,       // ADST_DCT
  fadst4x8_row_rect_neon,      // DCT_ADST
  fadst4x8_row_rect_neon,      // ADST_ADST
  fdct4x8_row_rect_neon,       // FLIPADST_DCT
  fadst4x8_row_rect_neon,      // DCT_FLIPADST
  fadst4x8_row_rect_neon,      // FLIPADST_FLIPADST
  fadst4x8_row_rect_neon,      // ADST_FLIPADST
  fadst4x8_row_rect_neon,      // FLIPADST_ADST
  fidentity4x8_row_rect_neon,  // IDTX
  fidentity4x8_row_rect_neon,  // V_DCT
  fdct4x8_row_rect_neon,       // H_DCT
  fidentity4x8_row_rect_neon,  // V_ADST
  fadst4x8_row_rect_neon,      // H_ADST
  fidentity4x8_row_rect_neon,  // V_FLIPADST
  fadst4x8_row_rect_neon       // H_FLIPADST
};

static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_col_neon,       // DCT_DCT
  fadst8x8_col_neon,      // ADST_DCT
  fdct8x8_col_neon,       // DCT_ADST
  fadst8x8_col_neon,      // ADST_ADST
  fadst8x8_col_neon,      // FLIPADST_DCT
  fdct8x8_col_neon,       // DCT_FLIPADST
  fadst8x8_col_neon,      // FLIPADST_FLIPADST
  fadst8x8_col_neon,      // ADST_FLIPADST
  fadst8x8_col_neon,      // FLIPADST_ADST
  fidentity8x8_col_neon,  // IDTX
  fdct8x8_col_neon,       // V_DCT
  fidentity8x8_col_neon,  // H_DCT
  fadst8x8_col_neon,      // V_ADST
  fidentity8x8_col_neon,  // H_ADST
  fadst8x8_col_neon,      // V_FLIPADST
  fidentity8x8_col_neon   // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_row_neon,       // DCT_DCT
  fdct8x8_row_neon,       // ADST_DCT
  fadst8x8_row_neon,      // DCT_ADST
  fadst8x8_row_neon,      // ADST_ADST
  fdct8x8_row_neon,       // FLIPADST_DCT
  fadst8x8_row_neon,      // DCT_FLIPADST
  fadst8x8_row_neon,      // FLIPADST_FLIPADST
  fadst8x8_row_neon,      // ADST_FLIPADST
  fadst8x8_row_neon,      // FLIPADST_ADST
  fidentity8x8_row_neon,  // IDTX
  fidentity8x8_row_neon,  // V_DCT
  fdct8x8_row_neon,       // H_DCT
  fidentity8x8_row_neon,  // V_ADST
  fadst8x8_row_neon,      // H_ADST
  fidentity8x8_row_neon,  // V_FLIPADST
  fadst8x8_row_neon       // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_row_rect_neon,       // DCT_DCT
  fdct8x8_row_rect_neon,       // ADST_DCT
  fadst8x8_row_rect_neon,      // DCT_ADST
  fadst8x8_row_rect_neon,      // ADST_ADST
  fdct8x8_row_rect_neon,       // FLIPADST_DCT
  fadst8x8_row_rect_neon,      // DCT_FLIPADST
  fadst8x8_row_rect_neon,      // FLIPADST_FLIPADST
  fadst8x8_row_rect_neon,      // ADST_FLIPADST
  fadst8x8_row_rect_neon,      // FLIPADST_ADST
  fidentity8x8_row_rect_neon,  // IDTX
  fidentity8x8_row_rect_neon,  // V_DCT
  fdct8x8_row_rect_neon,       // H_DCT
  fidentity8x8_row_rect_neon,  // V_ADST
  fadst8x8_row_rect_neon,      // H_ADST
  fidentity8x8_row_rect_neon,  // V_FLIPADST
  fadst8x8_row_rect_neon       // H_FLIPADST
};

static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
  fdct4x16_col_neon,       // DCT_DCT
  fadst4x16_col_neon,      // ADST_DCT
  fdct4x16_col_neon,       // DCT_ADST
  fadst4x16_col_neon,      // ADST_ADST
  fadst4x16_col_neon,      // FLIPADST_DCT
  fdct4x16_col_neon,       // DCT_FLIPADST
  fadst4x16_col_neon,      // FLIPADST_FLIPADST
  fadst4x16_col_neon,      // ADST_FLIPADST
  fadst4x16_col_neon,      // FLIPADST_ADST
  fidentity4x16_col_neon,  // IDTX
  fdct4x16_col_neon,       // V_DCT
  fidentity4x16_col_neon,  // H_DCT
  fadst4x16_col_neon,      // V_ADST
  fidentity4x16_col_neon,  // H_ADST
  fadst4x16_col_neon,      // V_FLIPADST
  fidentity4x16_col_neon   // H_FLIPADST
};

static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
  fdct4x16_row_neon,       // DCT_DCT
  fdct4x16_row_neon,       // ADST_DCT
  fadst4x16_row_neon,      // DCT_ADST
  fadst4x16_row_neon,      // ADST_ADST
  fdct4x16_row_neon,       // FLIPADST_DCT
  fadst4x16_row_neon,      // DCT_FLIPADST
  fadst4x16_row_neon,      // FLIPADST_FLIPADST
  fadst4x16_row_neon,      // ADST_FLIPADST
  fadst4x16_row_neon,      // FLIPADST_ADST
  fidentity4x16_row_neon,  // IDTX
  fidentity4x16_row_neon,  // V_DCT
  fdct4x16_row_neon,       // H_DCT
  fidentity4x16_row_neon,  // V_ADST
  fadst4x16_row_neon,      // H_ADST
  fidentity4x16_row_neon,  // V_FLIPADST
  fadst4x16_row_neon       // H_FLIPADST
};

static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_col_neon,       // DCT_DCT
  fadst8x16_col_neon,      // ADST_DCT
  fdct8x16_col_neon,       // DCT_ADST
  fadst8x16_col_neon,      // ADST_ADST
  fadst8x16_col_neon,      // FLIPADST_DCT
  fdct8x16_col_neon,       // DCT_FLIPADST
  fadst8x16_col_neon,      // FLIPADST_FLIPADST
  fadst8x16_col_neon,      // ADST_FLIPADST
  fadst8x16_col_neon,      // FLIPADST_ADST
  fidentity8x16_col_neon,  // IDTX
  fdct8x16_col_neon,       // V_DCT
  fidentity8x16_col_neon,  // H_DCT
  fadst8x16_col_neon,      // V_ADST
  fidentity8x16_col_neon,  // H_ADST
  fadst8x16_col_neon,      // V_FLIPADST
  fidentity8x16_col_neon   // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_row_neon,       // DCT_DCT
  fdct8x16_row_neon,       // ADST_DCT
  fadst8x16_row_neon,      // DCT_ADST
  fadst8x16_row_neon,      // ADST_ADST
  fdct8x16_row_neon,       // FLIPADST_DCT
  fadst8x16_row_neon,      // DCT_FLIPADST
  fadst8x16_row_neon,      // FLIPADST_FLIPADST
  fadst8x16_row_neon,      // ADST_FLIPADST
  fadst8x16_row_neon,      // FLIPADST_ADST
  fidentity8x16_row_neon,  // IDTX
  fidentity8x16_row_neon,  // V_DCT
  fdct8x16_row_neon,       // H_DCT
  fidentity8x16_row_neon,  // V_ADST
  fadst8x16_row_neon,      // H_ADST
  fidentity8x16_row_neon,  // V_FLIPADST
  fadst8x16_row_neon       // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_row_rect_neon,       // DCT_DCT
  fdct8x16_row_rect_neon,       // ADST_DCT
  fadst8x16_row_rect_neon,      // DCT_ADST
  fadst8x16_row_rect_neon,      // ADST_ADST
  fdct8x16_row_rect_neon,       // FLIPADST_DCT
  fadst8x16_row_rect_neon,      // DCT_FLIPADST
  fadst8x16_row_rect_neon,      // FLIPADST_FLIPADST
  fadst8x16_row_rect_neon,      // ADST_FLIPADST
  fadst8x16_row_rect_neon,      // FLIPADST_ADST
  fidentity8x16_row_rect_neon,  // IDTX
  fidentity8x16_row_rect_neon,  // V_DCT
  fdct8x16_row_rect_neon,       // H_DCT
  fidentity8x16_row_rect_neon,  // V_ADST
  fadst8x16_row_rect_neon,      // H_ADST
  fidentity8x16_row_rect_neon,  // V_FLIPADST
  fadst8x16_row_rect_neon       // H_FLIPADST
};

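// AV1 defines no 32-point ADST, so every ADST/FLIPADST entry in the
// 32-point tables is NULL; 2D wrappers that can hit these entries check
// for NULL and fall back to the C implementation.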
static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
  fdct8x32_row_neon,       // DCT_DCT
  NULL,                    // ADST_DCT
  NULL,                    // DCT_ADST
  NULL,                    // ADST_ADST
  NULL,                    // FLIPADST_DCT
  NULL,                    // DCT_FLIPADST
  NULL,                    // FLIPADST_FLIPADST
  NULL,                    // ADST_FLIPADST
  NULL,                    // FLIPADST_ADST
  fidentity8x32_row_neon,  // IDTX
  fidentity8x32_row_neon,  // V_DCT
  fdct8x32_row_neon,       // H_DCT
  NULL,                    // V_ADST
  NULL,                    // H_ADST
  NULL,                    // V_FLIPADST
  NULL                     // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = {
  fdct8x32_row_rect_neon,       // DCT_DCT
  NULL,                         // ADST_DCT
  NULL,                         // DCT_ADST
  NULL,                         // ADST_ADST
  NULL,                         // FLIPADST_DCT
  NULL,                         // DCT_FLIPADST
  NULL,                         // FLIPADST_FLIPADST
  NULL,                         // ADST_FLIPADST
  NULL,                         // FLIPADST_ADST
  fidentity8x32_row_rect_neon,  // IDTX
  fidentity8x32_row_rect_neon,  // V_DCT
  fdct8x32_row_rect_neon,       // H_DCT
  NULL,                         // V_ADST
  NULL,                         // H_ADST
  NULL,                         // V_FLIPADST
  NULL                          // H_FLIPADST
};

static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
  fdct8x32_col_neon,       // DCT_DCT
  NULL,                    // ADST_DCT
  NULL,                    // DCT_ADST
  NULL,                    // ADST_ADST
  NULL,                    // FLIPADST_DCT
  NULL,                    // DCT_FLIPADST
  NULL,                    // FLIPADST_FLIPADST
  NULL,                    // ADST_FLIPADST
  NULL,                    // FLIPADST_ADST
  fidentity8x32_col_neon,  // IDTX
  fdct8x32_col_neon,       // V_DCT
  fidentity8x32_col_neon,  // H_DCT
  NULL,                    // V_ADST
  NULL,                    // H_ADST
  NULL,                    // V_FLIPADST
  NULL                     // H_FLIPADST
};

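// 2D low-bitdepth forward transform: column pass (including the stage-0
// up-shift), transpose, then row pass writing int32 coefficients. The 4x4
// case is fully unrolled per TX_TYPE; an up-down flip is applied by
// adjusting the input pointer and stride before the column pass, and a
// left-right flip by reversing the transposed buffer before the row pass.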
static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);

  int16x4_t buf0[4], buf1[4];
  switch (tx_type) {
    case DCT_DCT:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case ADST_DCT:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case DCT_ADST:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case ADST_ADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case FLIPADST_DCT:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case DCT_FLIPADST:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
    case FLIPADST_FLIPADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
    case ADST_FLIPADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
    case FLIPADST_ADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case IDTX:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case V_DCT:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case H_DCT:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case V_ADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case H_ADST:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case V_FLIPADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case H_FLIPADST:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
  }
}

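// The remaining sizes are table-driven. A rounding right shift between the
// two passes keeps the intermediates within int16 range, and the
// hard-coded cos_bit arguments (13 or 12) mirror the
// av1_fwd_cos_bit_col/row tables for each block size.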
static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x4_t buf0[8];
  int16x8_t buf1[8];
  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
  col_txfm(input, buf0, stride, 13);
  shift_right_1_round_s16_x4(buf0, buf0, 8);
  transpose_arrays_s16_4x8(buf0, buf1);

  if (lr_flip) {
    int16x8_t buf2[8];
    flip_buf_8_neon(buf1, buf2, 4);
    row_txfm(buf2, output, 8, 13);
  } else {
    row_txfm(buf1, output, 8, 13);
  }
}

static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x4_t buf0[16];
  int16x8_t buf1[16];
  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
  col_txfm(input, buf0, stride, 13);
  shift_right_1_round_s16_x4(buf0, buf0, 16);
  transpose_arrays_s16_4x8(buf0, buf1);
  transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8);

  for (int i = 0; i < 2; i++) {
    if (lr_flip) {
      int16x8_t buf2[16];
      flip_buf_8_neon(buf1 + 8 * i, buf2, 4);
      row_txfm(buf2, output + 8 * i, 16, 12);
    } else {
      int16x8_t *buf = buf1 + 8 * i;
      row_txfm(buf, output + 8 * i, 16, 12);
    }
  }
}

static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[8];
  int16x4_t buf1[8];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
  const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
  col_txfm(input, buf0, stride, 13);
  shift_right_1_round_s16_x8(buf0, buf0, 4);
  transpose_arrays_s16_8x4(buf0, buf1);

  if (lr_flip) {
    int16x4_t buf2[8];
    flip_buf_4_neon(buf1, buf2, 8);
    row_txfm(buf2, output, 4, 13);
  } else {
    row_txfm(buf1, output, 4, 13);
  }
}

static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  int16x8_t buf0[8], buf1[8];

  switch (tx_type) {
    case DCT_DCT:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case ADST_DCT:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case DCT_ADST:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case ADST_ADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case FLIPADST_DCT:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case DCT_FLIPADST:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
    case FLIPADST_FLIPADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
    case ADST_FLIPADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
    case FLIPADST_ADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case IDTX:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case V_DCT:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case H_DCT:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case V_ADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case H_ADST:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case V_FLIPADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case H_FLIPADST:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
  }
}

static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
  col_txfm(input, buf0, stride, 13);
  shift_right_2_round_s16_x8(buf0, buf0, 16);
  transpose_arrays_s16_8x8(buf0, buf1);
  transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);

  for (int i = 0; i < 2; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
      row_txfm(buf0, output + 8 * i, 16, 13);
    } else {
      int16x8_t *buf = buf1 + 8 * i;
      row_txfm(buf, output + 8 * i, 16, 13);
    }
  }
}

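// Blocks with a dimension above 8 are handled in 8x8 tiles: each 8-column
// strip is column-transformed, the tiles are transposed into row order,
// then each 8-row strip gets the row pass.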
static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[32];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
  col_txfm(input, buf0, stride, 12);
  shift_right_2_round_s16_x8(buf0, buf0, 32);
  transpose_arrays_s16_8x8(buf0, buf1);
  transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
  transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16);
  transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24);

  for (int i = 0; i < 4; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
      row_txfm(buf0, output + 8 * i, 32, 12);
    } else {
      int16x8_t *buf = buf1 + 8 * i;
      row_txfm(buf, output + 8 * i, 32, 12);
    }
  }
}

static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16];
  int16x4_t buf1[16];
  int16x4_t buf2[16];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
  const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
  for (int i = 0; i < 2; i++) {
    col_txfm(input + 8 * i, buf0, stride, 13);
    shift_right_1_round_s16_x8(buf0, buf0, 4);
    transpose_arrays_s16_8x4(buf0, buf1 + 8 * i);
  }

  if (lr_flip) {
    flip_buf_4_neon(buf1, buf2, 16);
    row_txfm(buf2, output, 4, 13);
  } else {
    row_txfm(buf1, output, 4, 13);
  }
}

static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
  for (int i = 0; i < 2; i++) {
    col_txfm(input + 8 * i, buf0, stride, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 8);
    transpose_arrays_s16_8x8(buf0, buf1 + 8 * i);
  }

  if (lr_flip) {
    flip_buf_8_neon(buf1, buf0, 16);
    row_txfm(buf0, output, 8, 13);
  } else {
    row_txfm(buf1, output, 8, 13);
  }
}

static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[32];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
  for (int i = 0; i < 2; i++) {
    col_txfm(input + 8 * i, buf0, stride, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 16);
    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i);
  }

  for (int i = 0; i < 2; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
      row_txfm(buf0, output + 8 * i, 16, 12);
    } else {
      int16x8_t *buf = buf1 + 16 * i;
      row_txfm(buf, output + 8 * i, 16, 12);
    }
  }
}

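// Sizes with a 32-point dimension can select NULL table entries (no
// 32-point ADST), so bail out to the generic C transform when either pass
// is missing.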
static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[64];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];

  if (col_txfm == NULL || row_txfm == NULL) {
    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
    return;
  }

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
  for (int i = 0; i < 2; i++) {
    col_txfm(input + 8 * i, buf0, stride, 12);
    shift_right_4_round_s16_x8(buf0, buf0, 32);
    transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 16 + 8 * i);
  }

  for (int i = 0; i < 4; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
      row_txfm(buf0, output + 8 * i, 32, 13);
    } else {
      int16x8_t *buf = buf1 + 16 * i;
      row_txfm(buf, output + 8 * i, 32, 13);
    }
  }
}

static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[32];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];

  if (col_txfm == NULL || row_txfm == NULL) {
    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
    return;
  }

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
  for (int i = 0; i < 4; i++) {
    col_txfm(input + 8 * i, buf0, stride, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 8);
    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
  }

  if (lr_flip) {
    flip_buf_8_neon(buf1, buf0, 32);
    row_txfm(buf0, output, 8, 12);
  } else {
    row_txfm(buf1, output, 8, 12);
  }
}
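
// Block shapes with a 2:1 aspect ratio (16x32, 32x16, 32x64, 64x32) need an
// extra scaling by sqrt(2) to keep the transform unit-normalized; the
// row_rect_ transform variants are assumed to fold that NewSqrt2
// (5793 / 4096 ~= sqrt(2)) scaling into the row pass.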
static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[64];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];

  if (col_txfm == NULL || row_txfm == NULL) {
    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
    return;
  }

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
  for (int i = 0; i < 4; i++) {
    col_txfm(input + 8 * i, buf0, stride, 13);
    shift_right_4_round_s16_x8(buf0, buf0, 16);
    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i);
  }

  for (int i = 0; i < 2; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
      row_txfm(buf0, output + 8 * i, 16, 13);
    } else {
      int16x8_t *buf = buf1 + 32 * i;
      row_txfm(buf, output + 8 * i, 16, 13);
    }
  }
}
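
// buf1 holds the whole 32x32 intermediate block: 128 int16x8_t vectors give
// 128 * 8 = 1024 int16 values, one per coefficient.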
static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[128];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];

  if (col_txfm == NULL || row_txfm == NULL) {
    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
    return;
  }

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
  for (int i = 0; i < 4; i++) {
    col_txfm(input + 8 * i, buf0, stride, 12);
    shift_right_4_round_s16_x8(buf0, buf0, 32);
    transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i);
  }

  for (int i = 0; i < 4; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
      row_txfm(buf0, output + 8 * i, 32, 12);
    } else {
      int16x8_t *buf = buf1 + 32 * i;
      row_txfm(buf, output + 8 * i, 32, 12);
    }
  }
}
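
// For any dimension of length 64, AV1 keeps only the lowest 32 frequency
// coefficients; the upper half is defined to be zero. The 64-point functions
// below therefore store a 32-wide result and zero-fill the remainder.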
static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  int16x8_t buf0[64], buf1[128];
  const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon;
  const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon;

  for (int i = 0; i < 8; i++) {
    load_buffer_s16_x8(input + 8 * i, stride, buf0, 16);
    shift_left_2_s16_x8(buf0, buf0, 16);
    col_txfm(buf0, buf0, 13);
    shift_right_4_round_s16_x8(buf0, buf0, 16);
    for (int j = 0; j < 2; ++j) {
      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
    }
  }

  for (int i = 0; i < 2; i++) {
    int16x8_t *buf = buf1 + 64 * i;
    row_txfm(buf, buf, 12);
    store_buffer_s16_x8(buf, output + 8 * i, 16, 32);
  }
  // Zero out the bottom 16x32 area.
  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
}

static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  int16x8_t buf0[64], buf1[128];
  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
  const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon;

  for (int i = 0; i < 2; i++) {
    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
    col_txfm(buf0, buf0, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 64);
    for (int j = 0; j < 8; ++j) {
      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i);
    }
  }

  for (int i = 0; i < 4; i++) {
    int16x8_t *buf = buf1 + 16 * i;
    row_txfm(buf, buf, 12);
    store_buffer_s16_x8(buf, output + 8 * i, 32, 16);
  }
}
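
// fdct32_neon computes a 32-point forward DCT on four columns at a time, as a
// nine-stage butterfly network over int32x4_t lanes. The butterflies produce
// results in bit-reversed index order, so the final stage permutes them back
// to natural frequency order: output[k] = buf0[bitrev5(k)], e.g.
// output[1] = buf0[16] (00001b -> 10000b) and output[3] = buf0[24]
// (00011b -> 11000b).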
static void fdct32_neon(const int32x4_t *input, int32x4_t *output,
                        int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  int32x4_t buf0[32];
  int32x4_t buf1[32];

  // stage 1
  butterfly_dct_pre_s32_x4(input, buf1, 32);

  // stage 2
  butterfly_dct_pre_s32_x4(buf1, buf0, 16);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  buf0[18] = buf1[18];
  buf0[19] = buf1[19];
  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27],
                                 &buf0[20]);
  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26],
                                 &buf0[21]);
  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25],
                                 &buf0[22]);
  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24],
                                 &buf0[23]);
  buf0[28] = buf1[28];
  buf0[29] = buf1[29];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 3
  butterfly_dct_pre_s32_x4(buf0, buf1, 8);
  buf1[8] = buf0[8];
  buf1[9] = buf0[9];
  butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13],
                                 &buf1[10]);
  butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12],
                                 &buf1[11]);
  buf1[14] = buf0[14];
  buf1[15] = buf0[15];
  butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16);

  // stage 4
  butterfly_dct_pre_s32_x4(buf1, buf0, 4);
  buf0[4] = buf1[4];
  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]);
  buf0[7] = buf1[7];
  butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29],
                                 &buf0[18]);
  butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28],
                                 &buf0[19]);
  butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27],
                                 &buf0[20]);
  butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26],
                                 &buf0[21]);
  buf0[22] = buf1[22];
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[25] = buf1[25];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 5
  butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]);
  butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]);
  butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4);
  buf1[8] = buf0[8];
  butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14],
                                 &buf1[9]);
  butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13],
                                 &buf1[10]);
  buf1[11] = buf0[11];
  buf1[12] = buf0[12];
  buf1[15] = buf0[15];
  butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8);
  butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8);

  // stage 6
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]);
  butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]);
  butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4);
  butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4);
  buf0[16] = buf1[16];
  butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30],
                                 &buf0[17]);
  butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29],
                                 &buf0[18]);
  buf0[19] = buf1[19];
  buf0[20] = buf1[20];
  butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26],
                                 &buf0[21]);
  butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25],
                                 &buf0[22]);
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[27] = buf1[27];
  buf0[28] = buf1[28];
  buf0[31] = buf1[31];

  // stage 7
  buf1[0] = buf0[0];
  buf1[1] = buf0[1];
  buf1[2] = buf0[2];
  buf1[3] = buf0[3];
  buf1[4] = buf0[4];
  buf1[5] = buf0[5];
  buf1[6] = buf0[6];
  buf1[7] = buf0[7];
  butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8],
                                 &buf1[15]);
  butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9],
                                 &buf1[14]);
  butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10],
                                 &buf1[13]);
  butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11],
                                 &buf1[12]);
  butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4);
  butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4);
  butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4);
  butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 28, 4);

  // stage 8
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  buf0[4] = buf1[4];
  buf0[5] = buf1[5];
  buf0[6] = buf1[6];
  buf0[7] = buf1[7];
  buf0[8] = buf1[8];
  buf0[9] = buf1[9];
  buf0[10] = buf1[10];
  buf0[11] = buf1[11];
  buf0[12] = buf1[12];
  buf0[13] = buf1[13];
  buf0[14] = buf1[14];
  buf0[15] = buf1[15];
  butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16],
                                 &buf0[31]);
  butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17],
                                 &buf0[30]);
  butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18],
                                 &buf0[29]);
  butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19],
                                 &buf0[28]);
  butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20],
                                 &buf0[27]);
  butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21],
                                 &buf0[26]);
  butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22],
                                 &buf0[25]);
  butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23],
                                 &buf0[24]);

  // stage 9
  output[0] = buf0[0];
  output[1] = buf0[16];
  output[2] = buf0[8];
  output[3] = buf0[24];
  output[4] = buf0[4];
  output[5] = buf0[20];
  output[6] = buf0[12];
  output[7] = buf0[28];
  output[8] = buf0[2];
  output[9] = buf0[18];
  output[10] = buf0[10];
  output[11] = buf0[26];
  output[12] = buf0[6];
  output[13] = buf0[22];
  output[14] = buf0[14];
  output[15] = buf0[30];
  output[16] = buf0[1];
  output[17] = buf0[17];
  output[18] = buf0[9];
  output[19] = buf0[25];
  output[20] = buf0[5];
  output[21] = buf0[21];
  output[22] = buf0[13];
  output[23] = buf0[29];
  output[24] = buf0[3];
  output[25] = buf0[19];
  output[26] = buf0[11];
  output[27] = buf0[27];
  output[28] = buf0[7];
  output[29] = buf0[23];
  output[30] = buf0[15];
  output[31] = buf0[31];
}
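
// fdct64_neon follows the same butterfly scheme as fdct32_neon, extended to
// eleven stages. Unlike fdct32_neon, values that pass through a stage
// unchanged are not copied forward; each output is read directly from the
// stage array in which it was last written (x6, x7, x8, x9 or x10). Only the
// 32 low-frequency outputs are stored, per the 64-point coefficient zeroing
// noted above.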
static void fdct64_neon(const int32x4_t *input, int32x4_t *output,
                        int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
  const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
  const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
  const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
  const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
  const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
  const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
  const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
  const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
  const int16x4_t cospi1 = vget_low_s16(cospi1_3);
  const int16x4_t cospi3 = vget_high_s16(cospi1_3);
  const int16x4_t cospi5 = vget_low_s16(cospi5_7);
  const int16x4_t cospi7 = vget_high_s16(cospi5_7);
  const int16x4_t cospi9 = vget_low_s16(cospi9_11);
  const int16x4_t cospi11 = vget_high_s16(cospi9_11);
  const int16x4_t cospi13 = vget_low_s16(cospi13_15);
  const int16x4_t cospi15 = vget_high_s16(cospi13_15);
  const int16x4_t cospi17 = vget_low_s16(cospi17_19);
  const int16x4_t cospi19 = vget_high_s16(cospi17_19);
  const int16x4_t cospi21 = vget_low_s16(cospi21_23);
  const int16x4_t cospi23 = vget_high_s16(cospi21_23);
  const int16x4_t cospi25 = vget_low_s16(cospi25_27);
  const int16x4_t cospi27 = vget_high_s16(cospi25_27);
  const int16x4_t cospi29 = vget_low_s16(cospi29_31);
  const int16x4_t cospi31 = vget_high_s16(cospi29_31);

  // stage 1
  int32x4_t x1[64];
  butterfly_dct_pre_s32_x4(input, x1, 64);

  // stage 2
  int32x4_t x2[64];
  butterfly_dct_pre_s32_x4(x1, x2, 32);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);

  // stage 3
  int32x4_t x3[64];
  butterfly_dct_pre_s32_x4(x2, x3, 16);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
  butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32);

  // stage 4
  int32x4_t x4[64];
  butterfly_dct_pre_s32_x4(x3, x4, 8);
  butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
  butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);

  // stage 5
  int32x4_t x5[64];
  butterfly_dct_pre_s32_x4(x4, x5, 4);
  butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
  butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8);
  butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
  butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16);
  butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16);

  // stage 6
  int32x4_t x6[64];
  butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
  butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4);
  butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
  butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8);
  butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8);
  butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
  butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
  butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
  butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
  butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
  butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);

  // stage 7
  int32x4_t x7[64];
  butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
  butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4);
  butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4);
  butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
  butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
  butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
  butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8);
  butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8);
  butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8);
  butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8);

  // stage 8
  int32x4_t x8[64];
  butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
  butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
  butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
  butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
  butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4);
  butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4);
  butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4);
  butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4);
  butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
  butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
  butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
  butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
  butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
  butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
  butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
  butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);

  // stage 9
  int32x4_t x9[64];
  butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
  butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
  butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
  butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
  butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
  butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
  butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
  butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
  butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4);
  butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4);
  butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4);
  butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4);
  butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4);
  butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4);
  butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4);
  butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4);

  // stage 10
  int32x4_t x10[64];
  butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]);
  butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]);
  butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]);
  butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]);
  butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]);
  butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]);
  butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]);
  butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]);
  butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]);
  butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]);
  butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]);
  butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]);
  butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]);
  butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]);
  butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]);
  butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]);

  // stage 11, only store into the low 32 output indices.
  output[0] = x6[0];
  output[1] = x10[32];
  output[2] = x9[16];
  output[3] = x10[48];
  output[4] = x8[8];
  output[5] = x10[40];
  output[6] = x9[24];
  output[7] = x10[56];
  output[8] = x7[4];
  output[9] = x10[36];
  output[10] = x9[20];
  output[11] = x10[52];
  output[12] = x8[12];
  output[13] = x10[44];
  output[14] = x9[28];
  output[15] = x10[60];
  output[16] = x6[2];
  output[17] = x10[34];
  output[18] = x9[18];
  output[19] = x10[50];
  output[20] = x8[10];
  output[21] = x10[42];
  output[22] = x9[26];
  output[23] = x10[58];
  output[24] = x7[6];
  output[25] = x10[38];
  output[26] = x9[22];
  output[27] = x10[54];
  output[28] = x8[14];
  output[29] = x10[46];
  output[30] = x9[30];
  output[31] = x10[62];
}
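
// The 64-point row pass runs in 32-bit precision: each int16x8_t row vector
// is split into two int32x4_t halves (vmovl_s16 of the low and high four
// lanes), and the two halves are transformed independently before being
// re-interleaved on store.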
static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  int16x8_t buf0[64], buf1[512];
  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;

  for (int i = 0; i < 8; i++) {
    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
    col_txfm(buf0, buf0, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 64);
    for (int j = 0; j < 4; ++j) {
      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
    }
  }
  for (int i = 0; i < 4; i++) {
    int32x4_t bufA[64];
    int32x4_t bufB[64];
    int16x8_t *buf = buf1 + 64 * i;
    for (int j = 0; j < 64; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    fdct64_neon(bufA, bufA, 10);
    fdct64_neon(bufB, bufB, 10);
    shift_right_2_round_s32_x4(bufA, bufA, 32);
    shift_right_2_round_s32_x4(bufB, bufB, 32);
    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
  }
}
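
// After the row pass, round_shift_sqrt2_s32_s32_4xn_neon applies the
// rectangular-block scaling discussed above, presumably as
// (x * 5793 + (1 << 11)) >> 12, i.e. a rounded multiply by sqrt(2).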
static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  assert(tx_type == DCT_DCT);
  int16x8_t buf0[64], buf1[256];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];

  for (int i = 0; i < 8; i++) {
    col_txfm(input + 8 * i, buf0, stride, 12);
    shift_right_4_round_s16_x8(buf0, buf0, 32);
    for (int j = 0; j < 4; ++j) {
      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
    }
  }
  for (int i = 0; i < 4; i++) {
    int32x4_t bufA[64];
    int32x4_t bufB[64];
    int16x8_t *buf = buf1 + 64 * i;
    for (int j = 0; j < 64; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    fdct64_neon(bufA, bufA, 11);
    fdct64_neon(bufB, bufB, 11);
    shift_right_2_round_s32_x4(bufA, bufA, 32);
    shift_right_2_round_s32_x4(bufB, bufB, 32);
    round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
    round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
  }
}

static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  int16x8_t buf0[64], buf1[256];
  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;

  for (int i = 0; i < 4; i++) {
    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
    col_txfm(buf0, buf0, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 64);
    for (int j = 0; j < 4; ++j) {
      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i);
    }
  }

  for (int i = 0; i < 4; i++) {
    int32x4_t bufA[32];
    int32x4_t bufB[32];
    int16x8_t *buf = buf1 + 32 * i;
    for (int j = 0; j < 32; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    fdct32_neon(bufA, bufA, 11);
    fdct32_neon(bufB, bufB, 11);
    shift_right_2_round_s32_x4(bufA, bufA, 32);
    shift_right_2_round_s32_x4(bufB, bufB, 32);
    round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
    round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
  }
}
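
// Indexed by TX_SIZE; the entry order must match the TX_SIZES_ALL enum.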
static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
  lowbd_fwd_txfm2d_4x4_neon,    // 4x4 transform
  lowbd_fwd_txfm2d_8x8_neon,    // 8x8 transform
  lowbd_fwd_txfm2d_16x16_neon,  // 16x16 transform
  lowbd_fwd_txfm2d_32x32_neon,  // 32x32 transform
  lowbd_fwd_txfm2d_64x64_neon,  // 64x64 transform
  lowbd_fwd_txfm2d_4x8_neon,    // 4x8 transform
  lowbd_fwd_txfm2d_8x4_neon,    // 8x4 transform
  lowbd_fwd_txfm2d_8x16_neon,   // 8x16 transform
  lowbd_fwd_txfm2d_16x8_neon,   // 16x8 transform
  lowbd_fwd_txfm2d_16x32_neon,  // 16x32 transform
  lowbd_fwd_txfm2d_32x16_neon,  // 32x16 transform
  lowbd_fwd_txfm2d_32x64_neon,  // 32x64 transform
  lowbd_fwd_txfm2d_64x32_neon,  // 64x32 transform
  lowbd_fwd_txfm2d_4x16_neon,   // 4x16 transform
  lowbd_fwd_txfm2d_16x4_neon,   // 16x4 transform
  lowbd_fwd_txfm2d_8x32_neon,   // 8x32 transform
  lowbd_fwd_txfm2d_32x8_neon,   // 32x8 transform
  lowbd_fwd_txfm2d_16x64_neon,  // 16x64 transform
  lowbd_fwd_txfm2d_64x16_neon,  // 64x16 transform
};
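
// Entry point for the low-bitdepth forward transform. Lossless 4x4 blocks use
// the Walsh-Hadamard transform, which has no Neon path in this file, so they
// are routed to the generic C implementation.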
void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
                             int diff_stride, TxfmParam *txfm_param) {
  FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size];
  if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
  } else {
    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
                    txfm_param->bd);
  }
}