/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "shift_neon.h"
#include "txfm_neon.h"

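// The Q2.13 weight tables used throughout this file (cospi_arr_q13 /
// sinpi_arr_q13) carry 13 fractional bits, so every weighted sum below is
// rounded back down by this shift.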
#define TXFM_COS_BIT_MAX 13

// A note on butterfly helper naming:
//
// butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_nums]_neon
// e.g. butterfly_s32_s32_x4_0231_neon
//                |   |   |  ^ Weights are applied as indices 0, 2, 3, 1
//                |   |   |    (see more detail below)
//                |   |   ^ (int32)x4 input/output parameters
//                |   ^ 32-bit accumulators internally
//                ^ 32-bit input/output parameters
//
// Weights are stored as 4-tuples in Q2.13 format as (w0, w1, -w0, -w1) to
// avoid needing separate negation instructions. This is represented in the
// helper naming by referring to the lane index in the loaded tuple that each
// multiply is performed with:
//
//        in0   in1
//      /----------
// out0 |  w0    w1   ==>  out0 = in0 * w0 + in1 * w1
// out1 |  w2    w3   ==>  out1 = in0 * w2 + in1 * w3
//
// So for indices 0231 from the earlier example, we end up with:
//
//          in0       in1
//      /------------------
// out0 | (lane 0) (lane 2)   ==>  out0 = in0 *  w0 + in1 * -w0
// out1 | (lane 3) (lane 1)   ==>  out1 = in0 * -w1 + in1 *  w1
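//
// For example, the 0112 helpers below compute (a scalar sketch of the vector
// code, with the Q2.13 rounding shift written out):
//
//   out0 = round_shift(in0 * w0 + in1 * w1, TXFM_COS_BIT_MAX)  // lanes 0, 1
//   out1 = round_shift(in0 * w1 - in1 * w0, TXFM_COS_BIT_MAX)  // lanes 1, 2
//
// where the subtraction comes for free as a multiply-accumulate, since lane 2
// of the weight tuple already holds -w0.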

static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
                                  out0, out1)                                 \
  do {                                                                        \
    int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0);                          \
    u0 = vmlal_lane_s16(u0, in1, wvec, lane1);                                \
    int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2);                          \
    v0 = vmlal_lane_s16(v0, in1, wvec, lane3);                                \
    *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX);                              \
    *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX);                              \
  } while (0)

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
}

#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
                                  out0, out1)                                 \
  do {                                                                        \
    int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0);            \
    u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1);                  \
    int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0);           \
    u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1);                 \
    int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2);            \
    v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3);                  \
    int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2);           \
    v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3);                 \
    const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX);                  \
    const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX);                  \
    const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX);                  \
    const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX);                  \
    *out0 = vcombine_s16(c0, c1);                                             \
    *out1 = vcombine_s16(d0, d1);                                             \
  } while (0)

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out,
                                             int size) {
  for (int i = 0; i < size; ++i) {
    out[size - i - 1] = in[i];
  }
}

static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out,
                                             int size) {
  for (int i = 0; i < size; ++i) {
    out[size - i - 1] = in[i];
  }
}

static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8(
    int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2,
    const int stride, const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + stride * i, in1[i]);
    vst1q_s32(out + stride * i + 4, in2[i]);
  }
}

static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in,
                                                const int stride,
                                                int16x4_t *const out,
                                                const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = vld1_s16(in);
    in += stride;
  }
}

static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
                                                int16x8_t *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = vld1q_s16(in + i * stride);
  }
}

static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in,
                                                 int32_t *const out,
                                                 const int stride,
                                                 const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + i * stride, vmovl_s16(in[i]));
  }
}

static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in,
                                                 int32_t *const out,
                                                 const int stride,
                                                 const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i])));
    vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i])));
  }
}

// A note on naming:
//   round_shift_[sqrt2]_s16_s32_4x1_neon(...)
//               |       |   |     ^ 1 => a single vector
//               |       |   |       n => an array of vectors
//               |       |   ^ input/output vector element count
//               |       ^ output type
//               |       ^ input type
//               ^ multiplicand and shift identifier
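//
// For example, round_shift_sqrt2_s16_s16_4x1_neon below computes, per lane,
// (a * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits, then saturates
// to 16 bits. Since NewSqrt2 is a fixed-point approximation of sqrt(2) with
// NewSqrt2Bits fractional bits, this is the sqrt(2) scaling applied when
// storing coefficients for rectangular (non-square) transform sizes.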

static AOM_FORCE_INLINE int16x4_t
round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) {
  return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
}

static AOM_FORCE_INLINE int16x8_t
round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) {
  return vcombine_s16(round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
                      round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
}

static AOM_FORCE_INLINE int16x4_t
round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) {
  return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits);
}

static AOM_FORCE_INLINE int16x8_t
round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) {
  return vcombine_s16(round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
                      round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
}

static AOM_FORCE_INLINE int32x4_t
round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) {
  return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
}

static AOM_FORCE_INLINE int32x4_t
round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) {
  return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits);
}

#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn)                  \
  static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) {  \
    for (int i = 0; i < size; ++i) {                                          \
      out[i] = fn(in[i]);                                                     \
    }                                                                         \
  }

ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t,
                             int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t,
                             int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t,
                             int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t,
                             int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
                             int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)

static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
                                                      int32_t *const out,
                                                      const int stride,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i]));
  }
}

static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
                                                      int32_t *const out,
                                                      const int stride,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + i * stride + 0,
              round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i])));
    vst1q_s32(out + i * stride + 4,
              round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i])));
  }
}

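// fadst4x4_neon computes the forward ADST4 directly from the sinpi weights
// (sinpi[1..4] in lanes 0..3 of 'sinpi'). In scalar form the outputs are:
//
//   out0 = s1 * x0 + s2 * x1 + s3 * x2 + s4 * x3
//   out1 = s3 * (x0 + x1 - x3)
//   out2 = s4 * x0 - s1 * x1 - s3 * x2 + s2 * x3
//   out3 = out2 - out0 + 3 * s3 * x2
//
// where the last row reduces to s2*x0 - s4*x1 + s3*x2 - s1*x3 via the
// identity sinpi[1] + sinpi[2] = sinpi[4].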
static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  int32x4_t u[6], v[6];
  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
  const int16x4_t u01 = vqadd_s16(input[0], input[1]);

  v[5] = vmull_lane_s16(input[2], sinpi, 2);
  v[0] = vmull_lane_s16(input[1], sinpi, 1);
  v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0);
  v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3);
  v[2] = vmull_lane_s16(u01, sinpi, 2);
  v[3] = vmull_lane_s16(input[0], sinpi, 3);
  v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0);
  v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1);

  u[0] = vaddq_s32(v[0], v[1]);
  u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2);
  u[2] = vsubq_s32(v[3], v[4]);
  u[3] = vsubq_s32(u[2], u[0]);
  u[3] = vmlaq_n_s32(u[3], v[5], 3);

  output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
  output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
  output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
  output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
}

static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 1-2
  int16x4_t x2[8];
  butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
  butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);

  // stage 3
  int16x4_t x3[8];
  x3[0] = vqadd_s16(input[0], x2[2]);
  x3[1] = vqsub_s16(x2[3], input[7]);
  x3[2] = vqsub_s16(input[0], x2[2]);
  x3[3] = vqadd_s16(input[7], x2[3]);
  x3[4] = vqsub_s16(x2[6], input[1]);
  x3[5] = vqadd_s16(input[6], x2[7]);
  x3[6] = vqadd_s16(input[1], x2[6]);
  x3[7] = vqsub_s16(input[6], x2[7]);

  // stage 4
  int16x4_t x4[8];
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]);

  // stage 5
  int16x4_t x5[8];
  x5[0] = vqadd_s16(x3[0], x4[4]);
  x5[1] = vqadd_s16(x3[1], x4[5]);
  x5[2] = vqadd_s16(x3[2], x4[6]);
  x5[3] = vqsub_s16(x4[7], x3[3]);
  x5[4] = vqsub_s16(x3[0], x4[4]);
  x5[5] = vqsub_s16(x3[1], x4[5]);
  x5[6] = vqsub_s16(x3[2], x4[6]);
  x5[7] = vqadd_s16(x3[3], x4[7]);

  // stage 6-7
  butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
  butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
  butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
  butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
}

static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  int32x4_t u_lo[4], u_hi[4];
  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
  const int16x8_t u01 = vqaddq_s16(input[0], input[1]);

  u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1);
  u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2);

  u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2);
  u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2);

  u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3);
  u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3);

  u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0);
  u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0);

  u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1);
  u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1);

  u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2);
  u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2);

  u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2);
  u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2);

  u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
  u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);

  const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3);
  u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2);
  u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2);

  output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX));
  output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX));
  output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX));
  output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX));
}

static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
                                          int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);
  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]);

  int16x4_t in12a = vadd_s16(input[1], input[2]);
  int16x4_t in12s = vsub_s16(input[1], input[2]);
  int16x4_t in03a = vadd_s16(input[0], input[3]);
  int16x4_t in03s = vsub_s16(input[0], input[3]);

  int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]);
  int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]);

  int32x4_t u[4];
  u[0] = vaddq_s32(u0ad1, u0ad2);
  u[1] = vsubq_s32(u0ad2, u0ad1);
  u[2] = vmull_lane_s16(in12s, cospi16, 1);
  u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0);
  u[3] = vmull_lane_s16(in03s, cospi16, 1);
  u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0);

  output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
  output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
  output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
  output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
}

// Butterfly pre-processing:
// e.g. n=4:
//   out[0] = in[0] + in[3]
//   out[1] = in[1] + in[2]
//   out[2] = in[1] - in[2]
//   out[3] = in[0] - in[3]
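//
// In scalar form, for 0 <= i < n/2:
//   out[i]       = in[i] + in[n - 1 - i]
//   out[n/2 + i] = in[n/2 - 1 - i] - in[n/2 + i]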

static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input,
                                                      int16x4_t *output,
                                                      int n) {
  for (int i = 0; i < n / 2; ++i) {
    output[i] = vqadd_s16(input[i], input[n - i - 1]);
  }
  for (int i = 0; i < n / 2; ++i) {
    output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]);
  }
}

static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input,
                                                      int16x8_t *output,
                                                      int n) {
  for (int i = 0; i < n / 2; ++i) {
    output[i] = vqaddq_s16(input[i], input[n - i - 1]);
  }
  for (int i = 0; i < n / 2; ++i) {
    output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]);
  }
}

static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input,
                                                      int32x4_t *output,
                                                      int n) {
  for (int i = 0; i < n / 2; ++i) {
    output[i] = vqaddq_s32(input[i], input[n - i - 1]);
  }
  for (int i = 0; i < n / 2; ++i) {
    output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
  }
}

// Butterfly post-processing:
// e.g. n=8:
//   out[0] = in0[0] + in1[3];
//   out[1] = in0[1] + in1[2];
//   out[2] = in0[1] - in1[2];
//   out[3] = in0[0] - in1[3];
//   out[4] = in0[7] - in1[4];
//   out[5] = in0[6] - in1[5];
//   out[6] = in0[6] + in1[5];
//   out[7] = in0[7] + in1[4];
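//
// In scalar form, for 0 <= i < n/4:
//   out[i]         = in0[i] + in1[n/2 - 1 - i]
//   out[n/4 + i]   = in0[n/4 - 1 - i] - in1[n/4 + i]
//   out[n/2 + i]   = in0[n - 1 - i] - in1[n/2 + i]
//   out[3*n/4 + i] = in0[3*n/4 + i] + in1[3*n/4 - 1 - i]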

static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0,
                                                       const int16x4_t *in1,
                                                       int16x4_t *output,
                                                       int n) {
  for (int i = 0; i < n / 4; ++i) {
    output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[(3 * n) / 4 + i] =
        vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
  }
}

static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0,
                                                       const int16x8_t *in1,
                                                       int16x8_t *output,
                                                       int n) {
  for (int i = 0; i < n / 4; ++i) {
    output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[(3 * n) / 4 + i] =
        vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
  }
}

static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0,
                                                       const int32x4_t *in1,
                                                       int32x4_t *output,
                                                       int n) {
  for (int i = 0; i < n / 4; ++i) {
    output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[(3 * n) / 4 + i] =
        vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
  }
}

static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input,
                                          int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);

  // stage 1
  int16x8_t x1[4];
  butterfly_dct_pre_s16_x8(input, x1, 4);

  // stage 2
  int16x8_t x2[4];
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]);

  // stage 3
  output[0] = x2[0];
  output[1] = x2[2];
  output[2] = x2[1];
  output[3] = x2[3];
}

static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input,
                                          int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);

  // stage 1
  int16x4_t x1[8];
  butterfly_dct_pre_s16_x4(input, x1, 8);

  // stage 2
  int16x4_t x2[8];
  butterfly_dct_pre_s16_x4(x1, x2, 4);
  butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);

  // stage 3
  int16x4_t x3[8];
  butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
  butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4);

  // stage 4-5
  butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
}

static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input,
                                          int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);

  // stage 1
  int16x8_t x1[8];
  butterfly_dct_pre_s16_x8(input, x1, 8);

  // stage 2
  int16x8_t x2[8];
  butterfly_dct_pre_s16_x8(x1, x2, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);

  // stage 3
  int16x8_t x3[8];
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
  butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4);

  // stage 4-5
  butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
}

static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 1
  int16x4_t x1[16];
  butterfly_dct_pre_s16_x4(input, x1, 16);

  // stage 2
  int16x4_t x2[16];
  butterfly_dct_pre_s16_x4(x1, x2, 8);
  butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
  butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);

  // stage 3
  int16x4_t x3[16];
  butterfly_dct_pre_s16_x4(x2, x3, 4);
  butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
  butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8);

  // stage 4
  int16x4_t x4[16];
  butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4],
                                 &output[12]);
  butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
  butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);

  // stage 5
  int16x4_t x5[16];
  butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10],
                                 &output[6]);
  butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4);
  butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4);

  // stage 6-7
  butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1],
                                 &output[15]);
  butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9],
                                 &output[7]);
  butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5],
                                 &output[11]);
  butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13],
                                 &output[3]);
}

static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 1
  int16x8_t x1[16];
  butterfly_dct_pre_s16_x8(input, x1, 16);

  // stage 2
  int16x8_t x2[16];
  butterfly_dct_pre_s16_x8(x1, x2, 8);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);

  // stage 3
  int16x8_t x3[16];
  butterfly_dct_pre_s16_x8(x2, x3, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
  butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8);

  // stage 4
  int16x8_t x4[16];
  butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4],
                                 &output[12]);
  butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);

  // stage 5
  int16x8_t x5[16];
  butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10],
                                 &output[6]);
  butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4);
  butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4);

  // stage 6-7
  butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1],
                                 &output[15]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9],
                                 &output[7]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5],
                                 &output[11]);
  butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13],
                                 &output[3]);
}

static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // stage 1
  int16x8_t x1[32];
  butterfly_dct_pre_s16_x8(input, x1, 32);

  // stage 2
  int16x8_t x2[32];
  butterfly_dct_pre_s16_x8(x1, x2, 16);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]);

  // stage 3
  int16x8_t x3[32];
  butterfly_dct_pre_s16_x8(x2, x3, 8);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]);
  butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16);

  // stage 4
  int16x8_t x4[32];
  butterfly_dct_pre_s16_x8(x3, x4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]);
  butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]);

  // stage 5
  int16x8_t x5[32];
  butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0],
                                 &output[16]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8],
                                 &output[24]);
  butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]);
  butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8);
  butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8);

  // stage 6
  int16x8_t x6[32];
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20],
                                 &output[12]);
  butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4);
  butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4);
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]);
  butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]);
  butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]);

  // stage 7
  int16x8_t x7[32];
  butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2],
                                 &output[30]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18],
                                 &output[14]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10],
                                 &output[22]);
  butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26],
                                 &output[6]);
  butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4);
  butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4);
  butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4);
  butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4);

  butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1],
                                 &output[31]);
  butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17],
                                 &output[15]);
  butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9],
                                 &output[23]);
  butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25],
                                 &output[7]);
  butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5],
                                 &output[27]);
  butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21],
                                 &output[11]);
  butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13],
                                 &output[19]);
  butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29],
                                 &output[3]);
}

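// Forward 64-point DCT. The odd-indexed outputs come straight out of the
// stage-10 butterflies; stage 11 then scatters the even-indexed outputs kept
// from earlier stages.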
static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
  const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
  const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
  const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
  const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
  const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
  const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
  const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
  const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
  const int16x4_t cospi1 = vget_low_s16(cospi1_3);
  const int16x4_t cospi3 = vget_high_s16(cospi1_3);
  const int16x4_t cospi5 = vget_low_s16(cospi5_7);
  const int16x4_t cospi7 = vget_high_s16(cospi5_7);
  const int16x4_t cospi9 = vget_low_s16(cospi9_11);
  const int16x4_t cospi11 = vget_high_s16(cospi9_11);
  const int16x4_t cospi13 = vget_low_s16(cospi13_15);
  const int16x4_t cospi15 = vget_high_s16(cospi13_15);
  const int16x4_t cospi17 = vget_low_s16(cospi17_19);
  const int16x4_t cospi19 = vget_high_s16(cospi17_19);
  const int16x4_t cospi21 = vget_low_s16(cospi21_23);
  const int16x4_t cospi23 = vget_high_s16(cospi21_23);
  const int16x4_t cospi25 = vget_low_s16(cospi25_27);
  const int16x4_t cospi27 = vget_high_s16(cospi25_27);
  const int16x4_t cospi29 = vget_low_s16(cospi29_31);
  const int16x4_t cospi31 = vget_high_s16(cospi29_31);

  // stage 1
  int16x8_t x1[64];
  butterfly_dct_pre_s16_x8(input, x1, 64);

  // stage 2
  int16x8_t x2[64];
  butterfly_dct_pre_s16_x8(x1, x2, 32);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);

  // stage 3
  int16x8_t x3[64];
  butterfly_dct_pre_s16_x8(x2, x3, 16);
  x3[16] = x2[16];
  x3[17] = x2[17];
  x3[18] = x2[18];
  x3[19] = x2[19];
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
  x3[28] = x2[28];
  x3[29] = x2[29];
  x3[30] = x2[30];
  x3[31] = x2[31];
  butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32);

  // stage 4
  int16x8_t x4[64];
  butterfly_dct_pre_s16_x8(x3, x4, 8);
  butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
  butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);

  // stage 5
  int16x8_t x5[64];
  butterfly_dct_pre_s16_x8(x4, x5, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
  butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8);
  butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
  butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16);
  butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16);

  // stage 6
  int16x8_t x6[64];
  butterfly_s16_s32_x8_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
  butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
  butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8);
  butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8);
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
  butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
  butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
  butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
  butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);

  // stage 7
  int16x8_t x7[64];
  butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
  butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4);
  butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4);
  butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
  butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
  butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
  butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8);
  butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8);
  butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8);
  butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8);

  // stage 8
  int16x8_t x8[64];
  butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
  butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
  butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4);
  butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4);
  butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4);
  butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4);
  butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
  butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
  butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
  butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
  butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
  butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);

  // stage 9
  int16x8_t x9[64];
  butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
  butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
  butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
  butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
  butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
  butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
  butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
  butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
  butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4);
  butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4);
  butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4);
  butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4);
  butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4);
  butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4);
  butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4);
  butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4);

  // stage 10
  butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1],
                                 &output[63]);
  butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33],
                                 &output[31]);
  butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17],
                                 &output[47]);
  butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49],
                                 &output[15]);
  butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9],
                                 &output[55]);
  butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41],
                                 &output[23]);
  butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25],
                                 &output[39]);
  butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57],
                                 &output[7]);
  butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5],
                                 &output[59]);
  butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37],
                                 &output[27]);
  butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21],
                                 &output[43]);
  butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53],
                                 &output[11]);
  butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13],
                                 &output[51]);
  butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45],
                                 &output[19]);
  butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29],
                                 &output[35]);
  butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61],
                                 &output[3]);

  // stage 11
  output[0] = x6[0];
  output[2] = x9[16];
  output[4] = x8[8];
  output[6] = x9[24];
  output[8] = x7[4];
  output[10] = x9[20];
  output[12] = x8[12];
  output[14] = x9[28];
  output[16] = x6[2];
  output[18] = x9[18];
  output[20] = x8[10];
  output[22] = x9[26];
  output[24] = x7[6];
  output[26] = x9[22];
  output[28] = x8[14];
  output[30] = x9[30];
  output[32] = x6[1];
  output[34] = x9[17];
  output[36] = x8[9];
  output[38] = x9[25];
  output[40] = x7[5];
  output[42] = x9[21];
  output[44] = x8[13];
  output[46] = x9[29];
  output[48] = x6[3];
  output[50] = x9[19];
  output[52] = x8[11];
1164 output[54] = x9[27];
1165 output[56] = x7[7];
1166 output[58] = x9[23];
1167 output[60] = x8[15];
1168 output[62] = x9[31];
1169 }
1170
fadst8x8_neon(const int16x8_t * input,int16x8_t * output,int cos_bit)1171 static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
1172 int16x8_t *output, int cos_bit) {
1173 const int16_t *cospi = cospi_arr_q13(cos_bit);
1174
1175 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
1176 const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
1177 const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
1178
1179 const int16x4_t cospi32 = vget_low_s16(cospi32_16);
1180 const int16x4_t cospi16 = vget_high_s16(cospi32_16);
1181 const int16x4_t cospi4 = vget_low_s16(cospi4_12);
1182 const int16x4_t cospi12 = vget_high_s16(cospi4_12);
1183 const int16x4_t cospi20 = vget_low_s16(cospi20_28);
1184 const int16x4_t cospi28 = vget_high_s16(cospi20_28);
1185
1186 // stage 2
1187 int16x8_t x2[8];
1188 butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
1189 butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
1190
1191 // stage 3
1192 int16x8_t x3[8];
1193 x3[0] = vqaddq_s16(input[0], x2[2]);
1194 x3[1] = vqsubq_s16(x2[3], input[7]);
1195 x3[2] = vqsubq_s16(input[0], x2[2]);
1196 x3[3] = vqaddq_s16(input[7], x2[3]);
1197 x3[4] = vqsubq_s16(x2[6], input[1]);
1198 x3[5] = vqaddq_s16(input[6], x2[7]);
1199 x3[6] = vqaddq_s16(input[1], x2[6]);
1200 x3[7] = vqsubq_s16(input[6], x2[7]);
1201
1202 // stage 4
1203 butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
1204 butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
1205
1206 // stage 5
1207 int16x8_t x5[8];
1208 x5[0] = vqaddq_s16(x3[0], x3[4]);
1209 x5[1] = vqaddq_s16(x3[1], x3[5]);
1210 x5[2] = vqaddq_s16(x3[2], x3[6]);
1211 x5[3] = vqsubq_s16(x3[7], x3[3]);
1212 x5[4] = vqsubq_s16(x3[0], x3[4]);
1213 x5[5] = vqsubq_s16(x3[1], x3[5]);
1214 x5[6] = vqsubq_s16(x3[2], x3[6]);
1215 x5[7] = vqaddq_s16(x3[3], x3[7]);
1216
1217 // stage 6
1218 butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
1219 butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
1220 butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
1221 butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
1222 }
1223
fadst4x16_neon(const int16x4_t * input,int16x4_t * output,int cos_bit)1224 static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
1225 int16x4_t *output, int cos_bit) {
1226 const int16_t *cospi = cospi_arr_q13(cos_bit);
1227
1228 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
1229 const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
1230 const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
1231 const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
1232 const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
1233 const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
1234
1235 const int16x4_t cospi32 = vget_low_s16(cospi32_16);
1236 const int16x4_t cospi16 = vget_high_s16(cospi32_16);
1237 const int16x4_t cospi8 = vget_low_s16(cospi8_24);
1238 const int16x4_t cospi24 = vget_high_s16(cospi8_24);
1239 const int16x4_t cospi2 = vget_low_s16(cospi2_6);
1240 const int16x4_t cospi6 = vget_high_s16(cospi2_6);
1241 const int16x4_t cospi10 = vget_low_s16(cospi10_14);
1242 const int16x4_t cospi14 = vget_high_s16(cospi10_14);
1243 const int16x4_t cospi18 = vget_low_s16(cospi18_22);
1244 const int16x4_t cospi22 = vget_high_s16(cospi18_22);
1245 const int16x4_t cospi26 = vget_low_s16(cospi26_30);
1246 const int16x4_t cospi30 = vget_high_s16(cospi26_30);
1247
1248 // stage 2
1249 int16x4_t x2[8];
1250 butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
1251 butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
1252 butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
1253 butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
1254
1255 // stage 3
1256 int16x4_t x3[16];
1257 x3[0] = vqadd_s16(input[0], x2[0]);
1258 x3[1] = vqsub_s16(x2[1], input[15]);
1259 x3[2] = vqsub_s16(input[0], x2[0]);
1260 x3[3] = vqadd_s16(input[15], x2[1]);
1261 x3[4] = vqsub_s16(x2[2], input[3]);
1262 x3[5] = vqadd_s16(input[12], x2[3]);
1263 x3[6] = vqadd_s16(input[3], x2[2]);
1264 x3[7] = vqsub_s16(input[12], x2[3]);
1265 x3[8] = vqsub_s16(x2[4], input[1]);
1266 x3[9] = vqadd_s16(input[14], x2[5]);
1267 x3[10] = vqadd_s16(input[1], x2[4]);
1268 x3[11] = vqsub_s16(input[14], x2[5]);
1269 x3[12] = vqadd_s16(input[2], x2[6]);
1270 x3[13] = vqsub_s16(x2[7], input[13]);
1271 x3[14] = vqsub_s16(input[2], x2[6]);
1272 x3[15] = vqadd_s16(input[13], x2[7]);
1273
1274 // stage 4
1275 butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
1276 butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
1277 butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
1278 butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
1279
1280 // stage 5
1281 int16x4_t x5[16];
1282 x5[0] = vqadd_s16(x3[0], x3[4]);
1283 x5[1] = vqadd_s16(x3[1], x3[5]);
1284 x5[2] = vqadd_s16(x3[2], x3[6]);
1285 x5[3] = vqsub_s16(x3[7], x3[3]);
1286 x5[4] = vqsub_s16(x3[0], x3[4]);
1287 x5[5] = vqsub_s16(x3[1], x3[5]);
1288 x5[6] = vqsub_s16(x3[2], x3[6]);
1289 x5[7] = vqadd_s16(x3[3], x3[7]);
1290 x5[8] = vqadd_s16(x3[8], x3[12]);
1291 x5[9] = vqadd_s16(x3[9], x3[13]);
1292 x5[10] = vqsub_s16(x3[14], x3[10]);
1293 x5[11] = vqadd_s16(x3[11], x3[15]);
1294 x5[12] = vqsub_s16(x3[8], x3[12]);
1295 x5[13] = vqsub_s16(x3[9], x3[13]);
1296 x5[14] = vqadd_s16(x3[10], x3[14]);
1297 x5[15] = vqsub_s16(x3[11], x3[15]);
1298
1299 // stage 6
1300 butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
1301 butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
1302 butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
1303 butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
1304
1305 // stage 7
1306 int16x4_t x7[16];
1307 x7[0] = vqadd_s16(x5[0], x5[8]);
1308 x7[1] = vqadd_s16(x5[1], x5[9]);
1309 x7[2] = vqadd_s16(x5[2], x5[10]);
1310 x7[3] = vqadd_s16(x5[3], x5[11]);
1311 x7[4] = vqadd_s16(x5[4], x5[12]);
1312 x7[5] = vqadd_s16(x5[5], x5[13]);
1313 x7[6] = vqadd_s16(x5[6], x5[14]);
1314 x7[7] = vqsub_s16(x5[15], x5[7]);
1315 x7[8] = vqsub_s16(x5[0], x5[8]);
1316 x7[9] = vqsub_s16(x5[1], x5[9]);
1317 x7[10] = vqsub_s16(x5[2], x5[10]);
1318 x7[11] = vqsub_s16(x5[3], x5[11]);
1319 x7[12] = vqsub_s16(x5[4], x5[12]);
1320 x7[13] = vqsub_s16(x5[5], x5[13]);
1321 x7[14] = vqsub_s16(x5[6], x5[14]);
1322 x7[15] = vqadd_s16(x5[7], x5[15]);
1323
1324 // stage 8
1325 butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
1326 butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13],
1327 &output[2]);
1328 butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11],
1329 &output[4]);
1330 butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
1331 butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
1332 butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5],
1333 &output[10]);
1334 butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3],
1335 &output[12]);
1336 butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14],
1337 &output[1]);
1338 }
1339
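// Identical dataflow to fadst4x16_neon above, but processing eight columns
// per call on int16x8_t vectors via the x8 butterfly helpers.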
1340 static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
1341 int16x8_t *output, int cos_bit) {
1342 const int16_t *cospi = cospi_arr_q13(cos_bit);
1343
1344 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
1345 const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
1346 const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
1347 const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
1348 const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
1349 const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
1350
1351 const int16x4_t cospi32 = vget_low_s16(cospi32_16);
1352 const int16x4_t cospi16 = vget_high_s16(cospi32_16);
1353 const int16x4_t cospi8 = vget_low_s16(cospi8_24);
1354 const int16x4_t cospi24 = vget_high_s16(cospi8_24);
1355 const int16x4_t cospi2 = vget_low_s16(cospi2_6);
1356 const int16x4_t cospi6 = vget_high_s16(cospi2_6);
1357 const int16x4_t cospi10 = vget_low_s16(cospi10_14);
1358 const int16x4_t cospi14 = vget_high_s16(cospi10_14);
1359 const int16x4_t cospi18 = vget_low_s16(cospi18_22);
1360 const int16x4_t cospi22 = vget_high_s16(cospi18_22);
1361 const int16x4_t cospi26 = vget_low_s16(cospi26_30);
1362 const int16x4_t cospi30 = vget_high_s16(cospi26_30);
1363
1364 // stage 2
1365 int16x8_t x2[8];
1366 butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
1367 butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
1368 butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
1369 butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
1370
1371 // stage 3
1372 int16x8_t x3[16];
1373 x3[0] = vqaddq_s16(input[0], x2[0]);
1374 x3[1] = vqsubq_s16(x2[1], input[15]);
1375 x3[2] = vqsubq_s16(input[0], x2[0]);
1376 x3[3] = vqaddq_s16(input[15], x2[1]);
1377 x3[4] = vqsubq_s16(x2[2], input[3]);
1378 x3[5] = vqaddq_s16(input[12], x2[3]);
1379 x3[6] = vqaddq_s16(input[3], x2[2]);
1380 x3[7] = vqsubq_s16(input[12], x2[3]);
1381 x3[8] = vqsubq_s16(x2[4], input[1]);
1382 x3[9] = vqaddq_s16(input[14], x2[5]);
1383 x3[10] = vqaddq_s16(input[1], x2[4]);
1384 x3[11] = vqsubq_s16(input[14], x2[5]);
1385 x3[12] = vqaddq_s16(input[2], x2[6]);
1386 x3[13] = vqsubq_s16(x2[7], input[13]);
1387 x3[14] = vqsubq_s16(input[2], x2[6]);
1388 x3[15] = vqaddq_s16(input[13], x2[7]);
1389
1390 // stage 4
1391 butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
1392 butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
1393 butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
1394 butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
1395
1396 // stage 5
1397 int16x8_t x5[16];
1398 x5[0] = vqaddq_s16(x3[0], x3[4]);
1399 x5[1] = vqaddq_s16(x3[1], x3[5]);
1400 x5[2] = vqaddq_s16(x3[2], x3[6]);
1401 x5[3] = vqsubq_s16(x3[7], x3[3]);
1402 x5[4] = vqsubq_s16(x3[0], x3[4]);
1403 x5[5] = vqsubq_s16(x3[1], x3[5]);
1404 x5[6] = vqsubq_s16(x3[2], x3[6]);
1405 x5[7] = vqaddq_s16(x3[3], x3[7]);
1406 x5[8] = vqaddq_s16(x3[8], x3[12]);
1407 x5[9] = vqaddq_s16(x3[9], x3[13]);
1408 x5[10] = vqsubq_s16(x3[14], x3[10]);
1409 x5[11] = vqaddq_s16(x3[11], x3[15]);
1410 x5[12] = vqsubq_s16(x3[8], x3[12]);
1411 x5[13] = vqsubq_s16(x3[9], x3[13]);
1412 x5[14] = vqaddq_s16(x3[10], x3[14]);
1413 x5[15] = vqsubq_s16(x3[11], x3[15]);
1414
1415 // stage 6
1416 butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
1417 butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
1418 butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
1419 butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
1420
1421 // stage 7
1422 int16x8_t x7[16];
1423 x7[0] = vqaddq_s16(x5[0], x5[8]);
1424 x7[1] = vqaddq_s16(x5[1], x5[9]);
1425 x7[2] = vqaddq_s16(x5[2], x5[10]);
1426 x7[3] = vqaddq_s16(x5[3], x5[11]);
1427 x7[4] = vqaddq_s16(x5[4], x5[12]);
1428 x7[5] = vqaddq_s16(x5[5], x5[13]);
1429 x7[6] = vqaddq_s16(x5[6], x5[14]);
1430 x7[7] = vqsubq_s16(x5[15], x5[7]);
1431 x7[8] = vqsubq_s16(x5[0], x5[8]);
1432 x7[9] = vqsubq_s16(x5[1], x5[9]);
1433 x7[10] = vqsubq_s16(x5[2], x5[10]);
1434 x7[11] = vqsubq_s16(x5[3], x5[11]);
1435 x7[12] = vqsubq_s16(x5[4], x5[12]);
1436 x7[13] = vqsubq_s16(x5[5], x5[13]);
1437 x7[14] = vqsubq_s16(x5[6], x5[14]);
1438 x7[15] = vqaddq_s16(x5[7], x5[15]);
1439
1440 // stage 8
1441 butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
1442 butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13],
1443 &output[2]);
1444 butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11],
1445 &output[4]);
1446 butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
1447 butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
1448 butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5],
1449 &output[10]);
1450 butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3],
1451 &output[12]);
1452 butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14],
1453 &output[1]);
1454 }
1455
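// Forward identity (IDTX) transforms. These are pure rescales: the N-point
// identity multiplies by sqrt(N / 2), i.e. sqrt(2) for N = 4, 2 for N = 8,
// 2 * sqrt(2) for N = 16 and 4 for N = 32. The power-of-two factors reduce
// to left shifts; the irrational ones use rounding multiplies by a Q-format
// sqrt(2) constant.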
1456 static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input,
1457 int16x4_t *const output,
1458 const int cos_bit) {
1459 (void)cos_bit;
1460 round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4);
1461 }
1462
1463 static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input,
1464 int16x8_t *const output,
1465 const int cos_bit) {
1466 (void)cos_bit;
1467 round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4);
1468 }
1469
1470 static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input,
1471 int16x4_t *output, int cos_bit) {
1472 (void)cos_bit;
1473 shift_left_1_s16_x4(input, output, 8);
1474 }
1475
1476 static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input,
1477 int16x8_t *output, int cos_bit) {
1478 (void)cos_bit;
1479 shift_left_1_s16_x8(input, output, 8);
1480 }
1481
1482 static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input,
1483 int16x4_t *output,
1484 int cos_bit) {
1485 (void)cos_bit;
1486 round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16);
1487 }
1488
1489 static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input,
1490 int16x8_t *output,
1491 int cos_bit) {
1492 (void)cos_bit;
1493 round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16);
1494 }
1495
1496 static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input,
1497 int16x8_t *output,
1498 int cos_bit) {
1499 (void)cos_bit;
1500 shift_left_2_s16_x8(input, output, 32);
1501 }
1502
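// Stamp out the column-transform wrappers: load n rows from the source
// buffer, apply the stage-0 scaling (<< 2), then run the named 1-D
// transform over the columns.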
1503 #define TRANSFORM_COL(name, tw, n) \
1504 static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \
1505 int stride, int cos_bit) { \
1506 int16x##tw##_t buf0[n]; \
1507 load_buffer_s16_x##tw(input, stride, buf0, n); \
1508 shift_left_2_s16_x##tw(buf0, buf0, n); \
1509 name##_neon(buf0, output, cos_bit); \
1510 }
1511
1512 TRANSFORM_COL(fadst4x4, 4, 4)
1513 TRANSFORM_COL(fadst4x8, 4, 8)
1514 TRANSFORM_COL(fadst4x16, 4, 16)
1515 TRANSFORM_COL(fadst8x4, 8, 4)
1516 TRANSFORM_COL(fadst8x8, 8, 8)
1517 TRANSFORM_COL(fadst8x16, 8, 16)
1518 TRANSFORM_COL(fdct4x4, 4, 4)
1519 TRANSFORM_COL(fdct4x8, 4, 8)
1520 TRANSFORM_COL(fdct4x16, 4, 16)
1521 TRANSFORM_COL(fdct8x4, 8, 4)
1522 TRANSFORM_COL(fdct8x8, 8, 8)
1523 TRANSFORM_COL(fdct8x16, 8, 16)
1524 TRANSFORM_COL(fdct8x32, 8, 32)
1525 TRANSFORM_COL(fidentity4x4, 4, 4)
1526 TRANSFORM_COL(fidentity4x8, 4, 8)
1527 TRANSFORM_COL(fidentity4x16, 4, 16)
1528 TRANSFORM_COL(fidentity8x4, 8, 4)
1529 TRANSFORM_COL(fidentity8x8, 8, 8)
1530 TRANSFORM_COL(fidentity8x16, 8, 16)
1531 TRANSFORM_COL(fidentity8x32, 8, 32)
1532
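// Stamp out the row-transform wrappers: run the named 1-D transform, then
// widen and store the results into the int32 coefficient buffer. The _RECT
// variants store through store_rect_buffer_s16_x*, which also applies the
// sqrt(2)-based rescale used for rectangular (non-square) block sizes.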
1533 #define TRANSFORM_ROW(name, tw, n) \
1534 static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \
1535 int stride, int cos_bit) { \
1536 int16x##tw##_t buf0[n]; \
1537 name##_neon(input, buf0, cos_bit); \
1538 store_buffer_s16_x##tw(buf0, output, stride, n); \
1539 }
1540
1541 #define TRANSFORM_ROW_RECT(name, tw, n) \
1542 static void name##_row_rect_neon(const int16x##tw##_t *input, \
1543 int32_t *output, int stride, int cos_bit) { \
1544 int16x##tw##_t buf0[n]; \
1545 name##_neon(input, buf0, cos_bit); \
1546 store_rect_buffer_s16_x##tw(buf0, output, stride, n); \
1547 }
1548
1549 TRANSFORM_ROW(fadst4x4, 4, 4)
1550 TRANSFORM_ROW(fadst4x16, 4, 16)
1551 TRANSFORM_ROW(fadst8x4, 8, 4)
1552 TRANSFORM_ROW(fadst8x8, 8, 8)
1553 TRANSFORM_ROW(fadst8x16, 8, 16)
1554 TRANSFORM_ROW(fdct4x4, 4, 4)
1555 TRANSFORM_ROW(fdct4x16, 4, 16)
1556 TRANSFORM_ROW(fdct8x4, 8, 4)
1557 TRANSFORM_ROW(fdct8x8, 8, 8)
1558 TRANSFORM_ROW(fdct8x16, 8, 16)
1559 TRANSFORM_ROW(fdct8x32, 8, 32)
1560 TRANSFORM_ROW(fidentity4x4, 4, 4)
1561 TRANSFORM_ROW(fidentity4x16, 4, 16)
1562 TRANSFORM_ROW(fidentity8x4, 8, 4)
1563 TRANSFORM_ROW(fidentity8x8, 8, 8)
1564 TRANSFORM_ROW(fidentity8x16, 8, 16)
1565 TRANSFORM_ROW(fidentity8x32, 8, 32)
1566
1567 TRANSFORM_ROW_RECT(fadst4x8, 4, 8)
1568 TRANSFORM_ROW_RECT(fadst8x4, 8, 4)
1569 TRANSFORM_ROW_RECT(fadst8x8, 8, 8)
1570 TRANSFORM_ROW_RECT(fadst8x16, 8, 16)
1571 TRANSFORM_ROW_RECT(fdct4x8, 4, 8)
1572 TRANSFORM_ROW_RECT(fdct8x4, 8, 4)
1573 TRANSFORM_ROW_RECT(fdct8x8, 8, 8)
1574 TRANSFORM_ROW_RECT(fdct8x16, 8, 16)
1575 TRANSFORM_ROW_RECT(fdct8x32, 8, 32)
1576 TRANSFORM_ROW_RECT(fidentity4x8, 4, 8)
1577 TRANSFORM_ROW_RECT(fidentity8x4, 8, 4)
1578 TRANSFORM_ROW_RECT(fidentity8x8, 8, 8)
1579 TRANSFORM_ROW_RECT(fidentity8x16, 8, 16)
1580 TRANSFORM_ROW_RECT(fidentity8x32, 8, 32)
1581
1582 typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input,
1583 int16x4_t *output, int cos_bit);
1584 typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input,
1585 int16x8_t *output, int cos_bit);
1586
1587 typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input,
1588 int16x4_t *output, int stride,
1589 int cos_bit);
1590 typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input,
1591 int16x8_t *output, int stride,
1592 int cos_bit);
1593
1594 typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input,
1595 int32_t *output, int stride,
1596 int cos_bit);
1597 typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input,
1598 int32_t *output, int stride,
1599 int cos_bit);
1600
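// Per-TX_TYPE dispatch tables for the 1-D column and row kernels. FLIPADST
// entries reuse the ADST kernels: vertical flips are handled up front by
// adjusting the input pointer and stride, horizontal flips by reversing the
// transposed buffer before the row transform (see the 2-D wrappers below).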
1601 static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
1602 fdct4x8_col_neon, // DCT_DCT
1603 fadst4x8_col_neon, // ADST_DCT
1604 fdct4x8_col_neon, // DCT_ADST
1605 fadst4x8_col_neon, // ADST_ADST
1606 fadst4x8_col_neon, // FLIPADST_DCT
1607 fdct4x8_col_neon, // DCT_FLIPADST
1608 fadst4x8_col_neon, // FLIPADST_FLIPADST
1609 fadst4x8_col_neon, // ADST_FLIPADST
1610 fadst4x8_col_neon, // FLIPADST_ADST
1611 fidentity4x8_col_neon, // IDTX
1612 fdct4x8_col_neon, // V_DCT
1613 fidentity4x8_col_neon, // H_DCT
1614 fadst4x8_col_neon, // V_ADST
1615 fidentity4x8_col_neon, // H_ADST
1616 fadst4x8_col_neon, // V_FLIPADST
1617 fidentity4x8_col_neon // H_FLIPADST
1618 };
1619
1620 static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
1621 fdct8x4_row_neon, // DCT_DCT
1622 fdct8x4_row_neon, // ADST_DCT
1623 fadst8x4_row_neon, // DCT_ADST
1624 fadst8x4_row_neon, // ADST_ADST
1625 fdct8x4_row_neon, // FLIPADST_DCT
1626 fadst8x4_row_neon, // DCT_FLIPADST
1627 fadst8x4_row_neon, // FLIPADST_FLIPADST
1628 fadst8x4_row_neon, // ADST_FLIPADST
1629 fadst8x4_row_neon, // FLIPADST_ADST
1630 fidentity8x4_row_neon, // IDTX
1631 fidentity8x4_row_neon, // V_DCT
1632 fdct8x4_row_neon, // H_DCT
1633 fidentity8x4_row_neon, // V_ADST
1634 fadst8x4_row_neon, // H_ADST
1635 fidentity8x4_row_neon, // V_FLIPADST
1636 fadst8x4_row_neon // H_FLIPADST
1637 };
1638
1639 static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = {
1640 fdct8x4_row_rect_neon, // DCT_DCT
1641 fdct8x4_row_rect_neon, // ADST_DCT
1642 fadst8x4_row_rect_neon, // DCT_ADST
1643 fadst8x4_row_rect_neon, // ADST_ADST
1644 fdct8x4_row_rect_neon, // FLIPADST_DCT
1645 fadst8x4_row_rect_neon, // DCT_FLIPADST
1646 fadst8x4_row_rect_neon, // FLIPADST_FLIPADST
1647 fadst8x4_row_rect_neon, // ADST_FLIPADST
1648 fadst8x4_row_rect_neon, // FLIPADST_ADST
1649 fidentity8x4_row_rect_neon, // IDTX
1650 fidentity8x4_row_rect_neon, // V_DCT
1651 fdct8x4_row_rect_neon, // H_DCT
1652 fidentity8x4_row_rect_neon, // V_ADST
1653 fadst8x4_row_rect_neon, // H_ADST
1654 fidentity8x4_row_rect_neon, // V_FLIPADST
1655 fadst8x4_row_rect_neon // H_FLIPADST
1656 };
1657
1658 static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
1659 fdct8x4_col_neon, // DCT_DCT
1660 fadst8x4_col_neon, // ADST_DCT
1661 fdct8x4_col_neon, // DCT_ADST
1662 fadst8x4_col_neon, // ADST_ADST
1663 fadst8x4_col_neon, // FLIPADST_DCT
1664 fdct8x4_col_neon, // DCT_FLIPADST
1665 fadst8x4_col_neon, // FLIPADST_FLIPADST
1666 fadst8x4_col_neon, // ADST_FLIPADST
1667 fadst8x4_col_neon, // FLIPADST_ADST
1668 fidentity8x4_col_neon, // IDTX
1669 fdct8x4_col_neon, // V_DCT
1670 fidentity8x4_col_neon, // H_DCT
1671 fadst8x4_col_neon, // V_ADST
1672 fidentity8x4_col_neon, // H_ADST
1673 fadst8x4_col_neon, // V_FLIPADST
1674 fidentity8x4_col_neon // H_FLIPADST
1675 };
1676
1677 static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = {
1678 fdct4x8_row_rect_neon, // DCT_DCT
1679 fdct4x8_row_rect_neon, // ADST_DCT
1680 fadst4x8_row_rect_neon, // DCT_ADST
1681 fadst4x8_row_rect_neon, // ADST_ADST
1682 fdct4x8_row_rect_neon, // FLIPADST_DCT
1683 fadst4x8_row_rect_neon, // DCT_FLIPADST
1684 fadst4x8_row_rect_neon, // FLIPADST_FLIPADST
1685 fadst4x8_row_rect_neon, // ADST_FLIPADST
1686 fadst4x8_row_rect_neon, // FLIPADST_ADST
1687 fidentity4x8_row_rect_neon, // IDTX
1688 fidentity4x8_row_rect_neon, // V_DCT
1689 fdct4x8_row_rect_neon, // H_DCT
1690 fidentity4x8_row_rect_neon, // V_ADST
1691 fadst4x8_row_rect_neon, // H_ADST
1692 fidentity4x8_row_rect_neon, // V_FLIPADST
1693 fadst4x8_row_rect_neon // H_FLIPADST
1694 };
1695
1696 static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
1697 fdct8x8_col_neon, // DCT_DCT
1698 fadst8x8_col_neon, // ADST_DCT
1699 fdct8x8_col_neon, // DCT_ADST
1700 fadst8x8_col_neon, // ADST_ADST
1701 fadst8x8_col_neon, // FLIPADST_DCT
1702 fdct8x8_col_neon, // DCT_FLIPADST
1703 fadst8x8_col_neon, // FLIPADST_FLIPADST
1704 fadst8x8_col_neon, // ADST_FLIPADST
1705 fadst8x8_col_neon, // FLIPADST_ADST
1706 fidentity8x8_col_neon, // IDTX
1707 fdct8x8_col_neon, // V_DCT
1708 fidentity8x8_col_neon, // H_DCT
1709 fadst8x8_col_neon, // V_ADST
1710 fidentity8x8_col_neon, // H_ADST
1711 fadst8x8_col_neon, // V_FLIPADST
1712 fidentity8x8_col_neon, // H_FLIPADST
1713 };
1714
1715 static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
1716 fdct8x8_row_neon, // DCT_DCT
1717 fdct8x8_row_neon, // ADST_DCT
1718 fadst8x8_row_neon, // DCT_ADST
1719 fadst8x8_row_neon, // ADST_ADST
1720 fdct8x8_row_neon, // FLIPADST_DCT
1721 fadst8x8_row_neon, // DCT_FLIPADST
1722 fadst8x8_row_neon, // FLIPADST_FLIPADST
1723 fadst8x8_row_neon, // ADST_FLIPADST
1724 fadst8x8_row_neon, // FLIPADST_ADST
1725 fidentity8x8_row_neon, // IDTX
1726 fidentity8x8_row_neon, // V_DCT
1727 fdct8x8_row_neon, // H_DCT
1728 fidentity8x8_row_neon, // V_ADST
1729 fadst8x8_row_neon, // H_ADST
1730 fidentity8x8_row_neon, // V_FLIPADST
1731 fadst8x8_row_neon // H_FLIPADST
1732 };
1733
1734 static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = {
1735 fdct8x8_row_rect_neon, // DCT_DCT
1736 fdct8x8_row_rect_neon, // ADST_DCT
1737 fadst8x8_row_rect_neon, // DCT_ADST
1738 fadst8x8_row_rect_neon, // ADST_ADST
1739 fdct8x8_row_rect_neon, // FLIPADST_DCT
1740 fadst8x8_row_rect_neon, // DCT_FLIPADST
1741 fadst8x8_row_rect_neon, // FLIPADST_FLIPADST
1742 fadst8x8_row_rect_neon, // ADST_FLIPADST
1743 fadst8x8_row_rect_neon, // FLIPADST_ADST
1744 fidentity8x8_row_rect_neon, // IDTX
1745 fidentity8x8_row_rect_neon, // V_DCT
1746 fdct8x8_row_rect_neon, // H_DCT
1747 fidentity8x8_row_rect_neon, // V_ADST
1748 fadst8x8_row_rect_neon, // H_ADST
1749 fidentity8x8_row_rect_neon, // V_FLIPADST
1750 fadst8x8_row_rect_neon // H_FLIPADST
1751 };
1752
1753 static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
1754 fdct4x16_col_neon, // DCT_DCT
1755 fadst4x16_col_neon, // ADST_DCT
1756 fdct4x16_col_neon, // DCT_ADST
1757 fadst4x16_col_neon, // ADST_ADST
1758 fadst4x16_col_neon, // FLIPADST_DCT
1759 fdct4x16_col_neon, // DCT_FLIPADST
1760 fadst4x16_col_neon, // FLIPADST_FLIPADST
1761 fadst4x16_col_neon, // ADST_FLIPADST
1762 fadst4x16_col_neon, // FLIPADST_ADST
1763 fidentity4x16_col_neon, // IDTX
1764 fdct4x16_col_neon, // V_DCT
1765 fidentity4x16_col_neon, // H_DCT
1766 fadst4x16_col_neon, // V_ADST
1767 fidentity4x16_col_neon, // H_ADST
1768 fadst4x16_col_neon, // V_FLIPADST
1769 fidentity4x16_col_neon // H_FLIPADST
1770 };
1771
1772 static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
1773 fdct4x16_row_neon, // DCT_DCT
1774 fdct4x16_row_neon, // ADST_DCT
1775 fadst4x16_row_neon, // DCT_ADST
1776 fadst4x16_row_neon, // ADST_ADST
1777 fdct4x16_row_neon, // FLIPADST_DCT
1778 fadst4x16_row_neon, // DCT_FLIPADST
1779 fadst4x16_row_neon, // FLIPADST_FLIPADST
1780 fadst4x16_row_neon, // ADST_FLIPADST
1781 fadst4x16_row_neon, // FLIPADST_ADST
1782 fidentity4x16_row_neon, // IDTX
1783 fidentity4x16_row_neon, // V_DCT
1784 fdct4x16_row_neon, // H_DCT
1785 fidentity4x16_row_neon, // V_ADST
1786 fadst4x16_row_neon, // H_ADST
1787 fidentity4x16_row_neon, // V_FLIPADST
1788 fadst4x16_row_neon // H_FLIPADST
1789 };
1790
1791 static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
1792 fdct8x16_col_neon, // DCT_DCT
1793 fadst8x16_col_neon, // ADST_DCT
1794 fdct8x16_col_neon, // DCT_ADST
1795 fadst8x16_col_neon, // ADST_ADST
1796 fadst8x16_col_neon, // FLIPADST_DCT
1797 fdct8x16_col_neon, // DCT_FLIPADST
1798 fadst8x16_col_neon, // FLIPADST_FLIPADST
1799 fadst8x16_col_neon, // ADST_FLIPADST
1800 fadst8x16_col_neon, // FLIPADST_ADST
1801 fidentity8x16_col_neon, // IDTX
1802 fdct8x16_col_neon, // V_DCT
1803 fidentity8x16_col_neon, // H_DCT
1804 fadst8x16_col_neon, // V_ADST
1805 fidentity8x16_col_neon, // H_ADST
1806 fadst8x16_col_neon, // V_FLIPADST
1807 fidentity8x16_col_neon // H_FLIPADST
1808 };
1809
1810 static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
1811 fdct8x16_row_neon, // DCT_DCT
1812 fdct8x16_row_neon, // ADST_DCT
1813 fadst8x16_row_neon, // DCT_ADST
1814 fadst8x16_row_neon, // ADST_ADST
1815 fdct8x16_row_neon, // FLIPADST_DCT
1816 fadst8x16_row_neon, // DCT_FLIPADST
1817 fadst8x16_row_neon, // FLIPADST_FLIPADST
1818 fadst8x16_row_neon, // ADST_FLIPADST
1819 fadst8x16_row_neon, // FLIPADST_ADST
1820 fidentity8x16_row_neon, // IDTX
1821 fidentity8x16_row_neon, // V_DCT
1822 fdct8x16_row_neon, // H_DCT
1823 fidentity8x16_row_neon, // V_ADST
1824 fadst8x16_row_neon, // H_ADST
1825 fidentity8x16_row_neon, // V_FLIPADST
1826 fadst8x16_row_neon // H_FLIPADST
1827 };
1828
1829 static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = {
1830 fdct8x16_row_rect_neon, // DCT_DCT
1831 fdct8x16_row_rect_neon, // ADST_DCT
1832 fadst8x16_row_rect_neon, // DCT_ADST
1833 fadst8x16_row_rect_neon, // ADST_ADST
1834 fdct8x16_row_rect_neon, // FLIPADST_DCT
1835 fadst8x16_row_rect_neon, // DCT_FLIPADST
1836 fadst8x16_row_rect_neon, // FLIPADST_FLIPADST
1837 fadst8x16_row_rect_neon, // ADST_FLIPADST
1838 fadst8x16_row_rect_neon, // FLIPADST_ADST
1839 fidentity8x16_row_rect_neon, // IDTX
1840 fidentity8x16_row_rect_neon, // V_DCT
1841 fdct8x16_row_rect_neon, // H_DCT
1842 fidentity8x16_row_rect_neon, // V_ADST
1843 fadst8x16_row_rect_neon, // H_ADST
1844 fidentity8x16_row_rect_neon, // V_FLIPADST
1845 fadst8x16_row_rect_neon // H_FLIPADST
1846 };
1847
1848 static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
1849 fdct8x32_row_neon, // DCT_DCT
1850 NULL, // ADST_DCT
1851 NULL, // DCT_ADST
1852 NULL, // ADST_ADST
1853 NULL, // FLIPADST_DCT
1854 NULL, // DCT_FLIPADST
1855 NULL, // FLIPADST_FLIPADST
1856 NULL, // ADST_FLIPADST
1857 NULL, // FLIPADST_ADST
1858 fidentity8x32_row_neon, // IDTX
1859 fidentity8x32_row_neon, // V_DCT
1860 fdct8x32_row_neon, // H_DCT
1861 NULL, // V_ADST
1862 NULL, // H_ADST
1863 NULL, // V_FLIPADST
1864 NULL // H_FLIPADST
1865 };
1866
1867 static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = {
1868 fdct8x32_row_rect_neon, // DCT_DCT
1869 NULL, // ADST_DCT
1870 NULL, // DCT_ADST
1871 NULL, // ADST_ADST
1872 NULL, // FLIPADST_DCT
1873 NULL, // DCT_FLIPADST
1874 NULL, // FLIPADST_FLIPADST
1875 NULL, // ADST_FLIPADST
1876 NULL, // FLIPADST_ADST
1877 fidentity8x32_row_rect_neon, // IDTX
1878 fidentity8x32_row_rect_neon, // V_DCT
1879 fdct8x32_row_rect_neon, // H_DCT
1880 NULL, // V_ADST
1881 NULL, // H_ADST
1882 NULL, // V_FLIPADST
1883 NULL // H_FLIPADST
1884 };
1885
1886 static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
1887 fdct8x32_col_neon, // DCT_DCT
1888 NULL, // ADST_DCT
1889 NULL, // DCT_ADST
1890 NULL, // ADST_ADST
1891 NULL, // FLIPADST_DCT
1892 NULL, // DCT_FLIPADST
1893 NULL, // FLIPADST_FLIPADST
1894 NULL, // ADST_FLIPADST
1895 NULL, // FLIPADST_ADST
1896 fidentity8x32_col_neon, // IDTX
1897 fdct8x32_col_neon, // V_DCT
1898 fidentity8x32_col_neon, // H_DCT
1899 NULL, // V_ADST
1900 NULL, // H_ADST
1901 NULL, // V_FLIPADST
1902 NULL // H_FLIPADST
1903 };
1904
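// 2-D forward transform wrappers. Every size follows the same pipeline:
//
//   col_txfm(input, buf0, stride, cos_bit);  // 1-D transform down columns
//   shift_right_*_round(buf0, ...);          // range adjustment (none 4x4)
//   transpose_arrays_s16_*(buf0, buf1);
//   flip_buf_*_neon(buf1, ...);              // only when lr_flip is set
//   row_txfm(buf1, output, ..., cos_bit);    // 1-D transform along rows
//
// The 4x4 case is unrolled per TX_TYPE; larger sizes dispatch through the
// tables above and tile the block in passes of four or eight columns.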
1905 static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
1906 int stride, TX_TYPE tx_type, int bd) {
1907 (void)bd;
1908 int ud_flip, lr_flip;
1909 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1910 ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
1911
1912 int16x4_t buf0[4], buf1[4];
1913 switch (tx_type) {
1914 case DCT_DCT:
1915 fdct4x4_col_neon(input, buf0, stride, 13);
1916 transpose_arrays_s16_4x4(buf0, buf1);
1917 fdct4x4_row_neon(buf1, output, 4, 13);
1918 break;
1919 case ADST_DCT:
1920 fadst4x4_col_neon(input, buf0, stride, 13);
1921 transpose_arrays_s16_4x4(buf0, buf1);
1922 fdct4x4_row_neon(buf1, output, 4, 13);
1923 break;
1924 case DCT_ADST:
1925 fdct4x4_col_neon(input, buf0, stride, 13);
1926 transpose_arrays_s16_4x4(buf0, buf1);
1927 fadst4x4_row_neon(buf1, output, 4, 13);
1928 break;
1929 case ADST_ADST:
1930 fadst4x4_col_neon(input, buf0, stride, 13);
1931 transpose_arrays_s16_4x4(buf0, buf1);
1932 fadst4x4_row_neon(buf1, output, 4, 13);
1933 break;
1934 case FLIPADST_DCT:
1935 fadst4x4_col_neon(input, buf0, stride, 13);
1936 transpose_arrays_s16_4x4(buf0, buf1);
1937 fdct4x4_row_neon(buf1, output, 4, 13);
1938 break;
1939 case DCT_FLIPADST:
1940 fdct4x4_col_neon(input, buf0, stride, 13);
1941 transpose_arrays_s16_4x4(buf0, buf1);
1942 flip_buf_4_neon(buf1, buf0, 4);
1943 fadst4x4_row_neon(buf0, output, 4, 13);
1944 break;
1945 case FLIPADST_FLIPADST:
1946 fadst4x4_col_neon(input, buf0, stride, 13);
1947 transpose_arrays_s16_4x4(buf0, buf1);
1948 flip_buf_4_neon(buf1, buf0, 4);
1949 fadst4x4_row_neon(buf0, output, 4, 13);
1950 break;
1951 case ADST_FLIPADST:
1952 fadst4x4_col_neon(input, buf0, stride, 13);
1953 transpose_arrays_s16_4x4(buf0, buf1);
1954 flip_buf_4_neon(buf1, buf0, 4);
1955 fadst4x4_row_neon(buf0, output, 4, 13);
1956 break;
1957 case FLIPADST_ADST:
1958 fadst4x4_col_neon(input, buf0, stride, 13);
1959 transpose_arrays_s16_4x4(buf0, buf1);
1960 fadst4x4_row_neon(buf1, output, 4, 13);
1961 break;
1962 case IDTX:
1963 fidentity4x4_col_neon(input, buf0, stride, 13);
1964 transpose_arrays_s16_4x4(buf0, buf1);
1965 fidentity4x4_row_neon(buf1, output, 4, 13);
1966 break;
1967 case V_DCT:
1968 fdct4x4_col_neon(input, buf0, stride, 13);
1969 transpose_arrays_s16_4x4(buf0, buf1);
1970 fidentity4x4_row_neon(buf1, output, 4, 13);
1971 break;
1972 case H_DCT:
1973 fidentity4x4_col_neon(input, buf0, stride, 13);
1974 transpose_arrays_s16_4x4(buf0, buf1);
1975 fdct4x4_row_neon(buf1, output, 4, 13);
1976 break;
1977 case V_ADST:
1978 fadst4x4_col_neon(input, buf0, stride, 13);
1979 transpose_arrays_s16_4x4(buf0, buf1);
1980 fidentity4x4_row_neon(buf1, output, 4, 13);
1981 break;
1982 case H_ADST:
1983 fidentity4x4_col_neon(input, buf0, stride, 13);
1984 transpose_arrays_s16_4x4(buf0, buf1);
1985 fadst4x4_row_neon(buf1, output, 4, 13);
1986 break;
1987 case V_FLIPADST:
1988 fadst4x4_col_neon(input, buf0, stride, 13);
1989 transpose_arrays_s16_4x4(buf0, buf1);
1990 fidentity4x4_row_neon(buf1, output, 4, 13);
1991 break;
1992 case H_FLIPADST:
1993 fidentity4x4_col_neon(input, buf0, stride, 13);
1994 transpose_arrays_s16_4x4(buf0, buf1);
1995 flip_buf_4_neon(buf1, buf0, 4);
1996 fadst4x4_row_neon(buf0, output, 4, 13);
1997 break;
1998 }
1999 }
2000
2001 static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
2002 int stride, TX_TYPE tx_type, int bd) {
2003 (void)bd;
2004 int16x4_t buf0[8];
2005 int16x8_t buf1[8];
2006 const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type];
2007 const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type];
2008
2009 int ud_flip, lr_flip;
2010 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2011 ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
2012 col_txfm(input, buf0, stride, 13);
2013 shift_right_1_round_s16_x4(buf0, buf0, 8);
2014 transpose_arrays_s16_4x8(buf0, buf1);
2015
2016 if (lr_flip) {
2017 int16x8_t buf2[8];
2018 flip_buf_8_neon(buf1, buf2, 4);
2019 row_txfm(buf2, output, 8, 13);
2020 } else {
2021 row_txfm(buf1, output, 8, 13);
2022 }
2023 }
2024
2025 static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
2026 int stride, TX_TYPE tx_type, int bd) {
2027 (void)bd;
2028 int16x4_t buf0[16];
2029 int16x8_t buf1[16];
2030 const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type];
2031 const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
2032 int ud_flip, lr_flip;
2033
2034 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2035 ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
2036 col_txfm(input, buf0, stride, 13);
2037 shift_right_1_round_s16_x4(buf0, buf0, 16);
2038 transpose_arrays_s16_4x8(buf0, buf1);
2039 transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8);
2040
2041 for (int i = 0; i < 2; i++) {
2042 if (lr_flip) {
2043 int16x8_t buf2[16];
2044 flip_buf_8_neon(buf1 + 8 * i, buf2, 4);
2045 row_txfm(buf2, output + 8 * i, 16, 12);
2046 } else {
2047 int16x8_t *buf = buf1 + 8 * i;
2048 row_txfm(buf, output + 8 * i, 16, 12);
2049 }
2050 }
2051 }
2052
2053 static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
2054 int stride, TX_TYPE tx_type, int bd) {
2055 (void)bd;
2056 int16x8_t buf0[8];
2057 int16x4_t buf1[8];
2058 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
2059 const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type];
2060 int ud_flip, lr_flip;
2061
2062 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2063 ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
2064 col_txfm(input, buf0, stride, 13);
2065 shift_right_1_round_s16_x8(buf0, buf0, 4);
2066 transpose_arrays_s16_8x4(buf0, buf1);
2067
2068 if (lr_flip) {
2069 int16x4_t buf2[8];
2070 flip_buf_4_neon(buf1, buf2, 8);
2071 row_txfm(buf2, output, 4, 13);
2072 } else {
2073 row_txfm(buf1, output, 4, 13);
2074 }
2075 }
2076
2077 static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
2078 int stride, TX_TYPE tx_type, int bd) {
2079 (void)bd;
2080 int ud_flip, lr_flip;
2081 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2082 ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
2083
2084 int16x8_t buf0[8], buf1[8];
2085
2086 switch (tx_type) {
2087 case DCT_DCT:
2088 fdct8x8_col_neon(input, buf0, stride, 13);
2089 shift_right_1_round_s16_x8(buf0, buf0, 8);
2090 transpose_arrays_s16_8x8(buf0, buf1);
2091 fdct8x8_row_neon(buf1, output, 8, 13);
2092 break;
2093 case ADST_DCT:
2094 fadst8x8_col_neon(input, buf0, stride, 13);
2095 shift_right_1_round_s16_x8(buf0, buf0, 8);
2096 transpose_arrays_s16_8x8(buf0, buf1);
2097 fdct8x8_row_neon(buf1, output, 8, 13);
2098 break;
2099 case DCT_ADST:
2100 fdct8x8_col_neon(input, buf0, stride, 13);
2101 shift_right_1_round_s16_x8(buf0, buf0, 8);
2102 transpose_arrays_s16_8x8(buf0, buf1);
2103 fadst8x8_row_neon(buf1, output, 8, 13);
2104 break;
2105 case ADST_ADST:
2106 fadst8x8_col_neon(input, buf0, stride, 13);
2107 shift_right_1_round_s16_x8(buf0, buf0, 8);
2108 transpose_arrays_s16_8x8(buf0, buf1);
2109 fadst8x8_row_neon(buf1, output, 8, 13);
2110 break;
2111 case FLIPADST_DCT:
2112 fadst8x8_col_neon(input, buf0, stride, 13);
2113 shift_right_1_round_s16_x8(buf0, buf0, 8);
2114 transpose_arrays_s16_8x8(buf0, buf1);
2115 fdct8x8_row_neon(buf1, output, 8, 13);
2116 break;
2117 case DCT_FLIPADST:
2118 fdct8x8_col_neon(input, buf0, stride, 13);
2119 shift_right_1_round_s16_x8(buf0, buf0, 8);
2120 transpose_arrays_s16_8x8(buf0, buf1);
2121 flip_buf_8_neon(buf1, buf0, 8);
2122 fadst8x8_row_neon(buf0, output, 8, 13);
2123 break;
2124 case FLIPADST_FLIPADST:
2125 fadst8x8_col_neon(input, buf0, stride, 13);
2126 shift_right_1_round_s16_x8(buf0, buf0, 8);
2127 transpose_arrays_s16_8x8(buf0, buf1);
2128 flip_buf_8_neon(buf1, buf0, 8);
2129 fadst8x8_row_neon(buf0, output, 8, 13);
2130 break;
2131 case ADST_FLIPADST:
2132 fadst8x8_col_neon(input, buf0, stride, 13);
2133 shift_right_1_round_s16_x8(buf0, buf0, 8);
2134 transpose_arrays_s16_8x8(buf0, buf1);
2135 flip_buf_8_neon(buf1, buf0, 8);
2136 fadst8x8_row_neon(buf0, output, 8, 13);
2137 break;
2138 case FLIPADST_ADST:
2139 fadst8x8_col_neon(input, buf0, stride, 13);
2140 shift_right_1_round_s16_x8(buf0, buf0, 8);
2141 transpose_arrays_s16_8x8(buf0, buf1);
2142 fadst8x8_row_neon(buf1, output, 8, 13);
2143 break;
2144 case IDTX:
2145 fidentity8x8_col_neon(input, buf0, stride, 13);
2146 shift_right_1_round_s16_x8(buf0, buf0, 8);
2147 transpose_arrays_s16_8x8(buf0, buf1);
2148 fidentity8x8_row_neon(buf1, output, 8, 13);
2149 break;
2150 case V_DCT:
2151 fdct8x8_col_neon(input, buf0, stride, 13);
2152 shift_right_1_round_s16_x8(buf0, buf0, 8);
2153 transpose_arrays_s16_8x8(buf0, buf1);
2154 fidentity8x8_row_neon(buf1, output, 8, 13);
2155 break;
2156 case H_DCT:
2157 fidentity8x8_col_neon(input, buf0, stride, 13);
2158 shift_right_1_round_s16_x8(buf0, buf0, 8);
2159 transpose_arrays_s16_8x8(buf0, buf1);
2160 fdct8x8_row_neon(buf1, output, 8, 13);
2161 break;
2162 case V_ADST:
2163 fadst8x8_col_neon(input, buf0, stride, 13);
2164 shift_right_1_round_s16_x8(buf0, buf0, 8);
2165 transpose_arrays_s16_8x8(buf0, buf1);
2166 fidentity8x8_row_neon(buf1, output, 8, 13);
2167 break;
2168 case H_ADST:
2169 fidentity8x8_col_neon(input, buf0, stride, 13);
2170 shift_right_1_round_s16_x8(buf0, buf0, 8);
2171 transpose_arrays_s16_8x8(buf0, buf1);
2172 fadst8x8_row_neon(buf1, output, 8, 13);
2173 break;
2174 case V_FLIPADST:
2175 fadst8x8_col_neon(input, buf0, stride, 13);
2176 shift_right_1_round_s16_x8(buf0, buf0, 8);
2177 transpose_arrays_s16_8x8(buf0, buf1);
2178 fidentity8x8_row_neon(buf1, output, 8, 13);
2179 break;
2180 case H_FLIPADST:
2181 fidentity8x8_col_neon(input, buf0, stride, 13);
2182 shift_right_1_round_s16_x8(buf0, buf0, 8);
2183 transpose_arrays_s16_8x8(buf0, buf1);
2184 flip_buf_8_neon(buf1, buf0, 8);
2185 fadst8x8_row_neon(buf0, output, 8, 13);
2186 break;
2187 }
2188 }
2189
2190 static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
2191 int stride, TX_TYPE tx_type, int bd) {
2192 (void)bd;
2193 int16x8_t buf0[16], buf1[16];
2194 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
2195 const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type];
2196 int ud_flip, lr_flip;
2197
2198 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2199 ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
2200 col_txfm(input, buf0, stride, 13);
2201 shift_right_2_round_s16_x8(buf0, buf0, 16);
2202 transpose_arrays_s16_8x8(buf0, buf1);
2203 transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
2204
2205 for (int i = 0; i < 2; i++) {
2206 if (lr_flip) {
2207 flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
2208 row_txfm(buf0, output + 8 * i, 16, 13);
2209 } else {
2210 int16x8_t *buf = buf1 + 8 * i;
2211 row_txfm(buf, output + 8 * i, 16, 13);
2212 }
2213 }
2214 }
2215
2216 static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
2217 int stride, TX_TYPE tx_type, int bd) {
2218 (void)bd;
2219 int16x8_t buf0[32], buf1[32];
2220 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
2221 const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
2222 int ud_flip, lr_flip;
2223
2224 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2225 ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
2226 col_txfm(input, buf0, stride, 12);
2227 shift_right_2_round_s16_x8(buf0, buf0, 32);
2228 transpose_arrays_s16_8x8(buf0, buf1);
2229 transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
2230 transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16);
2231 transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24);
2232
2233 for (int i = 0; i < 4; i++) {
2234 if (lr_flip) {
2235 flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
2236 row_txfm(buf0, output + 8 * i, 32, 12);
2237 } else {
2238 int16x8_t *buf = buf1 + 8 * i;
2239 row_txfm(buf, output + 8 * i, 32, 12);
2240 }
2241 }
2242 }
2243
2244 static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
2245 int stride, TX_TYPE tx_type, int bd) {
2246 (void)bd;
2247 int16x8_t buf0[16];
2248 int16x4_t buf1[16];
2249 int16x4_t buf2[16];
2250 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
2251 const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type];
2252 int ud_flip, lr_flip;
2253
2254 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2255 ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
2256 for (int i = 0; i < 2; i++) {
2257 col_txfm(input + 8 * i, buf0, stride, 13);
2258 shift_right_1_round_s16_x8(buf0, buf0, 4);
2259 transpose_arrays_s16_8x4(buf0, buf1 + 8 * i);
2260 }
2261
2262 if (lr_flip) {
2263 flip_buf_4_neon(buf1, buf2, 16);
2264 row_txfm(buf2, output, 4, 13);
2265 } else {
2266 row_txfm(buf1, output, 4, 13);
2267 }
2268 }
2269
2270 static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
2271 int stride, TX_TYPE tx_type, int bd) {
2272 (void)bd;
2273 int16x8_t buf0[16], buf1[16];
2274 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
2275 const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
2276 int ud_flip, lr_flip;
2277
2278 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2279 ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
2280 for (int i = 0; i < 2; i++) {
2281 col_txfm(input + 8 * i, buf0, stride, 13);
2282 shift_right_2_round_s16_x8(buf0, buf0, 8);
2283 transpose_arrays_s16_8x8(buf0, buf1 + 8 * i);
2284 }
2285
2286 if (lr_flip) {
2287 flip_buf_8_neon(buf1, buf0, 16);
2288 row_txfm(buf0, output, 8, 13);
2289 } else {
2290 row_txfm(buf1, output, 8, 13);
2291 }
2292 }
2293
2294 static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
2295 int stride, TX_TYPE tx_type, int bd) {
2296 (void)bd;
2297 int16x8_t buf0[16], buf1[32];
2298 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
2299 const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
2300 int ud_flip, lr_flip;
2301
2302 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2303 ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
2304 for (int i = 0; i < 2; i++) {
2305 col_txfm(input + 8 * i, buf0, stride, 13);
2306 shift_right_2_round_s16_x8(buf0, buf0, 16);
2307 transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i);
2308 transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i);
2309 }
2310
2311 for (int i = 0; i < 2; i++) {
2312 if (lr_flip) {
2313 flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
2314 row_txfm(buf0, output + 8 * i, 16, 12);
2315 } else {
2316 int16x8_t *buf = buf1 + 16 * i;
2317 row_txfm(buf, output + 8 * i, 16, 12);
2318 }
2319 }
2320 }
2321
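// Sizes with a 32-point dimension implement only the DCT and identity
// kernels; for the remaining TX_TYPEs (NULL table entries) these wrappers
// fall back to the C implementation.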
2322 static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
2323 int stride, TX_TYPE tx_type, int bd) {
2324 (void)bd;
2325 int16x8_t buf0[32], buf1[64];
2326 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
2327 const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
2328
2329 if (col_txfm == NULL || row_txfm == NULL) {
2330 av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
2331 return;
2332 }
2333
2334 int ud_flip, lr_flip;
2335 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2336 ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
2337 for (int i = 0; i < 2; i++) {
2338 col_txfm(input + 8 * i, buf0, stride, 12);
2339 shift_right_4_round_s16_x8(buf0, buf0, 32);
2340 transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i);
2341 transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i);
2342 transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i);
2343 transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 16 + 8 * i);
2344 }
2345
2346 for (int i = 0; i < 4; i++) {
2347 if (lr_flip) {
2348 flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
2349 row_txfm(buf0, output + 8 * i, 32, 13);
2350 } else {
2351 int16x8_t *buf = buf1 + 16 * i;
2352 row_txfm(buf, output + 8 * i, 32, 13);
2353 }
2354 }
2355 }
2356
2357 static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
2358 int stride, TX_TYPE tx_type, int bd) {
2359 (void)bd;
2360 int16x8_t buf0[32], buf1[32];
2361 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
2362 const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
2363
2364 if (col_txfm == NULL || row_txfm == NULL) {
2365 av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
2366 return;
2367 }
2368
2369 int ud_flip, lr_flip;
2370 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2371 ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
2372 for (int i = 0; i < 4; i++) {
2373 col_txfm(input + 8 * i, buf0, stride, 13);
2374 shift_right_2_round_s16_x8(buf0, buf0, 8);
2375 transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
2376 }
2377
2378 if (lr_flip) {
2379 flip_buf_8_neon(buf1, buf0, 32);
2380 row_txfm(buf0, output, 8, 12);
2381 } else {
2382 row_txfm(buf1, output, 8, 12);
2383 }
2384 }
2385
2386 static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
2387 int stride, TX_TYPE tx_type, int bd) {
2388 (void)bd;
2389 int16x8_t buf0[32], buf1[64];
2390 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
2391 const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];
2392
2393 if (col_txfm == NULL || row_txfm == NULL) {
2394 av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
2395 return;
2396 }
2397
2398 int ud_flip, lr_flip;
2399 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2400 ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
2401 for (int i = 0; i < 4; i++) {
2402 col_txfm(input + 8 * i, buf0, stride, 13);
2403 shift_right_4_round_s16_x8(buf0, buf0, 16);
2404 transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
2405 transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i);
2406 }
2407
2408 for (int i = 0; i < 2; i++) {
2409 if (lr_flip) {
2410 flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
2411 row_txfm(buf0, output + 8 * i, 16, 13);
2412 } else {
2413 int16x8_t *buf = buf1 + 32 * i;
2414 row_txfm(buf, output + 8 * i, 16, 13);
2415 }
2416 }
2417 }
2418
2419 static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
2420 int stride, TX_TYPE tx_type, int bd) {
2421 (void)bd;
2422 int16x8_t buf0[32], buf1[128];
2423 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
2424 const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
2425
2426 if (col_txfm == NULL || row_txfm == NULL) {
2427 av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
2428 return;
2429 }
2430
2431 int ud_flip, lr_flip;
2432 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2433 ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
2434 for (int i = 0; i < 4; i++) {
2435 col_txfm(input + 8 * i, buf0, stride, 12);
2436 shift_right_4_round_s16_x8(buf0, buf0, 32);
2437 transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i);
2438 transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i);
2439 transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i);
2440 transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i);
2441 }
2442
2443 for (int i = 0; i < 4; i++) {
2444 if (lr_flip) {
2445 flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
2446 row_txfm(buf0, output + 8 * i, 32, 12);
2447 } else {
2448 int16x8_t *buf = buf1 + 32 * i;
2449 row_txfm(buf, output + 8 * i, 32, 12);
2450 }
2451 }
2452 }
2453
2454 static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
2455 int stride, TX_TYPE tx_type, int bd) {
2456 (void)bd;
2457 (void)tx_type;
2458 assert(tx_type == DCT_DCT);
2459 int16x8_t buf0[64], buf1[128];
2460 const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon;
2461 const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon;
2462
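// Column-transform the 64 columns in eight 8-wide tiles (16-point DCT),
// then apply the 64-point row transform. Only the lowest 32 of its 64
// outputs are stored: AV1 zeroes coefficients at index 32 and above in
// either dimension of a 64-point transform.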
2463 for (int i = 0; i < 8; i++) {
2464 load_buffer_s16_x8(input + 8 * i, stride, buf0, 16);
2465 shift_left_2_s16_x8(buf0, buf0, 16);
2466 col_txfm(buf0, buf0, 13);
2467 shift_right_4_round_s16_x8(buf0, buf0, 16);
2468 for (int j = 0; j < 2; ++j) {
2469 transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
2470 }
2471 }
2472
2473 for (int i = 0; i < 2; i++) {
2474 int16x8_t *buf = buf1 + 64 * i;
2475 row_txfm(buf, buf, 12);
2476 store_buffer_s16_x8(buf, output + 8 * i, 16, 32);
2477 }
2478 // Zero out the bottom 16x32 area.
2479 memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
2480 }
2481
2482 static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
2483 int stride, TX_TYPE tx_type, int bd) {
2484 (void)bd;
2485 (void)tx_type;
2486 assert(tx_type == DCT_DCT);
2487 int16x8_t buf0[64], buf1[128];
2488 const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
2489 const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon;
2490
2491 for (int i = 0; i < 2; i++) {
2492 load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
2493 col_txfm(buf0, buf0, 13);
2494 shift_right_2_round_s16_x8(buf0, buf0, 64);
2495 for (int j = 0; j < 8; ++j) {
2496 transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i);
2497 }
2498 }
2499
2500 for (int i = 0; i < 4; i++) {
2501 int16x8_t *buf = buf1 + 16 * i;
2502 row_txfm(buf, buf, 12);
2503 store_buffer_s16_x8(buf, output + 8 * i, 32, 16);
2504 }
2505 }
2506
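// 32-point forward DCT operating on int32x4 lanes. Stages 1-8 form the
// butterfly recursion; stage 9 is a pure permutation that maps the
// bit-reversed butterfly outputs back into natural frequency order.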
2507 static void fdct32_neon(const int32x4_t *input, int32x4_t *output,
2508 int cos_bit) {
2509 const int16_t *cospi = cospi_arr_q13(cos_bit);
2510
2511 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
2512 const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
2513 const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
2514 const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
2515 const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
2516 const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
2517 const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
2518 const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
2519
2520 const int16x4_t cospi32 = vget_low_s16(cospi32_16);
2521 const int16x4_t cospi16 = vget_high_s16(cospi32_16);
2522 const int16x4_t cospi8 = vget_low_s16(cospi8_24);
2523 const int16x4_t cospi24 = vget_high_s16(cospi8_24);
2524 const int16x4_t cospi4 = vget_low_s16(cospi4_12);
2525 const int16x4_t cospi12 = vget_high_s16(cospi4_12);
2526 const int16x4_t cospi20 = vget_low_s16(cospi20_28);
2527 const int16x4_t cospi28 = vget_high_s16(cospi20_28);
2528 const int16x4_t cospi2 = vget_low_s16(cospi2_6);
2529 const int16x4_t cospi6 = vget_high_s16(cospi2_6);
2530 const int16x4_t cospi10 = vget_low_s16(cospi10_14);
2531 const int16x4_t cospi14 = vget_high_s16(cospi10_14);
2532 const int16x4_t cospi18 = vget_low_s16(cospi18_22);
2533 const int16x4_t cospi22 = vget_high_s16(cospi18_22);
2534 const int16x4_t cospi26 = vget_low_s16(cospi26_30);
2535 const int16x4_t cospi30 = vget_high_s16(cospi26_30);
2536
2537 int32x4_t buf0[32];
2538 int32x4_t buf1[32];
2539
2540 // stage 1
2541 butterfly_dct_pre_s32_x4(input, buf1, 32);
2542
2543 // stage 2
2544 butterfly_dct_pre_s32_x4(buf1, buf0, 16);
2545 buf0[16] = buf1[16];
2546 buf0[17] = buf1[17];
2547 buf0[18] = buf1[18];
2548 buf0[19] = buf1[19];
2549 butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27],
2550 &buf0[20]);
2551 butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26],
2552 &buf0[21]);
2553 butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25],
2554 &buf0[22]);
2555 butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24],
2556 &buf0[23]);
2557 buf0[28] = buf1[28];
2558 buf0[29] = buf1[29];
2559 buf0[30] = buf1[30];
2560 buf0[31] = buf1[31];
2561
2562 // stage 3
2563 butterfly_dct_pre_s32_x4(buf0, buf1, 8);
2564 buf1[8] = buf0[8];
2565 buf1[9] = buf0[9];
2566 butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13],
2567 &buf1[10]);
2568 butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12],
2569 &buf1[11]);
2570 buf1[14] = buf0[14];
2571 buf1[15] = buf0[15];
2572 butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16);
2573
2574 // stage 4
2575 butterfly_dct_pre_s32_x4(buf1, buf0, 4);
2576 buf0[4] = buf1[4];
2577 butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]);
2578 buf0[7] = buf1[7];
2579 butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8);
2580 buf0[16] = buf1[16];
2581 buf0[17] = buf1[17];
2582 butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29],
2583 &buf0[18]);
2584 butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28],
2585 &buf0[19]);
2586 butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27],
2587 &buf0[20]);
2588 butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26],
2589 &buf0[21]);
2590 buf0[22] = buf1[22];
2591 buf0[23] = buf1[23];
2592 buf0[24] = buf1[24];
2593 buf0[25] = buf1[25];
2594 buf0[30] = buf1[30];
2595 buf0[31] = buf1[31];
2596
2597 // stage 5
2598 butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]);
2599 butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]);
2600 butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4);
2601 buf1[8] = buf0[8];
2602 butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14],
2603 &buf1[9]);
2604 butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13],
2605 &buf1[10]);
2606 buf1[11] = buf0[11];
2607 buf1[12] = buf0[12];
2608 buf1[15] = buf0[15];
2609 butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8);
2610 butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8);
2611
2612 // stage 6
2613 buf0[0] = buf1[0];
2614 buf0[1] = buf1[1];
2615 buf0[2] = buf1[2];
2616 buf0[3] = buf1[3];
2617 butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]);
2618 butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]);
2619 butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4);
2620 butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4);
2621 buf0[16] = buf1[16];
2622 butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30],
2623 &buf0[17]);
2624 butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29],
2625 &buf0[18]);
2626 buf0[19] = buf1[19];
2627 buf0[20] = buf1[20];
2628 butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26],
2629 &buf0[21]);
2630 butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25],
2631 &buf0[22]);
2632 buf0[23] = buf1[23];
2633 buf0[24] = buf1[24];
2634 buf0[27] = buf1[27];
2635 buf0[28] = buf1[28];
2636 buf0[31] = buf1[31];
2637
2638 // stage 7
2639 buf1[0] = buf0[0];
2640 buf1[1] = buf0[1];
2641 buf1[2] = buf0[2];
2642 buf1[3] = buf0[3];
2643 buf1[4] = buf0[4];
2644 buf1[5] = buf0[5];
2645 buf1[6] = buf0[6];
2646 buf1[7] = buf0[7];
2647 butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8],
2648 &buf1[15]);
2649 butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9],
2650 &buf1[14]);
2651 butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10],
2652 &buf1[13]);
2653 butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11],
2654 &buf1[12]);
2655 butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4);
2656 butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4);
2657 butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4);
2658 butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 28, 4);
2659
2660 // stage 8
2661 buf0[0] = buf1[0];
2662 buf0[1] = buf1[1];
2663 buf0[2] = buf1[2];
2664 buf0[3] = buf1[3];
2665 buf0[4] = buf1[4];
2666 buf0[5] = buf1[5];
2667 buf0[6] = buf1[6];
2668 buf0[7] = buf1[7];
2669 buf0[8] = buf1[8];
2670 buf0[9] = buf1[9];
2671 buf0[10] = buf1[10];
2672 buf0[11] = buf1[11];
2673 buf0[12] = buf1[12];
2674 buf0[13] = buf1[13];
2675 buf0[14] = buf1[14];
2676 buf0[15] = buf1[15];
2677 butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16],
2678 &buf0[31]);
2679 butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17],
2680 &buf0[30]);
2681 butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18],
2682 &buf0[29]);
2683 butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19],
2684 &buf0[28]);
2685 butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20],
2686 &buf0[27]);
2687 butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21],
2688 &buf0[26]);
2689 butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22],
2690 &buf0[25]);
2691 butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23],
2692 &buf0[24]);
2693
2694 // stage 9
2695 output[0] = buf0[0];
2696 output[1] = buf0[16];
2697 output[2] = buf0[8];
2698 output[3] = buf0[24];
2699 output[4] = buf0[4];
2700 output[5] = buf0[20];
2701 output[6] = buf0[12];
2702 output[7] = buf0[28];
2703 output[8] = buf0[2];
2704 output[9] = buf0[18];
2705 output[10] = buf0[10];
2706 output[11] = buf0[26];
2707 output[12] = buf0[6];
2708 output[13] = buf0[22];
2709 output[14] = buf0[14];
2710 output[15] = buf0[30];
2711 output[16] = buf0[1];
2712 output[17] = buf0[17];
2713 output[18] = buf0[9];
2714 output[19] = buf0[25];
2715 output[20] = buf0[5];
2716 output[21] = buf0[21];
2717 output[22] = buf0[13];
2718 output[23] = buf0[29];
2719 output[24] = buf0[3];
2720 output[25] = buf0[19];
2721 output[26] = buf0[11];
2722 output[27] = buf0[27];
2723 output[28] = buf0[7];
2724 output[29] = buf0[23];
2725 output[30] = buf0[15];
2726 output[31] = buf0[31];
2727 }
2728
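// 64-point forward DCT on int32x4 lanes. This needs the full odd-index
// cosine table, so cospi1 .. cospi31 are loaded in addition to the even
// weights used by the smaller transforms.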
2729 static void fdct64_neon(const int32x4_t *input, int32x4_t *output,
2730 int cos_bit) {
2731 const int16_t *cospi = cospi_arr_q13(cos_bit);
2732
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
  const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
  const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
  const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
  const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
  const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
  const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
  const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
  const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
  const int16x4_t cospi1 = vget_low_s16(cospi1_3);
  const int16x4_t cospi3 = vget_high_s16(cospi1_3);
  const int16x4_t cospi5 = vget_low_s16(cospi5_7);
  const int16x4_t cospi7 = vget_high_s16(cospi5_7);
  const int16x4_t cospi9 = vget_low_s16(cospi9_11);
  const int16x4_t cospi11 = vget_high_s16(cospi9_11);
  const int16x4_t cospi13 = vget_low_s16(cospi13_15);
  const int16x4_t cospi15 = vget_high_s16(cospi13_15);
  const int16x4_t cospi17 = vget_low_s16(cospi17_19);
  const int16x4_t cospi19 = vget_high_s16(cospi17_19);
  const int16x4_t cospi21 = vget_low_s16(cospi21_23);
  const int16x4_t cospi23 = vget_high_s16(cospi21_23);
  const int16x4_t cospi25 = vget_low_s16(cospi25_27);
  const int16x4_t cospi27 = vget_high_s16(cospi25_27);
  const int16x4_t cospi29 = vget_low_s16(cospi29_31);
  const int16x4_t cospi31 = vget_high_s16(cospi29_31);

  // stage 1
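  // butterfly_dct_pre folds the 64 inputs in half with add/sub pairs: sums
  // land in x1[0..31], differences in x1[32..63].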
  int32x4_t x1[64];
  butterfly_dct_pre_s32_x4(input, x1, 64);

  // stage 2
  int32x4_t x2[64];
  butterfly_dct_pre_s32_x4(x1, x2, 32);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);

  // stage 3
  int32x4_t x3[64];
  butterfly_dct_pre_s32_x4(x2, x3, 16);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
  butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32);

  // stage 4
  int32x4_t x4[64];
  butterfly_dct_pre_s32_x4(x3, x4, 8);
  butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
  butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
  butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);

  // stage 5
  int32x4_t x5[64];
  butterfly_dct_pre_s32_x4(x4, x5, 4);
  butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
  butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8);
  butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
  butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16);
  butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16);

  // stage 6
  int32x4_t x6[64];
  butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]);
  butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
  butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4);
  butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
  butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
  butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8);
  butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8);
  butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
  butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
  butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
  butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
  butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
  butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);

  // stage 7
  int32x4_t x7[64];
  butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
  butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4);
  butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4);
  butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
  butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
  butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
  butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
  butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8);
  butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8);
  butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8);
  butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8);

  // stage 8
  int32x4_t x8[64];
  butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
  butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
  butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
  butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
  butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4);
  butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4);
  butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4);
  butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4);
  butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
  butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
  butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
  butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
  butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
  butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
  butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
  butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);

  // stage 9
  int32x4_t x9[64];
  butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
  butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
  butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
  butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
  butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
  butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
  butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
  butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
  butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4);
  butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4);
  butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4);
  butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4);
  butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4);
  butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4);
  butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4);
  butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4);

  // stage 10
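  // Stage 10 finishes the odd-frequency terms: every odd-indexed output in
  // stage 11 below reads from x10.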
  int32x4_t x10[64];
  butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]);
  butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]);
  butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]);
  butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]);
  butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]);
  butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]);
  butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]);
  butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]);
  butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]);
  butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]);
  butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]);
  butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]);
  butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]);
  butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]);
  butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]);
  butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]);

  // stage 11, only store into the low 32 output indices.
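  // output[k] = x[bitrev6(k)]: the 6-bit bit-reversal permutation gathers
  // each surviving low-frequency coefficient from the stage array in which it
  // was finalized (x6 through x10).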
  output[0] = x6[0];
  output[1] = x10[32];
  output[2] = x9[16];
  output[3] = x10[48];
  output[4] = x8[8];
  output[5] = x10[40];
  output[6] = x9[24];
  output[7] = x10[56];
  output[8] = x7[4];
  output[9] = x10[36];
  output[10] = x9[20];
  output[11] = x10[52];
  output[12] = x8[12];
  output[13] = x10[44];
  output[14] = x9[28];
  output[15] = x10[60];
  output[16] = x6[2];
  output[17] = x10[34];
  output[18] = x9[18];
  output[19] = x10[50];
  output[20] = x8[10];
  output[21] = x10[42];
  output[22] = x9[26];
  output[23] = x10[58];
  output[24] = x7[6];
  output[25] = x10[38];
  output[26] = x9[22];
  output[27] = x10[54];
  output[28] = x8[14];
  output[29] = x10[46];
  output[30] = x9[30];
  output[31] = x10[62];
}

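// 2D 64x64 forward transform, DCT_DCT only: a 16-bit column pass
// (fdct8x64_neon), a transpose, then a 32-bit row pass (fdct64_neon), each
// followed by a 2-bit rounding shift. Only the top-left 32x32 quadrant of
// coefficients is stored; AV1 defines the remainder to be zero.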
static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  int16x8_t buf0[64], buf1[512];
  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;

  for (int i = 0; i < 8; i++) {
    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
    col_txfm(buf0, buf0, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 64);
    for (int j = 0; j < 4; ++j) {
      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
    }
  }
  for (int i = 0; i < 4; i++) {
    int32x4_t bufA[64];
    int32x4_t bufB[64];
    int16x8_t *buf = buf1 + 64 * i;
    for (int j = 0; j < 64; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    fdct64_neon(bufA, bufA, 10);
    fdct64_neon(bufB, bufB, 10);
    shift_right_2_round_s32_x4(bufA, bufA, 32);
    shift_right_2_round_s32_x4(bufB, bufB, 32);
    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
  }
}

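// 2D 64x32 forward transform. The 2:1 rectangular sizes fold in an extra
// sqrt(2) scale (round_shift_sqrt2) so that the overall transform gain
// remains a power of two, matching the square block sizes.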
static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[64], buf1[256];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];

  for (int i = 0; i < 8; i++) {
    col_txfm(input + 8 * i, buf0, stride, 12);
    shift_right_4_round_s16_x8(buf0, buf0, 32);
    for (int j = 0; j < 4; ++j) {
      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
    }
  }
  assert(tx_type == DCT_DCT);
  for (int i = 0; i < 4; i++) {
    int32x4_t bufA[64];
    int32x4_t bufB[64];
    int16x8_t *buf = buf1 + 64 * i;
    for (int j = 0; j < 64; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    fdct64_neon(bufA, bufA, 11);
    fdct64_neon(bufB, bufB, 11);
    shift_right_2_round_s32_x4(bufA, bufA, 32);
    shift_right_2_round_s32_x4(bufB, bufB, 32);
    round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
    round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
  }
}

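// 2D 32x64 forward transform: 64-point columns in 16-bit arithmetic, then
// 32-point rows in 32-bit arithmetic, with the same sqrt(2)
// rectangular-scale correction as the 64x32 case.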
static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  int16x8_t buf0[64], buf1[256];
  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;

  for (int i = 0; i < 4; i++) {
    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
    col_txfm(buf0, buf0, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 64);
    for (int j = 0; j < 4; ++j) {
      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i);
    }
  }

  for (int i = 0; i < 4; i++) {
    int32x4_t bufA[32];
    int32x4_t bufB[32];
    int16x8_t *buf = buf1 + 32 * i;
    for (int j = 0; j < 32; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    fdct32_neon(bufA, bufA, 11);
    fdct32_neon(bufB, bufB, 11);
    shift_right_2_round_s32_x4(bufA, bufA, 32);
    shift_right_2_round_s32_x4(bufB, bufB, 32);
    round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
    round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
  }
}

static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
  lowbd_fwd_txfm2d_4x4_neon,    // 4x4 transform
  lowbd_fwd_txfm2d_8x8_neon,    // 8x8 transform
  lowbd_fwd_txfm2d_16x16_neon,  // 16x16 transform
  lowbd_fwd_txfm2d_32x32_neon,  // 32x32 transform
  lowbd_fwd_txfm2d_64x64_neon,  // 64x64 transform
  lowbd_fwd_txfm2d_4x8_neon,    // 4x8 transform
  lowbd_fwd_txfm2d_8x4_neon,    // 8x4 transform
  lowbd_fwd_txfm2d_8x16_neon,   // 8x16 transform
  lowbd_fwd_txfm2d_16x8_neon,   // 16x8 transform
  lowbd_fwd_txfm2d_16x32_neon,  // 16x32 transform
  lowbd_fwd_txfm2d_32x16_neon,  // 32x16 transform
  lowbd_fwd_txfm2d_32x64_neon,  // 32x64 transform
  lowbd_fwd_txfm2d_64x32_neon,  // 64x32 transform
  lowbd_fwd_txfm2d_4x16_neon,   // 4x16 transform
  lowbd_fwd_txfm2d_16x4_neon,   // 16x4 transform
  lowbd_fwd_txfm2d_8x32_neon,   // 8x32 transform
  lowbd_fwd_txfm2d_32x8_neon,   // 32x8 transform
  lowbd_fwd_txfm2d_16x64_neon,  // 16x64 transform
  lowbd_fwd_txfm2d_64x16_neon,  // 64x16 transform
};

void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
                             int diff_stride, TxfmParam *txfm_param) {
  FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size];
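  // Lossless coding replaces the 4x4 DCT with a Walsh-Hadamard transform,
  // which has no NEON path here, so fall back to the C implementation.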
  if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
  } else {
    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
                    txfm_param->bd);
  }
}