1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/txfm_common.h"
16 #include "vpx_dsp/arm/mem_neon.h"
17 #include "vpx_dsp/arm/transpose_neon.h"
18 #include "vpx_dsp/arm/fdct_neon.h"
19 #include "vpx_dsp/arm/fdct32x32_neon.h"
20
21 // Most gcc 4.9 distributions outside of Android do not generate correct code
22 // for this function.
23 #if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
24 __GNUC__ == 4 && __GNUC_MINOR__ <= 9
25
// Fallback build of vpx_fdct32x32_neon() for the compilers selected by the
// enclosing #if (non-clang, non-Android GCC 4.x up to 4.9), which are known to
// miscompile the NEON version. All arguments are forwarded unchanged to the C
// implementation.
//   input:  source samples, handed straight to vpx_fdct32x32_c().
//   output: destination coefficients, written by vpx_fdct32x32_c().
//   stride: input stride, handed straight to vpx_fdct32x32_c().
void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
  vpx_fdct32x32_c(input, output, stride);
}
29
// Fallback build of vpx_fdct32x32_rd_neon() for the compilers selected by the
// enclosing #if (non-clang, non-Android GCC 4.x up to 4.9), which are known to
// miscompile the NEON version. All arguments are forwarded unchanged to the C
// implementation.
//   input:  source samples, handed straight to vpx_fdct32x32_rd_c().
//   output: destination coefficients, written by vpx_fdct32x32_rd_c().
//   stride: input stride, handed straight to vpx_fdct32x32_rd_c().
void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
                           int stride) {
  vpx_fdct32x32_rd_c(input, output, stride);
}
34
35 #else
36
// NEON forward 32x32 DCT.
//
// The block is handled as four strips of 8 columns. Each strip goes through
// load_cross(), scale_input() and dct_body_first_pass(). The first-pass
// results are then regrouped, 8 vectors at a time, into four bands; each band
// is run through dct_body_second_pass(), transposed back in 8x8 tiles and
// written with store().
//   input:  source samples; strip k starts at input + 8 * k.
//   output: destination coefficients; band k starts at output + 8 * k * 32.
//   stride: passed unchanged to load_cross().
void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
  int16x8_t gathered[32];       // Loaded strip, later the regrouped band.
  int16x8_t first_pass[4][32];  // First-pass output, one row per strip.
  int16x8_t scratch[32];        // Scaled input, later second-pass output.
  int strip, band, tile;

  // First pass: process in 8x32 columns.
  for (strip = 0; strip < 4; ++strip) {
    load_cross(input + 8 * strip, stride, gathered);
    scale_input(gathered, scratch);
    dct_body_first_pass(scratch, first_pass[strip]);
  }

  // Second pass: one 8x32 row band per iteration.
  for (band = 0; band < 4; ++band) {
    // Build the band by munging the matching set of 8 vectors from every
    // strip together.
    for (strip = 0; strip < 4; ++strip) {
      transpose_s16_8x8q(&first_pass[strip][8 * band], &gathered[8 * strip]);
    }

    dct_body_second_pass(gathered, scratch);

    // Transpose each group of 8 result vectors in place before storing.
    for (tile = 0; tile < 32; tile += 8) {
      transpose_s16_8x8(&scratch[tile], &scratch[tile + 1], &scratch[tile + 2],
                        &scratch[tile + 3], &scratch[tile + 4],
                        &scratch[tile + 5], &scratch[tile + 6],
                        &scratch[tile + 7]);
    }
    store(output + 8 * band * 32, scratch);
  }
}
134
// NEON forward 32x32 DCT, "rd" variant.
//
// Same data flow as vpx_fdct32x32_neon(), except that the second pass is done
// by dct_body_second_pass_rd(). The block is handled as four strips of 8
// columns (load_cross(), scale_input(), dct_body_first_pass()); the results
// are regrouped into four bands, run through the second pass, transposed back
// in 8x8 tiles and written with store().
//   input:  source samples; strip k starts at input + 8 * k.
//   output: destination coefficients; band k starts at output + 8 * k * 32.
//   stride: passed unchanged to load_cross().
void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
                           int stride) {
  int16x8_t gathered[32];       // Loaded strip, later the regrouped band.
  int16x8_t first_pass[4][32];  // First-pass output, one row per strip.
  int16x8_t scratch[32];        // Scaled input, later second-pass output.
  int strip, band, tile;

  // First pass: process in 8x32 columns.
  for (strip = 0; strip < 4; ++strip) {
    load_cross(input + 8 * strip, stride, gathered);
    scale_input(gathered, scratch);
    dct_body_first_pass(scratch, first_pass[strip]);
  }

  // Second pass: one 8x32 row band per iteration.
  for (band = 0; band < 4; ++band) {
    // Build the band by munging the matching set of 8 vectors from every
    // strip together.
    for (strip = 0; strip < 4; ++strip) {
      transpose_s16_8x8q(&first_pass[strip][8 * band], &gathered[8 * strip]);
    }

    dct_body_second_pass_rd(gathered, scratch);

    // Transpose each group of 8 result vectors in place before storing.
    for (tile = 0; tile < 32; tile += 8) {
      transpose_s16_8x8(&scratch[tile], &scratch[tile + 1], &scratch[tile + 2],
                        &scratch[tile + 3], &scratch[tile + 4],
                        &scratch[tile + 5], &scratch[tile + 6],
                        &scratch[tile + 7]);
    }
    store(output + 8 * band * 32, scratch);
  }
}
233
234 #if CONFIG_VP9_HIGHBITDEPTH
235
// NEON forward 32x32 DCT for high bit depth builds.
//
// Intermediate values are kept as pairs of int32x4_t arrays ("left" and
// "right"). The block is handled as four strips of 8 columns; each strip goes
// through load_cross(), highbd_scale_input(),
// highbd_dct8x32_body_first_pass() and highbd_partial_sub_round_shift(). The
// results are regrouped into four bands, each of which goes through
// highbd_cross_input(), highbd_dct8x32_body_second_pass() and
// highbd_partial_add_round_shift(). A final transpose puts the bands back into
// the strip buffers, which are written with store32x32_s32().
//   input:  source samples; strip k starts at input + 8 * k.
//   output: destination coefficients, written by store32x32_s32().
//   stride: passed unchanged to load_cross().
void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output,
                               int stride) {
  int16x8_t loaded[32];
  // First-pass results per strip; reused as the destination of the final
  // transpose.
  int32x4_t strip_left[4][32], strip_right[4][32];
  // Second-pass results per band.
  int32x4_t band_left[4][32], band_right[4][32];
  // Regrouped first-pass data for the band currently being processed.
  int32x4_t gather_left[32], gather_right[32];
  int strip, band;

  // First pass: process in 8x32 columns.
  for (strip = 0; strip < 4; ++strip) {
    load_cross(input + 8 * strip, stride, loaded);
    highbd_scale_input(loaded, strip_left[strip], strip_right[strip]);
    highbd_dct8x32_body_first_pass(strip_left[strip], strip_right[strip]);
    highbd_partial_sub_round_shift(strip_left[strip], strip_right[strip]);
  }

  // Second pass: one 8x32 row band per iteration.
  for (band = 0; band < 4; ++band) {
    // Build the band by munging the matching set of 8 vectors from every
    // strip together.
    for (strip = 0; strip < 4; ++strip) {
      transpose_s32_8x8_2(strip_left[strip] + 8 * band,
                          strip_right[strip] + 8 * band,
                          gather_left + 8 * strip, gather_right + 8 * strip);
    }

    highbd_cross_input(gather_left, gather_right, band_left[band],
                       band_right[band]);
    highbd_dct8x32_body_second_pass(band_left[band], band_right[band]);
    highbd_partial_add_round_shift(band_left[band], band_right[band]);
  }

  // Final transpose. Kept as a separate loop because it overwrites the strip
  // buffers, which every band of the second pass still has to read.
  for (band = 0; band < 4; ++band) {
    for (strip = 0; strip < 4; ++strip) {
      transpose_s32_8x8_2(band_left[band] + 8 * strip,
                          band_right[band] + 8 * strip,
                          strip_left[strip] + 8 * band,
                          strip_right[strip] + 8 * band);
    }
  }

  store32x32_s32(output, strip_left[0], strip_right[0], strip_left[1],
                 strip_right[1], strip_left[2], strip_right[2], strip_left[3],
                 strip_right[3]);
}
327
// NEON forward 32x32 DCT for high bit depth builds, "rd" variant.
//
// Intermediate values are kept as pairs of int32x4_t arrays ("left" and
// "right"). The block is handled as four strips of 8 columns; each strip goes
// through load_cross(), highbd_scale_input(),
// highbd_dct8x32_body_first_pass() and highbd_partial_sub_round_shift(). The
// results are regrouped into four bands, each of which goes through
// highbd_cross_input() and highbd_dct8x32_body_second_pass_rd(); unlike
// vpx_highbd_fdct32x32_neon(), no highbd_partial_add_round_shift() call
// follows. A final transpose puts the bands back into the strip buffers,
// which are written with store32x32_s32().
//   input:  source samples; strip k starts at input + 8 * k.
//   output: destination coefficients, written by store32x32_s32().
//   stride: passed unchanged to load_cross().
void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
                                  int stride) {
  int16x8_t loaded[32];
  // First-pass results per strip; reused as the destination of the final
  // transpose.
  int32x4_t strip_left[4][32], strip_right[4][32];
  // Second-pass results per band.
  int32x4_t band_left[4][32], band_right[4][32];
  // Regrouped first-pass data for the band currently being processed.
  int32x4_t gather_left[32], gather_right[32];
  int strip, band;

  // First pass: process in 8x32 columns.
  for (strip = 0; strip < 4; ++strip) {
    load_cross(input + 8 * strip, stride, loaded);
    highbd_scale_input(loaded, strip_left[strip], strip_right[strip]);
    highbd_dct8x32_body_first_pass(strip_left[strip], strip_right[strip]);
    highbd_partial_sub_round_shift(strip_left[strip], strip_right[strip]);
  }

  // Second pass: one 8x32 row band per iteration.
  for (band = 0; band < 4; ++band) {
    // Build the band by munging the matching set of 8 vectors from every
    // strip together.
    for (strip = 0; strip < 4; ++strip) {
      transpose_s32_8x8_2(strip_left[strip] + 8 * band,
                          strip_right[strip] + 8 * band,
                          gather_left + 8 * strip, gather_right + 8 * strip);
    }

    highbd_cross_input(gather_left, gather_right, band_left[band],
                       band_right[band]);
    highbd_dct8x32_body_second_pass_rd(band_left[band], band_right[band]);
  }

  // Final transpose. Kept as a separate loop because it overwrites the strip
  // buffers, which every band of the second pass still has to read.
  for (band = 0; band < 4; ++band) {
    for (strip = 0; strip < 4; ++strip) {
      transpose_s32_8x8_2(band_left[band] + 8 * strip,
                          band_right[band] + 8 * strip,
                          strip_left[strip] + 8 * band,
                          strip_right[strip] + 8 * band);
    }
  }

  store32x32_s32(output, strip_left[0], strip_right[0], strip_left[1],
                 strip_right[1], strip_left[2], strip_right[2], strip_left[3],
                 strip_right[3]);
}
415
416 #endif // CONFIG_VP9_HIGHBITDEPTH
417
418 #endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
419 // __GNUC__ == 4 && __GNUC_MINOR__ <= 9
420