/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_

#include <arm_neon.h>

#include "fdct_neon.h"

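// Load 16 rows of 8 int16 coefficients from a, advancing by stride elements
// between rows.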
static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
  b[0] = vld1q_s16(a);
  a += stride;
  b[1] = vld1q_s16(a);
  a += stride;
  b[2] = vld1q_s16(a);
  a += stride;
  b[3] = vld1q_s16(a);
  a += stride;
  b[4] = vld1q_s16(a);
  a += stride;
  b[5] = vld1q_s16(a);
  a += stride;
  b[6] = vld1q_s16(a);
  a += stride;
  b[7] = vld1q_s16(a);
  a += stride;
  b[8] = vld1q_s16(a);
  a += stride;
  b[9] = vld1q_s16(a);
  a += stride;
  b[10] = vld1q_s16(a);
  a += stride;
  b[11] = vld1q_s16(a);
  a += stride;
  b[12] = vld1q_s16(a);
  a += stride;
  b[13] = vld1q_s16(a);
  a += stride;
  b[14] = vld1q_s16(a);
  a += stride;
  b[15] = vld1q_s16(a);
}

// Store 8 16x8 values, assuming stride == 16.
static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
  store_s16q_to_tran_low(a, b[0]);
  a += 16;
  store_s16q_to_tran_low(a, b[1]);
  a += 16;
  store_s16q_to_tran_low(a, b[2]);
  a += 16;
  store_s16q_to_tran_low(a, b[3]);
  a += 16;
  store_s16q_to_tran_low(a, b[4]);
  a += 16;
  store_s16q_to_tran_low(a, b[5]);
  a += 16;
  store_s16q_to_tran_low(a, b[6]);
  a += 16;
  store_s16q_to_tran_low(a, b[7]);
}

// Input stage of each pass. The add and subtract steps reach across the whole
// input, so all 16 values must be loaded before they can run. The first pass
// also multiplies the input by 4.

// To reduce register usage this could be combined with the load() step: load
// the first 4 and last 4 values, cross those, then load the middle 8 values
// and cross them.
static INLINE void scale_input(const int16x8_t *a /*[16]*/,
                               int16x8_t *b /*[16]*/) {
  b[0] = vshlq_n_s16(a[0], 2);
  b[1] = vshlq_n_s16(a[1], 2);
  b[2] = vshlq_n_s16(a[2], 2);
  b[3] = vshlq_n_s16(a[3], 2);
  b[4] = vshlq_n_s16(a[4], 2);
  b[5] = vshlq_n_s16(a[5], 2);
  b[6] = vshlq_n_s16(a[6], 2);
  b[7] = vshlq_n_s16(a[7], 2);

  b[8] = vshlq_n_s16(a[8], 2);
  b[9] = vshlq_n_s16(a[9], 2);
  b[10] = vshlq_n_s16(a[10], 2);
  b[11] = vshlq_n_s16(a[11], 2);
  b[12] = vshlq_n_s16(a[12], 2);
  b[13] = vshlq_n_s16(a[13], 2);
  b[14] = vshlq_n_s16(a[14], 2);
  b[15] = vshlq_n_s16(a[15], 2);
}

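// First-stage butterfly across the 16 rows: for i in [0, 7],
// b[i] = a[i] + a[15 - i] and b[15 - i] = a[i] - a[15 - i].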
static INLINE void cross_input(const int16x8_t *a /*[16]*/,
                               int16x8_t *b /*[16]*/) {
  b[0] = vaddq_s16(a[0], a[15]);
  b[1] = vaddq_s16(a[1], a[14]);
  b[2] = vaddq_s16(a[2], a[13]);
  b[3] = vaddq_s16(a[3], a[12]);
  b[4] = vaddq_s16(a[4], a[11]);
  b[5] = vaddq_s16(a[5], a[10]);
  b[6] = vaddq_s16(a[6], a[9]);
  b[7] = vaddq_s16(a[7], a[8]);

  b[8] = vsubq_s16(a[7], a[8]);
  b[9] = vsubq_s16(a[6], a[9]);
  b[10] = vsubq_s16(a[5], a[10]);
  b[11] = vsubq_s16(a[4], a[11]);
  b[12] = vsubq_s16(a[3], a[12]);
  b[13] = vsubq_s16(a[2], a[13]);
  b[14] = vsubq_s16(a[1], a[14]);
  b[15] = vsubq_s16(a[0], a[15]);
}

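// Combined load + first-stage butterfly: reads all 16 rows from a and applies
// the same add/subtract crossing as cross_input() in a single step.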
static INLINE void load_cross(const int16_t *a, int stride,
                              int16x8_t *b /*[16]*/) {
  b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
  b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
  b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
  b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
  b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
  b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
  b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
  b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));

  b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
  b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
  b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
  b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
  b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
  b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
  b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
  b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
}

// Quarter round at the beginning of the second pass: (x + 1) >> 2. Can't use
// vrshr (rounding) because vrshrq_n_s16(x, 2) computes (x + 2) >> 2, while
// this stage must add only 1 before shifting.
static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
  const int16x8_t one = vdupq_n_s16(1);
  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
}

#if CONFIG_VP9_HIGHBITDEPTH

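// High-bitdepth variant of scale_input(): widens each 16-bit row into two
// int32x4 halves (low lanes in left[], high lanes in right[]) while
// multiplying by 4.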
static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/,
                                      int32x4_t *left /*[16]*/,
                                      int32x4_t *right /*[16]*/) {
  left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
  left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
  left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
  left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
  left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
  left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
  left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
  left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
  left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
  left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
  left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
  left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
  left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
  left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
  left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
  left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);

  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
  right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
}

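// High-bitdepth variant of cross_input(): applies the first-stage butterfly
// independently to the widened left and right halves.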
static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/,
                                      int32x4_t *a_right /*[16]*/,
                                      int32x4_t *b_left /*[16]*/,
                                      int32x4_t *b_right /*[16]*/) {
  b_left[0] = vaddq_s32(a_left[0], a_left[15]);
  b_left[1] = vaddq_s32(a_left[1], a_left[14]);
  b_left[2] = vaddq_s32(a_left[2], a_left[13]);
  b_left[3] = vaddq_s32(a_left[3], a_left[12]);
  b_left[4] = vaddq_s32(a_left[4], a_left[11]);
  b_left[5] = vaddq_s32(a_left[5], a_left[10]);
  b_left[6] = vaddq_s32(a_left[6], a_left[9]);
  b_left[7] = vaddq_s32(a_left[7], a_left[8]);

  b_right[0] = vaddq_s32(a_right[0], a_right[15]);
  b_right[1] = vaddq_s32(a_right[1], a_right[14]);
  b_right[2] = vaddq_s32(a_right[2], a_right[13]);
  b_right[3] = vaddq_s32(a_right[3], a_right[12]);
  b_right[4] = vaddq_s32(a_right[4], a_right[11]);
  b_right[5] = vaddq_s32(a_right[5], a_right[10]);
  b_right[6] = vaddq_s32(a_right[6], a_right[9]);
  b_right[7] = vaddq_s32(a_right[7], a_right[8]);

  b_left[8] = vsubq_s32(a_left[7], a_left[8]);
  b_left[9] = vsubq_s32(a_left[6], a_left[9]);
  b_left[10] = vsubq_s32(a_left[5], a_left[10]);
  b_left[11] = vsubq_s32(a_left[4], a_left[11]);
  b_left[12] = vsubq_s32(a_left[3], a_left[12]);
  b_left[13] = vsubq_s32(a_left[2], a_left[13]);
  b_left[14] = vsubq_s32(a_left[1], a_left[14]);
  b_left[15] = vsubq_s32(a_left[0], a_left[15]);

  b_right[8] = vsubq_s32(a_right[7], a_right[8]);
  b_right[9] = vsubq_s32(a_right[6], a_right[9]);
  b_right[10] = vsubq_s32(a_right[5], a_right[10]);
  b_right[11] = vsubq_s32(a_right[4], a_right[11]);
  b_right[12] = vsubq_s32(a_right[3], a_right[12]);
  b_right[13] = vsubq_s32(a_right[2], a_right[13]);
  b_right[14] = vsubq_s32(a_right[1], a_right[14]);
  b_right[15] = vsubq_s32(a_right[0], a_right[15]);
}

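// High-bitdepth counterpart of partial_round_shift(): computes (x + 1) >> 2
// on each 32-bit lane of both halves.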
static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/,
                                              int32x4_t *right /*[16]*/) {
  const int32x4_t one = vdupq_n_s32(1);
  left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2);
  left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2);
  left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2);
  left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2);
  left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2);
  left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2);
  left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2);
  left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2);
  left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2);
  left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2);
  left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2);
  left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2);
  left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2);
  left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2);
  left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2);
  left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2);

  right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
  right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
  right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
  right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
  right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
  right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
  right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
  right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
  right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
  right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
  right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
  right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
  right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
  right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
  right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
  right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
}

// Store 16 32x4 vectors, assuming stride == 16.
static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) {
  vst1q_s32(a, b[0]);
  a += 16;
  vst1q_s32(a, b[1]);
  a += 16;
  vst1q_s32(a, b[2]);
  a += 16;
  vst1q_s32(a, b[3]);
  a += 16;
  vst1q_s32(a, b[4]);
  a += 16;
  vst1q_s32(a, b[5]);
  a += 16;
  vst1q_s32(a, b[6]);
  a += 16;
  vst1q_s32(a, b[7]);
  a += 16;
  vst1q_s32(a, b[8]);
  a += 16;
  vst1q_s32(a, b[9]);
  a += 16;
  vst1q_s32(a, b[10]);
  a += 16;
  vst1q_s32(a, b[11]);
  a += 16;
  vst1q_s32(a, b[12]);
  a += 16;
  vst1q_s32(a, b[13]);
  a += 16;
  vst1q_s32(a, b[14]);
  a += 16;
  vst1q_s32(a, b[15]);
}

#endif  // CONFIG_VP9_HIGHBITDEPTH

#endif  // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_