/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_

#include <arm_neon.h>

#include "fdct_neon.h"

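// Load 16 rows of 8 16-bit values from a, advancing by stride between rows.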
static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
  b[0] = vld1q_s16(a);
  a += stride;
  b[1] = vld1q_s16(a);
  a += stride;
  b[2] = vld1q_s16(a);
  a += stride;
  b[3] = vld1q_s16(a);
  a += stride;
  b[4] = vld1q_s16(a);
  a += stride;
  b[5] = vld1q_s16(a);
  a += stride;
  b[6] = vld1q_s16(a);
  a += stride;
  b[7] = vld1q_s16(a);
  a += stride;
  b[8] = vld1q_s16(a);
  a += stride;
  b[9] = vld1q_s16(a);
  a += stride;
  b[10] = vld1q_s16(a);
  a += stride;
  b[11] = vld1q_s16(a);
  a += stride;
  b[12] = vld1q_s16(a);
  a += stride;
  b[13] = vld1q_s16(a);
  a += stride;
  b[14] = vld1q_s16(a);
  a += stride;
  b[15] = vld1q_s16(a);
}

// Store 8 16x8 values, assuming stride == 16.
static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
  store_s16q_to_tran_low(a, b[0]);
  a += 16;
  store_s16q_to_tran_low(a, b[1]);
  a += 16;
  store_s16q_to_tran_low(a, b[2]);
  a += 16;
  store_s16q_to_tran_low(a, b[3]);
  a += 16;
  store_s16q_to_tran_low(a, b[4]);
  a += 16;
  store_s16q_to_tran_low(a, b[5]);
  a += 16;
  store_s16q_to_tran_low(a, b[6]);
  a += 16;
  store_s16q_to_tran_low(a, b[7]);
}
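
// A minimal usage sketch (not part of the original header): store() fills an
// 8x8 block of the 16-wide output, so the left and right halves of eight
// transformed rows land at column offsets 0 and 8. The function and parameter
// names below are illustrative only.
static INLINE void store_halves_example(tran_low_t *output,
                                        const int16x8_t *left /*[8]*/,
                                        const int16x8_t *right /*[8]*/) {
  store(output, left);       // rows 0-7, columns 0-7
  store(output + 8, right);  // rows 0-7, columns 8-15
}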

// Load step of each pass. The adds and subtracts pair values from opposite
// ends of the input, so all 16 values must be loaded. For the first pass this
// step also multiplies by 4.

// To reduce register usage this could potentially be combined with the load()
// step: load the first 4 and last 4 rows, cross those, then load the middle 8
// rows and cross them.
static INLINE void scale_input(const int16x8_t *a /*[16]*/,
                               int16x8_t *b /*[16]*/) {
  b[0] = vshlq_n_s16(a[0], 2);
  b[1] = vshlq_n_s16(a[1], 2);
  b[2] = vshlq_n_s16(a[2], 2);
  b[3] = vshlq_n_s16(a[3], 2);
  b[4] = vshlq_n_s16(a[4], 2);
  b[5] = vshlq_n_s16(a[5], 2);
  b[6] = vshlq_n_s16(a[6], 2);
  b[7] = vshlq_n_s16(a[7], 2);

  b[8] = vshlq_n_s16(a[8], 2);
  b[9] = vshlq_n_s16(a[9], 2);
  b[10] = vshlq_n_s16(a[10], 2);
  b[11] = vshlq_n_s16(a[11], 2);
  b[12] = vshlq_n_s16(a[12], 2);
  b[13] = vshlq_n_s16(a[13], 2);
  b[14] = vshlq_n_s16(a[14], 2);
  b[15] = vshlq_n_s16(a[15], 2);
}

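// First butterfly stage of the 16-point transform: b[i] = a[i] + a[15 - i] for
// i = 0..7 and b[i] = a[15 - i] - a[i] for i = 8..15.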
static INLINE void cross_input(const int16x8_t *a /*[16]*/,
                               int16x8_t *b /*[16]*/) {
  b[0] = vaddq_s16(a[0], a[15]);
  b[1] = vaddq_s16(a[1], a[14]);
  b[2] = vaddq_s16(a[2], a[13]);
  b[3] = vaddq_s16(a[3], a[12]);
  b[4] = vaddq_s16(a[4], a[11]);
  b[5] = vaddq_s16(a[5], a[10]);
  b[6] = vaddq_s16(a[6], a[9]);
  b[7] = vaddq_s16(a[7], a[8]);

  b[8] = vsubq_s16(a[7], a[8]);
  b[9] = vsubq_s16(a[6], a[9]);
  b[10] = vsubq_s16(a[5], a[10]);
  b[11] = vsubq_s16(a[4], a[11]);
  b[12] = vsubq_s16(a[3], a[12]);
  b[13] = vsubq_s16(a[2], a[13]);
  b[14] = vsubq_s16(a[1], a[14]);
  b[15] = vsubq_s16(a[0], a[15]);
}

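// Fused version of load() + cross_input(): forms the same sums and differences
// directly from memory, avoiding a 16-vector temporary.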
static INLINE void load_cross(const int16_t *a, int stride,
                              int16x8_t *b /*[16]*/) {
  b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
  b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
  b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
  b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
  b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
  b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
  b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
  b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));

  b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
  b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
  b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
  b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
  b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
  b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
  b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
  b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
}

// Quarter round at the beginning of the second pass. Can't use vrshr (rounding
// shift) because that would add 2 (1 << 1) before shifting right by 2, whereas
// this step adds only 1.
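// For example, a lane value of 6 becomes (6 + 1) >> 2 == 1 here, whereas
// vrshrq_n_s16(x, 2) computes (x + 2) >> 2 and would give 2.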
static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
  const int16x8_t one = vdupq_n_s16(1);
  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
}
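
// Putting the helpers together, one 8-column slice of each pass could be
// sketched roughly as follows (illustrative only; the 16-point butterfly
// itself is not shown in this header):
//   load_cross(input, stride, in);  // in[i] = row i +/- row (15 - i)
//   scale_input(in, out);           // first pass only: multiply by 4
//   // ... 16-point butterfly on out ...
// and, starting from the transposed first-pass results,
//   partial_round_shift(in);        // (x + 1) >> 2
//   cross_input(in, out);
//   // ... 16-point butterfly on out ...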

#if CONFIG_VP9_HIGHBITDEPTH

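// Widen each 8-wide row to 32 bits and multiply by 4 (matching the first-pass
// scaling), splitting it into a low half (left, lanes 0-3) and a high half
// (right, lanes 4-7).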
static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/,
                                      int32x4_t *left /*[16]*/,
                                      int32x4_t *right /*[16]*/) {
  left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
  left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
  left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
  left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
  left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
  left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
  left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
  left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
  left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
  left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
  left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
  left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
  left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
  left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
  left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
  left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);

  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
  right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
}

static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/,
                                      int32x4_t *a_right /*[16]*/,
                                      int32x4_t *b_left /*[16]*/,
                                      int32x4_t *b_right /*[16]*/) {
  b_left[0] = vaddq_s32(a_left[0], a_left[15]);
  b_left[1] = vaddq_s32(a_left[1], a_left[14]);
  b_left[2] = vaddq_s32(a_left[2], a_left[13]);
  b_left[3] = vaddq_s32(a_left[3], a_left[12]);
  b_left[4] = vaddq_s32(a_left[4], a_left[11]);
  b_left[5] = vaddq_s32(a_left[5], a_left[10]);
  b_left[6] = vaddq_s32(a_left[6], a_left[9]);
  b_left[7] = vaddq_s32(a_left[7], a_left[8]);

  b_right[0] = vaddq_s32(a_right[0], a_right[15]);
  b_right[1] = vaddq_s32(a_right[1], a_right[14]);
  b_right[2] = vaddq_s32(a_right[2], a_right[13]);
  b_right[3] = vaddq_s32(a_right[3], a_right[12]);
  b_right[4] = vaddq_s32(a_right[4], a_right[11]);
  b_right[5] = vaddq_s32(a_right[5], a_right[10]);
  b_right[6] = vaddq_s32(a_right[6], a_right[9]);
  b_right[7] = vaddq_s32(a_right[7], a_right[8]);

  b_left[8] = vsubq_s32(a_left[7], a_left[8]);
  b_left[9] = vsubq_s32(a_left[6], a_left[9]);
  b_left[10] = vsubq_s32(a_left[5], a_left[10]);
  b_left[11] = vsubq_s32(a_left[4], a_left[11]);
  b_left[12] = vsubq_s32(a_left[3], a_left[12]);
  b_left[13] = vsubq_s32(a_left[2], a_left[13]);
  b_left[14] = vsubq_s32(a_left[1], a_left[14]);
  b_left[15] = vsubq_s32(a_left[0], a_left[15]);

  b_right[8] = vsubq_s32(a_right[7], a_right[8]);
  b_right[9] = vsubq_s32(a_right[6], a_right[9]);
  b_right[10] = vsubq_s32(a_right[5], a_right[10]);
  b_right[11] = vsubq_s32(a_right[4], a_right[11]);
  b_right[12] = vsubq_s32(a_right[3], a_right[12]);
  b_right[13] = vsubq_s32(a_right[2], a_right[13]);
  b_right[14] = vsubq_s32(a_right[1], a_right[14]);
  b_right[15] = vsubq_s32(a_right[0], a_right[15]);
}

static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/,
                                              int32x4_t *right /*[16]*/) {
  const int32x4_t one = vdupq_n_s32(1);
  left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2);
  left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2);
  left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2);
  left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2);
  left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2);
  left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2);
  left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2);
  left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2);
  left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2);
  left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2);
  left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2);
  left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2);
  left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2);
  left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2);
  left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2);
  left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2);

  right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
  right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
  right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
  right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
  right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
  right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
  right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
  right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
  right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
  right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
  right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
  right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
  right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
  right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
  right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
  right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
}

// Store 16 32x4 vectors, assuming stride == 16.
static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) {
  vst1q_s32(a, b[0]);
  a += 16;
  vst1q_s32(a, b[1]);
  a += 16;
  vst1q_s32(a, b[2]);
  a += 16;
  vst1q_s32(a, b[3]);
  a += 16;
  vst1q_s32(a, b[4]);
  a += 16;
  vst1q_s32(a, b[5]);
  a += 16;
  vst1q_s32(a, b[6]);
  a += 16;
  vst1q_s32(a, b[7]);
  a += 16;
  vst1q_s32(a, b[8]);
  a += 16;
  vst1q_s32(a, b[9]);
  a += 16;
  vst1q_s32(a, b[10]);
  a += 16;
  vst1q_s32(a, b[11]);
  a += 16;
  vst1q_s32(a, b[12]);
  a += 16;
  vst1q_s32(a, b[13]);
  a += 16;
  vst1q_s32(a, b[14]);
  a += 16;
  vst1q_s32(a, b[15]);
}
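
// A similar illustrative sketch for the high bit depth path (names below are
// not part of the original header): store16_s32() covers 16 rows of one
// 4-column strip, so the low/high halves produced by highbd_scale_input()
// land at column offsets 0 and 4 of an 8-column slice.
static INLINE void highbd_store_halves_example(tran_low_t *output,
                                               const int32x4_t *left /*[16]*/,
                                               const int32x4_t *right /*[16]*/) {
  store16_s32(output, left);       // rows 0-15, columns 0-3
  store16_s32(output + 4, right);  // rows 0-15, columns 4-7
}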

#endif // CONFIG_VP9_HIGHBITDEPTH

#endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_