/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/highbd_convolve_neon.h"

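// Number of taps in each of the normative upscaling filters passed in via
// x_filters.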
#define UPSCALE_NORMATIVE_TAPS 8

void av1_highbd_convolve_horiz_rs_neon(const uint16_t *src, int src_stride,
                                       uint16_t *dst, int dst_stride, int w,
                                       int h, const int16_t *x_filters,
                                       int x0_qn, int x_step_qn, int bd) {
  const int horiz_offset = UPSCALE_NORMATIVE_TAPS / 2 - 1;

  static const int32_t kIdx[4] = { 0, 1, 2, 3 };
  const int32x4_t idx = vld1q_s32(kIdx);
  const int32x4_t subpel_mask = vdupq_n_s32(RS_SCALE_SUBPEL_MASK);
  const int32x4_t shift_s32 = vdupq_n_s32(-FILTER_BITS);
  const int32x4_t offset_s32 = vdupq_n_s32(0);
  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);

  const uint16_t *src_ptr = src - horiz_offset;
  uint16_t *dst_ptr = dst;

  if (w <= 4) {
    int height = h;
    uint16_t *d = dst_ptr;

    do {
      int x_qn = x0_qn;

      // Load 4 src vectors at a time; they might be the same, but we have to
      // calculate the indices anyway. Doing it in SIMD and then storing the
      // indices is faster than evaluating the expression
      // &src_ptr[(x_qn + i * x_step_qn) >> RS_SCALE_SUBPEL_BITS] four times.
      // Ideally this would be a gather using the indices, but NEON does not
      // have one, so we have to emulate it.
      const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
      // We have to multiply by 2 to get the actual byte offsets, as
      // sizeof(uint16_t) == 2.
      const int32x4_t src_idx =
          vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
      // Similarly for the filter vector indices, calculate the filter indices
      // for 4 columns. First compute the indices:
      // (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
      // Then compute the actual pointers by multiplying with
      // UPSCALE_NORMATIVE_TAPS and, again, shifting left by 1 to get a byte
      // offset.
      const int32x4_t x_filter4_idx = vshlq_n_s32(
          vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1);
      // Even though pointers are unsigned 32/64-bit ints, we do signed
      // addition. The reason is that x_qn can be negative, leading to negative
      // offsets. Argon test profile0_core/streams/test10573_11003.obu was
      // failing because of this.
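      // For reference, the SIMD pointer computation below is equivalent to the
      // scalar expressions (for i = 0..3):
      //   src4_ptr[i]      = (int16_t *)&src_ptr[(x_qn + i * x_step_qn) >>
      //                                          RS_SCALE_SUBPEL_BITS];
      //   x_filter4_ptr[i] = &x_filters[(((x_qn + i * x_step_qn) &
      //                                   RS_SCALE_SUBPEL_MASK) >>
      //                                  RS_SCALE_EXTRA_BITS) *
      //                                 UPSCALE_NORMATIVE_TAPS];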
#if AOM_ARCH_AARCH64
      uint64x2_t tmp4[2];
      tmp4[0] = vreinterpretq_u64_s64(vaddw_s32(
          vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx)));
      tmp4[1] = vreinterpretq_u64_s64(vaddw_s32(
          vdupq_n_s64((const int64_t)src_ptr), vget_high_s32(src_idx)));
      int16_t *src4_ptr[4];
      uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
      vst1q_u64(tmp_ptr, tmp4[0]);
      vst1q_u64(tmp_ptr + 2, tmp4[1]);

      // filter vectors
      tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
          vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
          vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
      tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
          vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
          vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));

      const int16_t *x_filter4_ptr[4];
      tmp_ptr = (uint64_t *)&x_filter4_ptr;
      vst1q_u64(tmp_ptr, tmp4[0]);
      vst1q_u64(tmp_ptr + 2, tmp4[1]);
#else
      uint32x4_t tmp4;
      tmp4 = vreinterpretq_u32_s32(
          vaddq_s32(vdupq_n_s32((const int32_t)src_ptr), src_idx));
      int16_t *src4_ptr[4];
      uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
      vst1q_u32(tmp_ptr, tmp4);

      // filter vectors
      tmp4 = vreinterpretq_u32_s32(
          vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
                    vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));

      const int16_t *x_filter4_ptr[4];
      tmp_ptr = (uint32_t *)&x_filter4_ptr;
      vst1q_u32(tmp_ptr, tmp4);
#endif  // AOM_ARCH_AARCH64
      // Load source
      int16x8_t s0 = vld1q_s16(src4_ptr[0]);
      int16x8_t s1 = vld1q_s16(src4_ptr[1]);
      int16x8_t s2 = vld1q_s16(src4_ptr[2]);
      int16x8_t s3 = vld1q_s16(src4_ptr[3]);

      // Actually load the filters
      const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
      const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
      const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
      const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);

      // Group low and high parts and transpose
      int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
                                 vget_low_s16(x_filter1),
                                 vget_low_s16(x_filter2),
                                 vget_low_s16(x_filter3) };
      int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
                                 vget_high_s16(x_filter1),
                                 vget_high_s16(x_filter2),
                                 vget_high_s16(x_filter3) };
      transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
      transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
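      // After the transposes, filters_lo[k] holds tap k and filters_hi[k]
      // holds tap k + 4 of all four per-column filters, one filter per lane.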

      // Run the 2D Scale convolution
      uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
          s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);

      d0 = vmin_u16(d0, max);

      if (w == 2) {
        store_u16_2x1(d, d0);
      } else {
        vst1_u16(d, d0);
      }

      src_ptr += src_stride;
      d += dst_stride;
      height--;
    } while (height > 0);
  } else {
    int height = h;

    do {
      int width = w;
      int x_qn = x0_qn;
      uint16_t *d = dst_ptr;
      const uint16_t *s = src_ptr;

      do {
        // Load 4 src vectors at a time; they might be the same, but we have to
        // calculate the indices anyway. Doing it in SIMD and then storing the
        // indices is faster than evaluating the expression
        // &src_ptr[(x_qn + i * x_step_qn) >> RS_SCALE_SUBPEL_BITS] four times.
        // Ideally this would be a gather using the indices, but NEON does not
        // have one, so we have to emulate it.
        const int32x4_t xqn_idx =
            vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
        // We have to multiply by 2 to get the actual byte offsets, as
        // sizeof(uint16_t) == 2.
        const int32x4_t src_idx =
            vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);

        // Similarly for the filter vector indices, calculate the filter
        // indices for 4 columns. First compute the indices:
        // (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
        // Then compute the actual pointers by multiplying with
        // UPSCALE_NORMATIVE_TAPS and, again, shifting left by 1 to get a byte
        // offset.
        const int32x4_t x_filter4_idx = vshlq_n_s32(
            vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS),
            1);
        // Even though pointers are unsigned 32/64-bit ints, we do signed
        // addition. The reason is that x_qn can be negative, leading to
        // negative offsets. Argon test
        // profile0_core/streams/test10573_11003.obu was failing because of
        // this.
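        // The pointer computation below mirrors the w <= 4 path above; see the
        // scalar-equivalent expressions there, with s in place of src_ptr.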
#if AOM_ARCH_AARCH64
        uint64x2_t tmp4[2];
        tmp4[0] = vreinterpretq_u64_s64(
            vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx)));
        tmp4[1] = vreinterpretq_u64_s64(
            vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx)));
        int16_t *src4_ptr[4];
        uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
        vst1q_u64(tmp_ptr, tmp4[0]);
        vst1q_u64(tmp_ptr + 2, tmp4[1]);

        // filter vectors
        tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
            vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
            vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
        tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
            vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
            vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));

        const int16_t *x_filter4_ptr[4];
        tmp_ptr = (uint64_t *)&x_filter4_ptr;
        vst1q_u64(tmp_ptr, tmp4[0]);
        vst1q_u64(tmp_ptr + 2, tmp4[1]);
#else
        uint32x4_t tmp4;
        tmp4 = vreinterpretq_u32_s32(
            vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx));
        int16_t *src4_ptr[4];
        uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
        vst1q_u32(tmp_ptr, tmp4);

        // filter vectors
        tmp4 = vreinterpretq_u32_s32(
            vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
                      vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));

        const int16_t *x_filter4_ptr[4];
        tmp_ptr = (uint32_t *)&x_filter4_ptr;
        vst1q_u32(tmp_ptr, tmp4);
#endif  // AOM_ARCH_AARCH64

        // Load source
        int16x8_t s0 = vld1q_s16(src4_ptr[0]);
        int16x8_t s1 = vld1q_s16(src4_ptr[1]);
        int16x8_t s2 = vld1q_s16(src4_ptr[2]);
        int16x8_t s3 = vld1q_s16(src4_ptr[3]);

        // Actually load the filters
        const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
        const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
        const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
        const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);

        // Group low and high parts and transpose
        int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
                                   vget_low_s16(x_filter1),
                                   vget_low_s16(x_filter2),
                                   vget_low_s16(x_filter3) };
        int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
                                   vget_high_s16(x_filter1),
                                   vget_high_s16(x_filter2),
                                   vget_high_s16(x_filter3) };
        transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
        transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);

        // Run the 2D Scale X convolution
        uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
            s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);

        d0 = vmin_u16(d0, max);
        vst1_u16(d, d0);

        x_qn += 4 * x_step_qn;
        d += 4;
        width -= 4;
      } while (width > 0);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
      height--;
    } while (height > 0);
  }
}