/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>
#include <stdlib.h>

#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp9/common/vp9_blockd.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_scale/yv12config.h"

// Note: The scaling functions below may write extra rows and columns in dst
// that exceed the right and bottom boundaries of the destination frame. We
// rely on the subsequent frame extension function to fix these rows and
// columns.
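//
// For instance (an illustrative calculation, not part of the original
// comment): for w == 17, max_width == (17 + 15) & ~15 == 32, so each row
// writes up to 15 columns beyond the crop width; vpx_extend_frame_borders()
// later overwrites them with replicated border pixels.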

static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
                                              const int src_stride,
                                              uint8_t *dst,
                                              const int dst_stride, const int w,
                                              const int h) {
  const int max_width = (w + 15) & ~15;
  int y = h;

  assert(w && h);

  do {
    int x = max_width;
    do {
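      // vld2q_u8() de-interleaves 32 source bytes into even- and odd-indexed
      // pixel vectors; keeping only val[0] retains every second pixel, i.e.
      // phase-0 2:1 decimation.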
      const uint8x16x2_t s = vld2q_u8(src);
      vst1q_u8(dst, s.val[0]);
      src += 32;
      dst += 16;
      x -= 16;
    } while (x);
    src += 2 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src,
                                              const int src_stride,
                                              uint8_t *dst,
                                              const int dst_stride, const int w,
                                              const int h) {
  const int max_width = (w + 15) & ~15;
  int y = h;

  assert(w && h);

  do {
    int x = max_width;
    do {
      const uint8x16x4_t s = vld4q_u8(src);
      vst1q_u8(dst, s.val[0]);
      src += 64;
      dst += 16;
      x -= 16;
    } while (x);
    src += 4 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

static INLINE void scale_plane_bilinear_kernel(
    const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2,
    const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1,
    uint8_t *const dst) {
  const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0);
  const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0);
  const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0);
  const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0);
  const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1);
  const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1);
  const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1);
  const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1);

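  // The two bilinear taps sum to 128, so narrowing with a rounding right
  // shift by 7 (FILTER_BITS) normalizes the 16-bit accumulators back to
  // 8-bit pixels.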
  const uint8x8_t hor0 = vrshrn_n_u16(h4, 7);  // temp: 00 01 02 03 04 05 06 07
  const uint8x8_t hor1 = vrshrn_n_u16(h5, 7);  // temp: 08 09 0A 0B 0C 0D 0E 0F
  const uint8x8_t hor2 = vrshrn_n_u16(h6, 7);  // temp: 10 11 12 13 14 15 16 17
  const uint8x8_t hor3 = vrshrn_n_u16(h7, 7);  // temp: 18 19 1A 1B 1C 1D 1E 1F
  const uint16x8_t v0 = vmull_u8(hor0, coef0);
  const uint16x8_t v1 = vmull_u8(hor1, coef0);
  const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1);
  const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1);
  // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F
  const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7));
  vst1q_u8(dst, d);
}

static INLINE void scale_plane_2_to_1_bilinear(
    const uint8_t *const src, const int src_stride, uint8_t *dst,
    const int dst_stride, const int w, const int h, const int16_t c0,
    const int16_t c1) {
  const int max_width = (w + 15) & ~15;
  const uint8_t *src0 = src;
  const uint8_t *src1 = src + src_stride;
  const uint8x8_t coef0 = vdup_n_u8(c0);
  const uint8x8_t coef1 = vdup_n_u8(c1);
  int y = h;

  assert(w && h);

  do {
    int x = max_width;
    do {
      // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E
      // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F
      const uint8x16x2_t s0 = vld2q_u8(src0);
      // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E
      // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F
      const uint8x16x2_t s1 = vld2q_u8(src1);
      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
                                  coef0, coef1, dst);
      src0 += 32;
      src1 += 32;
      dst += 16;
      x -= 16;
    } while (x);
    src0 += 2 * (src_stride - max_width);
    src1 += 2 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

static INLINE void scale_plane_4_to_1_bilinear(
    const uint8_t *const src, const int src_stride, uint8_t *dst,
    const int dst_stride, const int w, const int h, const int16_t c0,
    const int16_t c1) {
  const int max_width = (w + 15) & ~15;
  const uint8_t *src0 = src;
  const uint8_t *src1 = src + src_stride;
  const uint8x8_t coef0 = vdup_n_u8(c0);
  const uint8x8_t coef1 = vdup_n_u8(c1);
  int y = h;

  assert(w && h);

  do {
    int x = max_width;
    do {
      // The rows marked (*) below are loaded but not used.
      // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C
      // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D
      // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*)
      // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*)
      const uint8x16x4_t s0 = vld4q_u8(src0);
      // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C
      // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D
      // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*)
      // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*)
      const uint8x16x4_t s1 = vld4q_u8(src1);
      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
                                  coef0, coef1, dst);
      src0 += 64;
      src1 += 64;
      dst += 16;
      x -= 16;
    } while (x);
    src0 += 4 * (src_stride - max_width);
    src1 += 4 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s,
                                              const uint8x8_t *const coef) {
  const uint16x8_t h0 = vmull_u8(s[0], coef[0]);
  const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]);

  return vrshrn_n_u16(h1, 7);
}

static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  const int width_hor = (w + 3) & ~3;
  const int width_ver = (w + 7) & ~7;
  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
  const int height_ver = (h + 3) & ~3;
  const int16x8_t filters = vld1q_s16(coef);
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  uint8x8_t s[14], d[4];

  assert(w && h);

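  // Step back to the first filter tap: SUBPEL_TAPS / 2 - 1 rows above each
  // output row and, after the +2 column offset applied at the loads below, a
  // net SUBPEL_TAPS / 2 - 1 columns to the left of each output position.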
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;

  // horizontal 4x8
  // Note: processing 4x8 is about 20% faster than processing row by row using
  // vld4_u8().
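  // s[0..13] holds 14 consecutive transposed source columns; each 2:1 output
  // column advances the 8-tap window by 2 source columns, so d[0..3] filter
  // s[0..7], s[2..9], s[4..11] and s[6..13] respectively.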
  do {
    load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                &s[6], &s[7]);
    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
    x = width_hor;

    do {
      src += 8;
      load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
                  &s[12], &s[13]);
      transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
                       &s[13]);

      d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
      d[1] = scale_filter_8(&s[2], filters);  // 01 11 21 31 41 51 61 71
      d[2] = scale_filter_8(&s[4], filters);  // 02 12 22 32 42 52 62 72
      d[3] = scale_filter_8(&s[6], filters);  // 03 13 23 33 43 53 63 73
      // 00 01 02 03 40 41 42 43
      // 10 11 12 13 50 51 52 53
      // 20 21 22 23 60 61 62 63
      // 30 31 32 33 70 71 72 73
      transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
      vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]),
                    0);
      vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]),
                    0);
      vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]),
                    0);
      vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]),
                    0);
      vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]),
                    1);
      vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]),
                    1);
      vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]),
                    1);
      vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]),
                    1);

      s[0] = s[8];
      s[1] = s[9];
      s[2] = s[10];
      s[3] = s[11];
      s[4] = s[12];
      s[5] = s[13];

      t += 4;
      x -= 4;
    } while (x);
    src += 8 * src_stride - 2 * width_hor;
    t += 7 * width_hor;
    y -= 8;
  } while (y);

  // vertical 8x4
  x = width_ver;
  t = temp_buffer;
  do {
    load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                &s[7]);
    t += 6 * width_hor;
    y = height_ver;

    do {
      load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
                  &s[12], &s[13]);
      t += 8 * width_hor;

      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
      d[1] = scale_filter_8(&s[2], filters);  // 10 11 12 13 14 15 16 17
      d[2] = scale_filter_8(&s[4], filters);  // 20 21 22 23 24 25 26 27
      d[3] = scale_filter_8(&s[6], filters);  // 30 31 32 33 34 35 36 37
      vst1_u8(dst + 0 * dst_stride, d[0]);
      vst1_u8(dst + 1 * dst_stride, d[1]);
      vst1_u8(dst + 2 * dst_stride, d[2]);
      vst1_u8(dst + 3 * dst_stride, d[3]);

      s[0] = s[8];
      s[1] = s[9];
      s[2] = s[10];
      s[3] = s[11];
      s[4] = s[12];
      s[5] = s[13];

      dst += 4 * dst_stride;
      y -= 4;
    } while (y);
    t -= width_hor * (2 * height_ver + 6);
    t += 8;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}

static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  const int width_hor = (w + 1) & ~1;
  const int width_ver = (w + 7) & ~7;
  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
  const int height_ver = (h + 1) & ~1;
  const int16x8_t filters = vld1q_s16(coef);
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  uint8x8_t s[12], d[2];

  assert(w && h);

  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;

  // horizontal 2x8
  // Note: processing 2x8 is about 20% faster than processing row by row using
  // vld4_u8().
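  // Here each 4:1 output column advances the 8-tap window by 4 source
  // columns, so d[0] filters s[0..7] and d[1] filters s[4..11].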
  do {
    load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                &s[6], &s[7]);
    transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]);
    x = width_hor;

    do {
      uint8x8x2_t dd;
      src += 8;
      load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
                  &s[10], &s[11]);
      transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10],
                       &s[11]);

      d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
      d[1] = scale_filter_8(&s[4], filters);  // 01 11 21 31 41 51 61 71
      // dd.val[0]: 00 01 20 21 40 41 60 61
      // dd.val[1]: 10 11 30 31 50 51 70 71
      dd = vtrn_u8(d[0], d[1]);
      vst1_lane_u16((uint16_t *)(t + 0 * width_hor),
                    vreinterpret_u16_u8(dd.val[0]), 0);
      vst1_lane_u16((uint16_t *)(t + 1 * width_hor),
                    vreinterpret_u16_u8(dd.val[1]), 0);
      vst1_lane_u16((uint16_t *)(t + 2 * width_hor),
                    vreinterpret_u16_u8(dd.val[0]), 1);
      vst1_lane_u16((uint16_t *)(t + 3 * width_hor),
                    vreinterpret_u16_u8(dd.val[1]), 1);
      vst1_lane_u16((uint16_t *)(t + 4 * width_hor),
                    vreinterpret_u16_u8(dd.val[0]), 2);
      vst1_lane_u16((uint16_t *)(t + 5 * width_hor),
                    vreinterpret_u16_u8(dd.val[1]), 2);
      vst1_lane_u16((uint16_t *)(t + 6 * width_hor),
                    vreinterpret_u16_u8(dd.val[0]), 3);
      vst1_lane_u16((uint16_t *)(t + 7 * width_hor),
                    vreinterpret_u16_u8(dd.val[1]), 3);

      s[0] = s[8];
      s[1] = s[9];
      s[2] = s[10];
      s[3] = s[11];

      t += 2;
      x -= 2;
    } while (x);
    src += 8 * src_stride - 4 * width_hor;
    t += 7 * width_hor;
    y -= 8;
  } while (y);

  // vertical 8x2
  x = width_ver;
  t = temp_buffer;
  do {
    load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]);
    t += 4 * width_hor;
    y = height_ver;

    do {
      load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
                  &s[10], &s[11]);
      t += 8 * width_hor;

      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
      d[1] = scale_filter_8(&s[4], filters);  // 10 11 12 13 14 15 16 17
      vst1_u8(dst + 0 * dst_stride, d[0]);
      vst1_u8(dst + 1 * dst_stride, d[1]);

      s[0] = s[8];
      s[1] = s[9];
      s[2] = s[10];
      s[3] = s[11];

      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
    t -= width_hor * (4 * height_ver + 4);
    t += 8;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}

// Notes for 4 to 3 scaling:
//
// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be
// a multiple of 6, and no less than w.
//
// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be a
// multiple of 8, and no less than w.
//
// 3. 8 columns are calculated in each horizontal inner loop for further
// vertical scaling, so height_hor must be a multiple of 8, and no less than
// 4 * h / 3.
//
// 4. 6 columns are calculated in each vertical inner loop, so height_ver must
// be a multiple of 6, and no less than h.
//
// 5. The physical location of the last row of the 4 to 3 scaled frame is
// decided by phase_scaler, and is always less than 1 pixel below the last row
// of the original image.
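//
// For example (an illustrative calculation): for w == 20,
// width_hor == (20 + 5) - ((20 + 5) % 6) == 24, the smallest multiple of 6
// that is no less than w; for h == 20, the general (8-tap) path computes
// height_hor == (4 * 20 / 3 + SUBPEL_TAPS - 1 + 7) & ~7 == 40, a multiple of
// 8 covering the 4 * h / 3 intermediate rows plus the filter margin.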

static void scale_plane_4_to_3_bilinear(const uint8_t *src,
                                        const int src_stride, uint8_t *dst,
                                        const int dst_stride, const int w,
                                        const int h, const int phase_scaler,
                                        uint8_t *const temp_buffer) {
  static const int step_q4 = 16 * 4 / 3;
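  // Q4 source step per output pixel: 16 * 4 / 3 == 21. Below, the integer
  // part (>> 4) selects the starting source sample and the fractional part
  // (& SUBPEL_MASK) selects the filter phase.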
  const int width_hor = (w + 5) - ((w + 5) % 6);
  const int stride_hor = width_hor + 2;  // store 2 extra pixels
  const int width_ver = (w + 7) & ~7;
  // We only need 1 extra row below because there are only 2 bilinear
  // coefficients.
  const int height_hor = (4 * h / 3 + 1 + 7) & ~7;
  const int height_ver = (h + 5) - ((h + 5) % 6);
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  uint8x8_t s[9], d[8], c[6];

  assert(w && h);

  c[0] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][3]);
  c[1] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][4]);
  c[2] = vdup_n_u8(
      (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) &
                                            SUBPEL_MASK][3]);
  c[3] = vdup_n_u8(
      (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) &
                                            SUBPEL_MASK][4]);
  c[4] = vdup_n_u8(
      (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) &
                                            SUBPEL_MASK][3]);
  c[5] = vdup_n_u8(
      (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) &
                                            SUBPEL_MASK][4]);

  d[6] = vdup_n_u8(0);
  d[7] = vdup_n_u8(0);

  // horizontal 6x8
  do {
    load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                &s[6], &s[7]);
    src += 1;
    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
    x = width_hor;

    do {
      load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                  &s[7], &s[8]);
      src += 8;
      transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]);

      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      d[0] = scale_filter_bilinear(&s[0], &c[0]);
      d[1] =
          scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
      d[2] =
          scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
      d[3] = scale_filter_bilinear(&s[4], &c[0]);
      d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
                                   &c[2]);
      d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
                                   &c[4]);

      // 00 01 02 03 04 05 xx xx
      // 10 11 12 13 14 15 xx xx
      // 20 21 22 23 24 25 xx xx
      // 30 31 32 33 34 35 xx xx
      // 40 41 42 43 44 45 xx xx
      // 50 51 52 53 54 55 xx xx
      // 60 61 62 63 64 65 xx xx
      // 70 71 72 73 74 75 xx xx
      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
      // store 2 extra pixels
      vst1_u8(t + 0 * stride_hor, d[0]);
      vst1_u8(t + 1 * stride_hor, d[1]);
      vst1_u8(t + 2 * stride_hor, d[2]);
      vst1_u8(t + 3 * stride_hor, d[3]);
      vst1_u8(t + 4 * stride_hor, d[4]);
      vst1_u8(t + 5 * stride_hor, d[5]);
      vst1_u8(t + 6 * stride_hor, d[6]);
      vst1_u8(t + 7 * stride_hor, d[7]);

      s[0] = s[8];

      t += 6;
      x -= 6;
    } while (x);
    src += 8 * src_stride - 4 * width_hor / 3 - 1;
    t += 7 * stride_hor + 2;
    y -= 8;
  } while (y);

  // vertical 8x6
  x = width_ver;
  t = temp_buffer;
  do {
    load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                &s[7]);
    t += stride_hor;
    y = height_ver;

    do {
      load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                  &s[7], &s[8]);
      t += 8 * stride_hor;

      d[0] = scale_filter_bilinear(&s[0], &c[0]);
      d[1] =
          scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
      d[2] =
          scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
      d[3] = scale_filter_bilinear(&s[4], &c[0]);
      d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
                                   &c[2]);
      d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
                                   &c[4]);
      vst1_u8(dst + 0 * dst_stride, d[0]);
      vst1_u8(dst + 1 * dst_stride, d[1]);
      vst1_u8(dst + 2 * dst_stride, d[2]);
      vst1_u8(dst + 3 * dst_stride, d[3]);
      vst1_u8(dst + 4 * dst_stride, d[4]);
      vst1_u8(dst + 5 * dst_stride, d[5]);

      s[0] = s[8];

      dst += 6 * dst_stride;
      y -= 6;
    } while (y);
    t -= stride_hor * (4 * height_ver / 3 + 1);
    t += 8;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}

static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const InterpKernel *const coef,
                                       const int phase_scaler,
                                       uint8_t *const temp_buffer) {
  static const int step_q4 = 16 * 4 / 3;
  const int width_hor = (w + 5) - ((w + 5) % 6);
  const int stride_hor = width_hor + 2;  // store 2 extra pixels
  const int width_ver = (w + 7) & ~7;
  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
  // above and (SUBPEL_TAPS / 2) extra rows below.
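  // With SUBPEL_TAPS == 8, that is 3 extra rows above and 4 extra rows below.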
  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
  const int height_ver = (h + 5) - ((h + 5) % 6);
  const int16x8_t filters0 =
      vld1q_s16(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]);
  const int16x8_t filters1 =
      vld1q_s16(coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]);
  const int16x8_t filters2 =
      vld1q_s16(coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]);
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  uint8x8_t s[15], d[8];

  assert(w && h);

  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2;
  d[6] = vdup_n_u8(0);
  d[7] = vdup_n_u8(0);

  // horizontal 6x8
  do {
    load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                &s[6], &s[7]);
    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
    x = width_hor;

    do {
      src += 8;
      load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
                  &s[13], &s[14]);
      transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13],
                       &s[14]);

      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      d[0] = scale_filter_8(&s[0], filters0);
      d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
      d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
      d[3] = scale_filter_8(&s[4], filters0);
      d[4] =
          scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
      d[5] =
          scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);

      // 00 01 02 03 04 05 xx xx
      // 10 11 12 13 14 15 xx xx
      // 20 21 22 23 24 25 xx xx
      // 30 31 32 33 34 35 xx xx
      // 40 41 42 43 44 45 xx xx
      // 50 51 52 53 54 55 xx xx
      // 60 61 62 63 64 65 xx xx
      // 70 71 72 73 74 75 xx xx
      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
      // store 2 extra pixels
      vst1_u8(t + 0 * stride_hor, d[0]);
      vst1_u8(t + 1 * stride_hor, d[1]);
      vst1_u8(t + 2 * stride_hor, d[2]);
      vst1_u8(t + 3 * stride_hor, d[3]);
      vst1_u8(t + 4 * stride_hor, d[4]);
      vst1_u8(t + 5 * stride_hor, d[5]);
      vst1_u8(t + 6 * stride_hor, d[6]);
      vst1_u8(t + 7 * stride_hor, d[7]);

      s[0] = s[8];
      s[1] = s[9];
      s[2] = s[10];
      s[3] = s[11];
      s[4] = s[12];
      s[5] = s[13];
      s[6] = s[14];

      t += 6;
      x -= 6;
    } while (x);
    src += 8 * src_stride - 4 * width_hor / 3;
    t += 7 * stride_hor + 2;
    y -= 8;
  } while (y);

  // vertical 8x6
  x = width_ver;
  t = temp_buffer;
  do {
    load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                &s[7]);
    t += 7 * stride_hor;
    y = height_ver;

    do {
      load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
                  &s[13], &s[14]);
      t += 8 * stride_hor;

      d[0] = scale_filter_8(&s[0], filters0);
      d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
      d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
      d[3] = scale_filter_8(&s[4], filters0);
      d[4] =
          scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
      d[5] =
          scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
      vst1_u8(dst + 0 * dst_stride, d[0]);
      vst1_u8(dst + 1 * dst_stride, d[1]);
      vst1_u8(dst + 2 * dst_stride, d[2]);
      vst1_u8(dst + 3 * dst_stride, d[3]);
      vst1_u8(dst + 4 * dst_stride, d[4]);
      vst1_u8(dst + 5 * dst_stride, d[5]);

      s[0] = s[8];
      s[1] = s[9];
      s[2] = s[10];
      s[3] = s[11];
      s[4] = s[12];
      s[5] = s[13];
      s[6] = s[14];

      dst += 6 * dst_stride;
      y -= 6;
    } while (y);
    t -= stride_hor * (4 * height_ver / 3 + 7);
    t += 8;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}

void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
                                     YV12_BUFFER_CONFIG *dst,
                                     INTERP_FILTER filter_type,
                                     int phase_scaler) {
  const int src_w = src->y_crop_width;
  const int src_h = src->y_crop_height;
  const int dst_w = dst->y_crop_width;
  const int dst_h = dst->y_crop_height;
  const int dst_uv_w = dst->uv_crop_width;
  const int dst_uv_h = dst->uv_crop_height;
  int scaled = 0;

  // phase_scaler is usually 0 or 8.
  assert(phase_scaler >= 0 && phase_scaler < 16);
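  // (phase_scaler is the Q4 sub-pixel position of the first output sample;
  // 8 corresponds to a half-pixel offset, hence the [0, 16) range above.)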

  if (2 * dst_w == src_w && 2 * dst_h == src_h) {
    // 2 to 1
    scaled = 1;
    if (phase_scaler == 0) {
      scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
                                 dst->y_stride, dst_w, dst_h);
      scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
                                 dst->uv_stride, dst_uv_w, dst_uv_h);
      scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
                                 dst->uv_stride, dst_uv_w, dst_uv_h);
    } else if (filter_type == BILINEAR) {
      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
      scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
                                  dst->y_stride, dst_w, dst_h, c0, c1);
      scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
      scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
    } else {
      const int buffer_stride = (dst_w + 3) & ~3;
      const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
      uint8_t *const temp_buffer =
          (uint8_t *)malloc(buffer_stride * buffer_height);
      if (temp_buffer) {
        scale_plane_2_to_1_general(
            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
        scale_plane_2_to_1_general(
            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
            temp_buffer);
        scale_plane_2_to_1_general(
            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
            temp_buffer);
        free(temp_buffer);
      } else {
        scaled = 0;
      }
    }
  } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
    // 4 to 1
    scaled = 1;
    if (phase_scaler == 0) {
      scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
                                 dst->y_stride, dst_w, dst_h);
      scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
                                 dst->uv_stride, dst_uv_w, dst_uv_h);
      scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
                                 dst->uv_stride, dst_uv_w, dst_uv_h);
    } else if (filter_type == BILINEAR) {
      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
      scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
                                  dst->y_stride, dst_w, dst_h, c0, c1);
      scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
      scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
    } else {
      const int buffer_stride = (dst_w + 1) & ~1;
      const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
      uint8_t *const temp_buffer =
          (uint8_t *)malloc(buffer_stride * buffer_height);
      if (temp_buffer) {
        scale_plane_4_to_1_general(
            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
        scale_plane_4_to_1_general(
            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
            temp_buffer);
        scale_plane_4_to_1_general(
            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
            temp_buffer);
        free(temp_buffer);
      } else {
        scaled = 0;
      }
    }
  } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
    // 4 to 3
    const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
    const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
    uint8_t *const temp_buffer =
        (uint8_t *)malloc(buffer_stride * buffer_height);
    if (temp_buffer) {
      scaled = 1;
      if (filter_type == BILINEAR) {
        scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
                                    dst->y_stride, dst_w, dst_h, phase_scaler,
                                    temp_buffer);
        scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride,
                                    dst->u_buffer, dst->uv_stride, dst_uv_w,
                                    dst_uv_h, phase_scaler, temp_buffer);
        scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride,
                                    dst->v_buffer, dst->uv_stride, dst_uv_w,
                                    dst_uv_h, phase_scaler, temp_buffer);
      } else {
        scale_plane_4_to_3_general(
            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
            dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
        scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
                                   dst->uv_stride, dst_uv_w, dst_uv_h,
                                   vp9_filter_kernels[filter_type],
                                   phase_scaler, temp_buffer);
        scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
                                   dst->uv_stride, dst_uv_w, dst_uv_h,
                                   vp9_filter_kernels[filter_type],
                                   phase_scaler, temp_buffer);
      }
      free(temp_buffer);
    }
  }

  if (scaled) {
    vpx_extend_frame_borders(dst);
  } else {
    // Fall back to the C version for all other scaling ratios, and when
    // allocation of the temporary buffer fails.
    vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
  }
}