/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
  // Pure-copy specialization of the high-bitdepth convolve entry point:
  // the output equals the input block, so the filter description and bit
  // depth have no effect and are ignored.
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  if (w < 8) {  // 4-wide rows: move two rows per iteration (h is even).
    do {
      const uint16x4_t row0 = vld1_u16(src);
      const uint16x4_t row1 = vld1_u16(src + src_stride);
      src += 2 * src_stride;
      vst1_u16(dst, row0);
      vst1_u16(dst + dst_stride, row1);
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else if (w == 8) {  // 8-wide rows: two rows per iteration.
    do {
      const uint16x8_t row0 = vld1q_u16(src);
      const uint16x8_t row1 = vld1q_u16(src + src_stride);
      src += 2 * src_stride;
      vst1q_u16(dst, row0);
      vst1q_u16(dst + dst_stride, row1);
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else if (w < 32) {  // 16-wide rows: two vectors per row, two rows each.
    do {
      const uint16x8_t lo0 = vld1q_u16(src);
      const uint16x8_t hi0 = vld1q_u16(src + 8);
      const uint16x8_t lo1 = vld1q_u16(src + src_stride);
      const uint16x8_t hi1 = vld1q_u16(src + src_stride + 8);
      src += 2 * src_stride;
      vst1q_u16(dst, lo0);
      vst1q_u16(dst + 8, hi0);
      vst1q_u16(dst + dst_stride, lo1);
      vst1q_u16(dst + dst_stride + 8, hi1);
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else if (w == 32) {  // 32-wide rows: four vectors per row.
    do {
      const uint16x8_t v0 = vld1q_u16(src);
      const uint16x8_t v1 = vld1q_u16(src + 8);
      const uint16x8_t v2 = vld1q_u16(src + 16);
      const uint16x8_t v3 = vld1q_u16(src + 24);
      src += src_stride;
      vst1q_u16(dst, v0);
      vst1q_u16(dst + 8, v1);
      vst1q_u16(dst + 16, v2);
      vst1q_u16(dst + 24, v3);
      dst += dst_stride;
    } while (--h != 0);
  } else {  // 64-wide rows: eight vectors per row, paired per iteration.
    do {
      int i;
      for (i = 0; i < 64; i += 16) {
        const uint16x8_t lo = vld1q_u16(src + i);
        const uint16x8_t hi = vld1q_u16(src + i + 8);
        vst1q_u16(dst + i, lo);
        vst1q_u16(dst + i + 8, hi);
      }
      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  }
}