1 /*
2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <arm_neon.h>
13 #include <string.h>
14
15 #include "config/aom_dsp_rtcd.h"
16
/*
 * Copies a w x h block of 8-bit samples from src to dst without filtering.
 *
 * src, dst:               top-left sample of the source / destination block.
 * src_stride, dst_stride: distance, in bytes, between the starts of
 *                         consecutive rows.
 * w, h:                   block width and height in samples.
 *
 * Exactly w bytes are copied for each of the h rows, for any positive w:
 * the bulk of a row moves in 16-byte NEON vectors and the remaining
 * 8/4/2/1 bytes are handled by progressively narrower copies. Nothing is
 * copied when w or h is not positive. Within a row the source and
 * destination are expected not to overlap (memcpy is used for the narrow
 * tail pieces).
 */
void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  if (w <= 0) return;

  for (int y = 0; y < h; ++y) {
    const uint8_t *s = src;
    uint8_t *d = dst;
    int remaining = w;

    // Full 16-byte vectors.
    while (remaining >= 16) {
      vst1q_u8(d, vld1q_u8(s));
      s += 16;
      d += 16;
      remaining -= 16;
    }

    // Tail: remaining is now in [0, 15], so each set bit selects one
    // narrower copy of that many bytes.
    if (remaining & 8) {
      vst1_u8(d, vld1_u8(s));
      s += 8;
      d += 8;
    }
    if (remaining & 4) {
      memcpy(d, s, sizeof(uint32_t));
      s += 4;
      d += 4;
    }
    if (remaining & 2) {
      memcpy(d, s, sizeof(uint16_t));
      s += 2;
      d += 2;
    }
    if (remaining & 1) {
      *d = *s;
    }

    src += src_stride;
    dst += dst_stride;
  }
}
55
56 #if CONFIG_AV1_HIGHBITDEPTH
/*
 * Unfiltered copy of a w x h block of 16-bit samples from src to dst.
 *
 * src_stride and dst_stride are measured in uint16_t elements. The number of
 * samples copied per row is chosen from w as follows:
 *   w < 4         -> 2 samples
 *   w == 4        -> 4 samples
 *   w == 8        -> 8 samples
 *   other w < 32  -> 16 samples
 *   w == 32       -> 32 samples
 *   w > 32        -> 64-sample chunks until at least w samples are copied
 *
 * The paths for w < 32 consume two rows per iteration and stop only when the
 * remaining row count is exactly zero, so they expect a positive, even h.
 * The wider paths handle one row per iteration and expect h >= 1.
 */
void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
                                   int h) {
  if (w < 4) {
    // Two samples per row, two rows per pass.
    const size_t row_bytes = 2 * sizeof(*src);
    do {
      memmove(dst, src, row_bytes);
      memmove(dst + dst_stride, src + src_stride, row_bytes);
      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
    return;
  }

  if (w == 4) {
    // One 64-bit vector per row; both rows are loaded before either store.
    do {
      const uint16x4_t row0 = vld1_u16(src);
      const uint16x4_t row1 = vld1_u16(src + src_stride);
      vst1_u16(dst, row0);
      vst1_u16(dst + dst_stride, row1);
      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
    return;
  }

  if (w == 8) {
    // One 128-bit vector per row; both rows are loaded before either store.
    do {
      const uint16x8_t row0 = vld1q_u16(src);
      const uint16x8_t row1 = vld1q_u16(src + src_stride);
      vst1q_u16(dst, row0);
      vst1q_u16(dst + dst_stride, row1);
      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
    return;
  }

  if (w < 32) {
    // Two 128-bit vectors (16 samples) per row, two rows per pass.
    do {
      const uint16_t *const next_src = src + src_stride;
      uint16_t *const next_dst = dst + dst_stride;
      const uint16x8_t row0_lo = vld1q_u16(src);
      const uint16x8_t row0_hi = vld1q_u16(src + 8);
      const uint16x8_t row1_lo = vld1q_u16(next_src);
      const uint16x8_t row1_hi = vld1q_u16(next_src + 8);

      vst1q_u16(dst, row0_lo);
      vst1q_u16(dst + 8, row0_hi);
      vst1q_u16(next_dst, row1_lo);
      vst1q_u16(next_dst + 8, row1_hi);

      src = next_src + src_stride;
      dst = next_dst + dst_stride;
      h -= 2;
    } while (h != 0);
    return;
  }

  if (w == 32) {
    // Four 128-bit vectors (32 samples) per row, one row per pass.
    do {
      const uint16x8_t v0 = vld1q_u16(src);
      const uint16x8_t v1 = vld1q_u16(src + 8);
      const uint16x8_t v2 = vld1q_u16(src + 16);
      const uint16x8_t v3 = vld1q_u16(src + 24);

      vst1q_u16(dst, v0);
      vst1q_u16(dst + 8, v1);
      vst1q_u16(dst + 16, v2);
      vst1q_u16(dst + 24, v3);

      src += src_stride;
      dst += dst_stride;
      --h;
    } while (h != 0);
    return;
  }

  // w > 32: walk each row in 64-sample chunks (eight 128-bit vectors), all
  // eight loads issued before the stores.
  do {
    const uint16_t *in = src;
    uint16_t *out = dst;
    // w > 32 here, so the chunk loop always runs at least once per row.
    for (int remaining = w; remaining > 0; remaining -= 64) {
      const uint16x8_t v0 = vld1q_u16(in);
      const uint16x8_t v1 = vld1q_u16(in + 8);
      const uint16x8_t v2 = vld1q_u16(in + 16);
      const uint16x8_t v3 = vld1q_u16(in + 24);
      const uint16x8_t v4 = vld1q_u16(in + 32);
      const uint16x8_t v5 = vld1q_u16(in + 40);
      const uint16x8_t v6 = vld1q_u16(in + 48);
      const uint16x8_t v7 = vld1q_u16(in + 56);

      vst1q_u16(out, v0);
      vst1q_u16(out + 8, v1);
      vst1q_u16(out + 16, v2);
      vst1q_u16(out + 24, v3);
      vst1q_u16(out + 32, v4);
      vst1q_u16(out + 40, v5);
      vst1q_u16(out + 48, v6);
      vst1q_u16(out + 56, v7);

      in += 64;
      out += 64;
    }
    src += src_stride;
    dst += dst_stride;
    --h;
  } while (h != 0);
}
165
166 #endif // CONFIG_AV1_HIGHBITDEPTH
167