1 /*
2 * Copyright (c) 2021 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <assert.h>
13
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/arm/mem_neon.h"
18 #include "vpx_dsp/arm/sum_neon.h"
19
sad16_neon(uint8x16_t src,uint8x16_t ref,uint32x4_t * const sad_sum)20 static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
21 uint32x4_t *const sad_sum) {
22 uint8x16_t abs_diff = vabdq_u8(src, ref);
23 *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
24 }
25
sad64xhx4d_neon_dotprod(const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4],int h)26 static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
27 const uint8_t *const ref[4],
28 int ref_stride, uint32_t res[4],
29 int h) {
30 uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
31 vdupq_n_u32(0) };
32 uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
33 vdupq_n_u32(0) };
34 uint32x4_t sum[4];
35
36 int i = 0;
37 do {
38 uint8x16_t s0, s1, s2, s3;
39
40 s0 = vld1q_u8(src + i * src_stride);
41 sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
42 sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
43 sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
44 sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
45
46 s1 = vld1q_u8(src + i * src_stride + 16);
47 sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
48 sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
49 sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
50 sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
51
52 s2 = vld1q_u8(src + i * src_stride + 32);
53 sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
54 sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
55 sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
56 sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
57
58 s3 = vld1q_u8(src + i * src_stride + 48);
59 sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
60 sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
61 sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
62 sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
63
64 } while (++i < h);
65
66 sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
67 sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
68 sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
69 sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
70
71 vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
72 }
73
sad32xhx4d_neon_dotprod(const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4],int h)74 static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
75 const uint8_t *const ref[4],
76 int ref_stride, uint32_t res[4],
77 int h) {
78 uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
79 vdupq_n_u32(0) };
80 uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
81 vdupq_n_u32(0) };
82 uint32x4_t sum[4];
83
84 int i = 0;
85 do {
86 uint8x16_t s0, s1;
87
88 s0 = vld1q_u8(src + i * src_stride);
89 sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
90 sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
91 sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
92 sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
93
94 s1 = vld1q_u8(src + i * src_stride + 16);
95 sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
96 sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
97 sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
98 sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
99
100 } while (++i < h);
101
102 sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
103 sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
104 sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
105 sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
106
107 vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
108 }
109
sad16xhx4d_neon_dotprod(const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4],int h)110 static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
111 const uint8_t *const ref[4],
112 int ref_stride, uint32_t res[4],
113 int h) {
114 uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
115 vdupq_n_u32(0) };
116
117 int i = 0;
118 do {
119 const uint8x16_t s = vld1q_u8(src + i * src_stride);
120 sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
121 sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
122 sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
123 sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
124
125 } while (++i < h);
126
127 vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
128 }
129
// Defines the externally visible vpx_sad<bw>x<bh>x4d_neon_dotprod() function
// for one block size. Each generated function forwards its arguments to the
// width-specific kernel sad<bw>xhx4d_neon_dotprod() with the height fixed to
// bh.
#define SAD_WXH_4D_NEON_DOTPROD(bw, bh)                                     \
  void vpx_sad##bw##x##bh##x4d_neon_dotprod(                                \
      const uint8_t *src_ptr, int src_stride,                               \
      const uint8_t *const ref_array[4], int ref_stride,                    \
      uint32_t sad_array[4]) {                                              \
    sad##bw##xhx4d_neon_dotprod(src_ptr, src_stride, ref_array, ref_stride, \
                                sad_array, (bh));                           \
  }

// 16-wide blocks.
SAD_WXH_4D_NEON_DOTPROD(16, 8)
SAD_WXH_4D_NEON_DOTPROD(16, 16)
SAD_WXH_4D_NEON_DOTPROD(16, 32)

// 32-wide blocks.
SAD_WXH_4D_NEON_DOTPROD(32, 16)
SAD_WXH_4D_NEON_DOTPROD(32, 32)
SAD_WXH_4D_NEON_DOTPROD(32, 64)

// 64-wide blocks.
SAD_WXH_4D_NEON_DOTPROD(64, 32)
SAD_WXH_4D_NEON_DOTPROD(64, 64)

#undef SAD_WXH_4D_NEON_DOTPROD
151
// Defines the externally visible vpx_sad_skip_<bw>x<bh>x4d_neon_dotprod()
// function for one block size. Each generated function runs the
// width-specific kernel with both strides doubled and half the row count, so
// only every other row is visited, and then doubles each of the four results
// (presumably to keep them on the same scale as a full-height SAD).
#define SAD_SKIP_WXH_4D_NEON_DOTPROD(bw, bh)                         \
  void vpx_sad_skip_##bw##x##bh##x4d_neon_dotprod(                   \
      const uint8_t *src_ptr, int src_stride,                        \
      const uint8_t *const ref_array[4], int ref_stride,             \
      uint32_t sad_array[4]) {                                       \
    int k;                                                           \
    sad##bw##xhx4d_neon_dotprod(src_ptr, src_stride * 2, ref_array,  \
                                ref_stride * 2, sad_array, (bh) / 2); \
    for (k = 0; k < 4; ++k) {                                        \
      sad_array[k] <<= 1;                                            \
    }                                                                \
  }

// 16-wide blocks.
SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8)
SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16)
SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32)

// 32-wide blocks.
SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16)
SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32)
SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64)

// 64-wide blocks.
SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32)
SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64)

#undef SAD_SKIP_WXH_4D_NEON_DOTPROD
177