1 /*
2 * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <stdint.h>
13
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/arm/mem_neon.h"
17 #include "vpx_dsp/arm/sum_neon.h"
18
// Accumulate the squared differences of one 16-byte row into *sse.
//
// The 16 absolute byte differences are split into two 8-lane halves. Each half
// is squared with a widening multiply (u8 * u8 -> u16) and the resulting
// 16-bit products are pairwise-added into the four 32-bit lanes of *sse.
static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
                                 uint32x4_t *sse) {
  const uint8x16_t src_px = vld1q_u8(src);
  const uint8x16_t ref_px = vld1q_u8(ref);
  const uint8x16_t diff = vabdq_u8(src_px, ref_px);
  const uint8x8_t diff_lo = vget_low_u8(diff);
  const uint8x8_t diff_hi = vget_high_u8(diff);
  uint32x4_t acc = *sse;

  acc = vpadalq_u16(acc, vmull_u8(diff_lo, diff_lo));
  acc = vpadalq_u16(acc, vmull_u8(diff_hi, diff_hi));
  *sse = acc;
}
31
// Accumulate the squared differences of one 8-byte row into *sse.
//
// The absolute byte differences are squared with a widening multiply and the
// eight 16-bit products are pairwise-added into the four 32-bit lanes of *sse.
static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
                                uint32x4_t *sse) {
  const uint8x8_t diff = vabd_u8(vld1_u8(src), vld1_u8(ref));

  *sse = vpadalq_u16(*sse, vmull_u8(diff, diff));
}
41
// Accumulate the squared differences of a 4-wide, 2-row block into *sse.
//
// NOTE(review): load_unaligned_u8() is declared outside this file; it
// presumably packs 4 bytes from each of two rows (one stride apart) into a
// single 8-lane vector -- confirm against the helper's definition.
static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                uint32x4_t *sse) {
  const uint8x8_t src_px = load_unaligned_u8(src, src_stride);
  const uint8x8_t ref_px = load_unaligned_u8(ref, ref_stride);
  const uint8x8_t diff = vabd_u8(src_px, ref_px);

  *sse = vpadalq_u16(*sse, vmull_u8(diff, diff));
}
52
// Sum of squared differences for a block of arbitrary `width`.
//
// Columns are consumed 8 at a time. When `width` leaves a remainder of 1..4
// modulo 8, rows are handled in pairs: the leading columns of both rows go
// through the 8-wide helper and the trailing columns go through one 4x2 call.
// Otherwise every row is walked on its own in 8-column steps until the column
// index reaches `width`.
//
// Both loops are do/while, so at least one row (or row pair) and one 8-column
// step are always processed. The paired path only terminates when the row
// counter reaches exactly zero, i.e. it expects a positive, even `height`.
static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int width, int height) {
  const int tail = width & 0x07;
  uint32x4_t acc = vdupq_n_u32(0);
  int rows_left = height;

  if (tail == 0 || tail >= 5) {
    // Single-row path: step across the row 8 columns at a time.
    do {
      int col = 0;
      do {
        sse_8x1_neon(src + col, ref + col, &acc);
        col += 8;
      } while (col < width);

      src += src_stride;
      ref += ref_stride;
    } while (--rows_left != 0);
  } else {
    // Row-pair path: 8-wide steps on both rows, then a 4x2 tail.
    do {
      const uint8_t *const src_next = src + src_stride;
      const uint8_t *const ref_next = ref + ref_stride;
      int col = 0;
      do {
        sse_8x1_neon(src + col, ref + col, &acc);
        sse_8x1_neon(src_next + col, ref_next + col, &acc);
        col += 8;
      } while (col + 4 < width);

      sse_4x2_neon(src + col, src_stride, ref + col, ref_stride, &acc);
      src += 2 * src_stride;
      ref += 2 * ref_stride;
      rows_left -= 2;
    } while (rows_left != 0);
  }

  return horizontal_add_uint32x4(acc);
}
88
// Sum of squared differences for a 64-wide block of `height` rows.
//
// Each row is covered by four 16-byte steps that alternate between two
// accumulators; the accumulators are added together before the final
// reduction. The do/while loop always processes at least one row, so `height`
// is expected to be positive.
static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int height) {
  uint32x4_t acc_a = vdupq_n_u32(0);
  uint32x4_t acc_b = vdupq_n_u32(0);
  int rows_left = height;

  do {
    sse_16x1_neon(src, ref, &acc_a);
    sse_16x1_neon(src + 16, ref + 16, &acc_b);
    sse_16x1_neon(src + 32, ref + 32, &acc_a);
    sse_16x1_neon(src + 48, ref + 48, &acc_b);

    src += src_stride;
    ref += ref_stride;
  } while (--rows_left != 0);

  return horizontal_add_uint32x4(vaddq_u32(acc_a, acc_b));
}
107
// Sum of squared differences for a 32-wide block of `height` rows.
//
// The left and right 16-byte halves of every row feed separate accumulators,
// which are added together before the final reduction. The do/while loop
// always processes at least one row, so `height` is expected to be positive.
static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int height) {
  uint32x4_t acc_left = vdupq_n_u32(0);
  uint32x4_t acc_right = vdupq_n_u32(0);
  int rows_left = height;

  do {
    sse_16x1_neon(src, ref, &acc_left);
    sse_16x1_neon(src + 16, ref + 16, &acc_right);

    src += src_stride;
    ref += ref_stride;
  } while (--rows_left != 0);

  return horizontal_add_uint32x4(vaddq_u32(acc_left, acc_right));
}
124
// Sum of squared differences for a 16-wide block of `height` rows.
//
// Two rows are consumed per iteration, each feeding its own accumulator; the
// accumulators are added together before the final reduction. The loop only
// terminates when the row counter reaches exactly zero, so `height` is
// expected to be a positive, even number.
static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int height) {
  uint32x4_t acc_row0 = vdupq_n_u32(0);
  uint32x4_t acc_row1 = vdupq_n_u32(0);
  int rows_left = height;

  do {
    const uint8_t *const src_next = src + src_stride;
    const uint8_t *const ref_next = ref + ref_stride;

    sse_16x1_neon(src, ref, &acc_row0);
    sse_16x1_neon(src_next, ref_next, &acc_row1);

    src = src_next + src_stride;
    ref = ref_next + ref_stride;
    rows_left -= 2;
  } while (rows_left != 0);

  return horizontal_add_uint32x4(vaddq_u32(acc_row0, acc_row1));
}
143
// Sum of squared differences for an 8-wide block of `height` rows.
//
// One 8-byte row is accumulated per iteration. The do/while loop always
// processes at least one row, so `height` is expected to be positive.
static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int height) {
  uint32x4_t acc = vdupq_n_u32(0);
  int rows_left = height;

  do {
    sse_8x1_neon(src, ref, &acc);

    src += src_stride;
    ref += ref_stride;
  } while (--rows_left != 0);

  return horizontal_add_uint32x4(acc);
}
159
// Sum of squared differences for a 4-wide block of `height` rows.
//
// Rows are consumed two at a time through the 4x2 helper. The loop only
// terminates when the row counter reaches exactly zero, so `height` is
// expected to be a positive, even number.
static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int height) {
  uint32x4_t acc = vdupq_n_u32(0);
  int rows_left = height;

  do {
    sse_4x2_neon(src, src_stride, ref, ref_stride, &acc);

    src += 2 * src_stride;
    ref += 2 * ref_stride;
    rows_left -= 2;
  } while (rows_left != 0);

  return horizontal_add_uint32x4(acc);
}
176
// Sum of squared differences between the `width` x `height` blocks at `src`
// and `ref`.
//
// Widths 4, 8, 16, 32 and 64 are routed to their dedicated kernels; any other
// width falls back to the generic kernel. The 32-bit kernel result is widened
// to int64_t on return.
int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
                     int ref_stride, int width, int height) {
  uint32_t sse;

  switch (width) {
    case 4:
      sse = sse_4xh_neon(src, src_stride, ref, ref_stride, height);
      break;
    case 8:
      sse = sse_8xh_neon(src, src_stride, ref, ref_stride, height);
      break;
    case 16:
      sse = sse_16xh_neon(src, src_stride, ref, ref_stride, height);
      break;
    case 32:
      sse = sse_32xh_neon(src, src_stride, ref, ref_stride, height);
      break;
    case 64:
      sse = sse_64xh_neon(src, src_stride, ref, ref_stride, height);
      break;
    default:
      sse = sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
      break;
  }

  return sse;
}
189