xref: /aosp_15_r20/external/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include <assert.h>
13 
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/arm/mem_neon.h"
18 #include "vpx_dsp/arm/sum_neon.h"
19 
// Accumulates the sum of absolute differences between the 16 bytes of |src|
// and the 16 bytes of |ref| into |*sad_sum|.
//
// The per-byte absolute differences are dotted with a vector of ones, so each
// 32-bit lane of |*sad_sum| grows by the sum of the four absolute differences
// that belong to that lane. The lanes are not reduced here; callers combine
// them once all rows have been processed.
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
                              uint32x4_t *const sad_sum) {
  const uint8x16_t ones = vdupq_n_u8(1);
  const uint8x16_t diff = vabdq_u8(src, ref);
  const uint32x4_t acc = *sad_sum;

  *sad_sum = vdotq_u32(acc, diff, ones);
}
25 
// Computes the sum of absolute differences between a 64-pixel-wide source
// block of |h| rows and each of the four reference blocks in |ref|, writing
// the four results to |res| (res[k] corresponds to ref[k]).
//
// |src_stride| and |ref_stride| are the distances, in bytes, between the
// starts of consecutive rows; all four references share |ref_stride|.
// The loop body always runs at least once, so |h| is expected to be >= 1.
static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
                                           const uint8_t *const ref[4],
                                           int ref_stride, uint32_t res[4],
                                           int h) {
  // Two accumulators per reference: the 16-byte columns of a row alternate
  // between them (columns 0 and 2 -> acc_a, columns 1 and 3 -> acc_b).
  // NOTE(review): presumably this shortens the dependency chain on each
  // accumulator register - confirm before restructuring.
  uint32x4_t acc_a[4], acc_b[4], total[4];
  int k;
  int row = 0;

  for (k = 0; k < 4; ++k) {
    acc_a[k] = vdupq_n_u32(0);
    acc_b[k] = vdupq_n_u32(0);
  }

  do {
    const int ref_off = row * ref_stride;
    const uint8_t *const s = src + row * src_stride;
    const uint8_t *const r0 = ref[0] + ref_off;
    const uint8_t *const r1 = ref[1] + ref_off;
    const uint8_t *const r2 = ref[2] + ref_off;
    const uint8_t *const r3 = ref[3] + ref_off;
    uint8x16_t blk;

    // Bytes 0..15 of the row.
    blk = vld1q_u8(s);
    sad16_neon(blk, vld1q_u8(r0), &acc_a[0]);
    sad16_neon(blk, vld1q_u8(r1), &acc_a[1]);
    sad16_neon(blk, vld1q_u8(r2), &acc_a[2]);
    sad16_neon(blk, vld1q_u8(r3), &acc_a[3]);

    // Bytes 16..31 of the row.
    blk = vld1q_u8(s + 16);
    sad16_neon(blk, vld1q_u8(r0 + 16), &acc_b[0]);
    sad16_neon(blk, vld1q_u8(r1 + 16), &acc_b[1]);
    sad16_neon(blk, vld1q_u8(r2 + 16), &acc_b[2]);
    sad16_neon(blk, vld1q_u8(r3 + 16), &acc_b[3]);

    // Bytes 32..47 of the row.
    blk = vld1q_u8(s + 32);
    sad16_neon(blk, vld1q_u8(r0 + 32), &acc_a[0]);
    sad16_neon(blk, vld1q_u8(r1 + 32), &acc_a[1]);
    sad16_neon(blk, vld1q_u8(r2 + 32), &acc_a[2]);
    sad16_neon(blk, vld1q_u8(r3 + 32), &acc_a[3]);

    // Bytes 48..63 of the row.
    blk = vld1q_u8(s + 48);
    sad16_neon(blk, vld1q_u8(r0 + 48), &acc_b[0]);
    sad16_neon(blk, vld1q_u8(r1 + 48), &acc_b[1]);
    sad16_neon(blk, vld1q_u8(r2 + 48), &acc_b[2]);
    sad16_neon(blk, vld1q_u8(r3 + 48), &acc_b[3]);
  } while (++row < h);

  // Merge the two partial accumulators of every reference.
  for (k = 0; k < 4; ++k) {
    total[k] = vaddq_u32(acc_a[k], acc_b[k]);
  }

  // horizontal_add_4d_uint32x4() comes from sum_neon.h; it is expected to
  // collapse each of the four vectors into one lane of the stored result.
  vst1q_u32(res, horizontal_add_4d_uint32x4(total));
}
73 
// Computes the sum of absolute differences between a 32-pixel-wide source
// block of |h| rows and each of the four reference blocks in |ref|, writing
// the four results to |res| (res[k] corresponds to ref[k]).
//
// |src_stride| and |ref_stride| are the distances, in bytes, between the
// starts of consecutive rows; all four references share |ref_stride|.
// The loop body always runs at least once, so |h| is expected to be >= 1.
static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
                                           const uint8_t *const ref[4],
                                           int ref_stride, uint32_t res[4],
                                           int h) {
  // One accumulator per 16-byte half of the row, for every reference:
  // acc_left covers bytes 0..15, acc_right covers bytes 16..31.
  uint32x4_t acc_left[4], acc_right[4], total[4];
  int k;
  int row = 0;

  for (k = 0; k < 4; ++k) {
    acc_left[k] = vdupq_n_u32(0);
    acc_right[k] = vdupq_n_u32(0);
  }

  do {
    const int ref_off = row * ref_stride;
    const uint8_t *const s = src + row * src_stride;
    const uint8_t *const r0 = ref[0] + ref_off;
    const uint8_t *const r1 = ref[1] + ref_off;
    const uint8_t *const r2 = ref[2] + ref_off;
    const uint8_t *const r3 = ref[3] + ref_off;
    uint8x16_t blk;

    // Left half of the row.
    blk = vld1q_u8(s);
    sad16_neon(blk, vld1q_u8(r0), &acc_left[0]);
    sad16_neon(blk, vld1q_u8(r1), &acc_left[1]);
    sad16_neon(blk, vld1q_u8(r2), &acc_left[2]);
    sad16_neon(blk, vld1q_u8(r3), &acc_left[3]);

    // Right half of the row.
    blk = vld1q_u8(s + 16);
    sad16_neon(blk, vld1q_u8(r0 + 16), &acc_right[0]);
    sad16_neon(blk, vld1q_u8(r1 + 16), &acc_right[1]);
    sad16_neon(blk, vld1q_u8(r2 + 16), &acc_right[2]);
    sad16_neon(blk, vld1q_u8(r3 + 16), &acc_right[3]);
  } while (++row < h);

  // Merge the two halves of every reference.
  for (k = 0; k < 4; ++k) {
    total[k] = vaddq_u32(acc_left[k], acc_right[k]);
  }

  // horizontal_add_4d_uint32x4() comes from sum_neon.h; it is expected to
  // collapse each of the four vectors into one lane of the stored result.
  vst1q_u32(res, horizontal_add_4d_uint32x4(total));
}
109 
// Computes the sum of absolute differences between a 16-pixel-wide source
// block of |h| rows and each of the four reference blocks in |ref|, writing
// the four results to |res| (res[k] corresponds to ref[k]).
//
// |src_stride| and |ref_stride| are the distances, in bytes, between the
// starts of consecutive rows; all four references share |ref_stride|.
// The loop body always runs at least once, so |h| is expected to be >= 1.
static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
                                           const uint8_t *const ref[4],
                                           int ref_stride, uint32_t res[4],
                                           int h) {
  uint32x4_t acc[4];
  int k;
  int row = 0;

  for (k = 0; k < 4; ++k) {
    acc[k] = vdupq_n_u32(0);
  }

  do {
    const int ref_off = row * ref_stride;
    // A row is exactly one 16-byte vector, loaded once and shared by all
    // four references.
    const uint8x16_t src_row = vld1q_u8(src + row * src_stride);

    sad16_neon(src_row, vld1q_u8(ref[0] + ref_off), &acc[0]);
    sad16_neon(src_row, vld1q_u8(ref[1] + ref_off), &acc[1]);
    sad16_neon(src_row, vld1q_u8(ref[2] + ref_off), &acc[2]);
    sad16_neon(src_row, vld1q_u8(ref[3] + ref_off), &acc[3]);
  } while (++row < h);

  // horizontal_add_4d_uint32x4() comes from sum_neon.h; it is expected to
  // collapse each of the four vectors into one lane of the stored result.
  vst1q_u32(res, horizontal_add_4d_uint32x4(acc));
}
129 
// Defines the public entry point vpx_sad<width>x<height>x4d_neon_dotprod().
// Each generated function forwards its arguments to the helper for that block
// width, passing the block height as a compile-time constant.
#define SAD_WXH_4D_NEON_DOTPROD(width, height)                        \
  void vpx_sad##width##x##height##x4d_neon_dotprod(                   \
      const uint8_t *src_ptr, int src_stride,                         \
      const uint8_t *const ref_array[4], int ref_stride,              \
      uint32_t sad_array[4]) {                                        \
    sad##width##xhx4d_neon_dotprod(src_ptr, src_stride, ref_array,    \
                                   ref_stride, sad_array, (height));  \
  }

// 16-pixel-wide blocks.
SAD_WXH_4D_NEON_DOTPROD(16, 8)
SAD_WXH_4D_NEON_DOTPROD(16, 16)
SAD_WXH_4D_NEON_DOTPROD(16, 32)

// 32-pixel-wide blocks.
SAD_WXH_4D_NEON_DOTPROD(32, 16)
SAD_WXH_4D_NEON_DOTPROD(32, 32)
SAD_WXH_4D_NEON_DOTPROD(32, 64)

// 64-pixel-wide blocks.
SAD_WXH_4D_NEON_DOTPROD(64, 32)
SAD_WXH_4D_NEON_DOTPROD(64, 64)

#undef SAD_WXH_4D_NEON_DOTPROD
151 
// Defines the public entry point
// vpx_sad_skip_<width>x<height>x4d_neon_dotprod().
//
// Each generated function visits only every other row: it starts at the first
// row, doubles both strides and halves the row count. Every result is then
// doubled, which scales the half-height sums back to the range of a
// full-height SAD.
#define SAD_SKIP_WXH_4D_NEON_DOTPROD(width, height)                          \
  void vpx_sad_skip_##width##x##height##x4d_neon_dotprod(                    \
      const uint8_t *src_ptr, int src_stride,                                \
      const uint8_t *const ref_array[4], int ref_stride,                     \
      uint32_t sad_array[4]) {                                               \
    int k;                                                                   \
    sad##width##xhx4d_neon_dotprod(src_ptr, src_stride * 2, ref_array,       \
                                   ref_stride * 2, sad_array, (height) / 2); \
    for (k = 0; k < 4; ++k) {                                                \
      sad_array[k] *= 2;                                                     \
    }                                                                        \
  }

// 16-pixel-wide blocks.
SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8)
SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16)
SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32)

// 32-pixel-wide blocks.
SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16)
SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32)
SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64)

// 64-pixel-wide blocks.
SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32)
SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64)

#undef SAD_SKIP_WXH_4D_NEON_DOTPROD
177