xref: /aosp_15_r20/external/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vpx_dsp_rtcd.h"
14 #include "./vpx_config.h"
15 
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/arm/mem_neon.h"
18 #include "vpx_dsp/arm/sum_neon.h"
19 #include "vpx_ports/mem.h"
20 
// Returns the sum of squared differences between an 8-sample-wide source
// block and reference block that are |h| rows tall.
//
// Samples are read as uint16_t and narrowed with vmovn_u16(), which keeps only
// the low 8 bits of each lane, so the result is meaningful only when every
// sample fits in 8 bits. Strides are expressed in uint16_t elements.
//
// Two rows are consumed per pass and the loop body always runs at least once,
// so |h| must be a positive even number.
static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
                                                    int src_stride,
                                                    const uint16_t *ref_ptr,
                                                    int ref_stride, int h) {
  uint32x4_t acc = vdupq_n_u32(0);
  int row_pairs = h / 2;

  do {
    // Fetch two consecutive rows from each block.
    const uint16x8_t src_row0 = vld1q_u16(src_ptr);
    const uint16x8_t src_row1 = vld1q_u16(src_ptr + src_stride);
    const uint16x8_t ref_row0 = vld1q_u16(ref_ptr);
    const uint16x8_t ref_row1 = vld1q_u16(ref_ptr + ref_stride);

    // Pack both rows into a single 16-byte vector per block.
    const uint8x16_t src_u8 =
        vcombine_u8(vmovn_u16(src_row0), vmovn_u16(src_row1));
    const uint8x16_t ref_u8 =
        vcombine_u8(vmovn_u16(ref_row0), vmovn_u16(ref_row1));

    const uint8x16_t abs_diff = vabdq_u8(src_u8, ref_u8);

    // Dotting the difference with itself accumulates the squared differences.
    acc = vdotq_u32(acc, abs_diff, abs_diff);

    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
  } while (--row_pairs != 0);

  return horizontal_add_uint32x4(acc);
}
50 
// Returns the sum of squared differences between a 16-sample-wide source
// block and reference block that are |h| rows tall.
//
// Samples are read as uint16_t and narrowed with vmovn_u16(), which keeps only
// the low 8 bits of each lane, so the result is meaningful only when every
// sample fits in 8 bits. Strides are expressed in uint16_t elements.
//
// One full row is consumed per pass and the loop body always runs at least
// once, so |h| must be positive.
static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
                                                     int src_stride,
                                                     const uint16_t *ref_ptr,
                                                     int ref_stride, int h) {
  uint32x4_t acc = vdupq_n_u32(0);
  int rows_left = h;

  do {
    // Each row is loaded as a low half and a high half of eight samples.
    const uint16x8_t src_lo = vld1q_u16(src_ptr);
    const uint16x8_t src_hi = vld1q_u16(src_ptr + 8);
    const uint16x8_t ref_lo = vld1q_u16(ref_ptr);
    const uint16x8_t ref_hi = vld1q_u16(ref_ptr + 8);

    // Narrow the whole row into one 16-byte vector per block.
    const uint8x16_t src_u8 = vcombine_u8(vmovn_u16(src_lo), vmovn_u16(src_hi));
    const uint8x16_t ref_u8 = vcombine_u8(vmovn_u16(ref_lo), vmovn_u16(ref_hi));

    const uint8x16_t abs_diff = vabdq_u8(src_u8, ref_u8);

    // Dotting the difference with itself accumulates the squared differences.
    acc = vdotq_u32(acc, abs_diff, abs_diff);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--rows_left != 0);

  return horizontal_add_uint32x4(acc);
}
79 
// Defines vpx_highbd_8_mse<w>x<h>_neon_dotprod(). The generated function
// obtains uint16_t sample pointers from its uint8_t-typed arguments via
// CONVERT_TO_SHORTPTR, forwards them to the width-specific helper with a fixed
// height of |h|, stores the helper's result in *sse and returns that value.
#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h)                             \
  uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod(                  \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
      int ref_stride, uint32_t *sse) {                                \
    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);       \
    const uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);       \
    const uint32_t total = highbd_mse8_##w##xh_neon_dotprod(          \
        src16, src_stride, ref16, ref_stride, h);                     \
    *sse = total;                                                     \
    return total;                                                     \
  }

// 8-sample-wide blocks.
HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)

// 16-sample-wide blocks.
HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)

#undef HIGHBD_MSE_WXH_NEON_DOTPROD
97