1 /*
2 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_dsp_rtcd.h"
14 #include "./vpx_config.h"
15
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/arm/mem_neon.h"
18 #include "vpx_dsp/arm/sum_neon.h"
19 #include "vpx_ports/mem.h"
20
highbd_mse8_8xh_neon_dotprod(const uint16_t * src_ptr,int src_stride,const uint16_t * ref_ptr,int ref_stride,int h)21 static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
22 int src_stride,
23 const uint16_t *ref_ptr,
24 int ref_stride, int h) {
25 uint32x4_t sse_u32 = vdupq_n_u32(0);
26
27 int i = h / 2;
28 do {
29 uint16x8_t s0, s1, r0, r1;
30 uint8x16_t s, r, diff;
31
32 s0 = vld1q_u16(src_ptr);
33 src_ptr += src_stride;
34 s1 = vld1q_u16(src_ptr);
35 src_ptr += src_stride;
36 r0 = vld1q_u16(ref_ptr);
37 ref_ptr += ref_stride;
38 r1 = vld1q_u16(ref_ptr);
39 ref_ptr += ref_stride;
40
41 s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
42 r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
43
44 diff = vabdq_u8(s, r);
45 sse_u32 = vdotq_u32(sse_u32, diff, diff);
46 } while (--i != 0);
47
48 return horizontal_add_uint32x4(sse_u32);
49 }
50
highbd_mse8_16xh_neon_dotprod(const uint16_t * src_ptr,int src_stride,const uint16_t * ref_ptr,int ref_stride,int h)51 static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
52 int src_stride,
53 const uint16_t *ref_ptr,
54 int ref_stride, int h) {
55 uint32x4_t sse_u32 = vdupq_n_u32(0);
56
57 int i = h;
58 do {
59 uint16x8_t s0, s1, r0, r1;
60 uint8x16_t s, r, diff;
61
62 s0 = vld1q_u16(src_ptr);
63 s1 = vld1q_u16(src_ptr + 8);
64 r0 = vld1q_u16(ref_ptr);
65 r1 = vld1q_u16(ref_ptr + 8);
66
67 s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
68 r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
69
70 diff = vabdq_u8(s, r);
71 sse_u32 = vdotq_u32(sse_u32, diff, diff);
72
73 src_ptr += src_stride;
74 ref_ptr += ref_stride;
75 } while (--i != 0);
76
77 return horizontal_add_uint32x4(sse_u32);
78 }
79
/*
 * Generates vpx_highbd_8_mse<w>x<h>_neon_dotprod(). Each generated function
 * passes its byte-typed buffer pointers through CONVERT_TO_SHORTPTR, calls
 * highbd_mse8_<w>xh_neon_dotprod() with the fixed height h, stores the result
 * in *sse and also returns it.
 */
#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h)                                   \
  uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod(                        \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
      int ref_stride, uint32_t *sse) {                                      \
    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);             \
    const uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);             \
    const uint32_t total = highbd_mse8_##w##xh_neon_dotprod(                \
        src16, src_stride, ref16, ref_stride, h);                           \
    *sse = total;                                                           \
    return total;                                                           \
  }

/* Block sizes provided by this file. */
HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)

#undef HIGHBD_MSE_WXH_NEON_DOTPROD
97