xref: /aosp_15_r20/external/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include <assert.h>
13 
14 #include "./vp9_rtcd.h"
15 #include "vpx_dsp/arm/mem_neon.h"
16 #include "vpx_dsp/arm/sum_neon.h"
17 #include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
18 
vp9_block_error_sve(const tran_low_t * coeff,const tran_low_t * dqcoeff,intptr_t block_size,int64_t * ssz)19 int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
20                             intptr_t block_size, int64_t *ssz) {
21   int64x2_t err_v = vdupq_n_s64(0);
22   int64x2_t ssz_v = vdupq_n_s64(0);
23 
24   assert(block_size >= 16);
25   assert((block_size % 16) == 0);
26 
27   do {
28     const int16x8_t c0 = load_tran_low_to_s16q(coeff);
29     const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
30 
31     const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
32     const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
33 
34     const int16x8_t diff0 = vabdq_s16(c0, d0);
35     const int16x8_t diff1 = vabdq_s16(c1, d1);
36 
37     err_v = vpx_dotq_s16(err_v, diff0, diff0);
38     err_v = vpx_dotq_s16(err_v, diff1, diff1);
39 
40     ssz_v = vpx_dotq_s16(ssz_v, c0, c0);
41     ssz_v = vpx_dotq_s16(ssz_v, c1, c1);
42 
43     coeff += 16;
44     dqcoeff += 16;
45     block_size -= 16;
46   } while (block_size != 0);
47 
48   *ssz = horizontal_add_int64x2(ssz_v);
49   return horizontal_add_int64x2(err_v);
50 }
51 
vp9_block_error_fp_sve(const tran_low_t * coeff,const tran_low_t * dqcoeff,int block_size)52 int64_t vp9_block_error_fp_sve(const tran_low_t *coeff,
53                                const tran_low_t *dqcoeff, int block_size) {
54   int64x2_t err = vdupq_n_s64(0);
55 
56   assert(block_size >= 16);
57   assert((block_size % 16) == 0);
58 
59   do {
60     const int16x8_t c0 = load_tran_low_to_s16q(coeff);
61     const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
62 
63     const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
64     const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
65 
66     const int16x8_t diff0 = vabdq_s16(c0, d0);
67     const int16x8_t diff1 = vabdq_s16(c1, d1);
68 
69     err = vpx_dotq_s16(err, diff0, diff0);
70     err = vpx_dotq_s16(err, diff1, diff1);
71 
72     coeff += 16;
73     dqcoeff += 16;
74     block_size -= 16;
75   } while (block_size != 0);
76 
77   return horizontal_add_int64x2(err);
78 }
79