1 /*
2 * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <assert.h>
13
14 #include "./vp9_rtcd.h"
15 #include "vpx_dsp/arm/mem_neon.h"
16 #include "vpx_dsp/arm/sum_neon.h"
17 #include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
18
vp9_block_error_sve(const tran_low_t * coeff,const tran_low_t * dqcoeff,intptr_t block_size,int64_t * ssz)19 int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
20 intptr_t block_size, int64_t *ssz) {
21 int64x2_t err_v = vdupq_n_s64(0);
22 int64x2_t ssz_v = vdupq_n_s64(0);
23
24 assert(block_size >= 16);
25 assert((block_size % 16) == 0);
26
27 do {
28 const int16x8_t c0 = load_tran_low_to_s16q(coeff);
29 const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
30
31 const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
32 const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
33
34 const int16x8_t diff0 = vabdq_s16(c0, d0);
35 const int16x8_t diff1 = vabdq_s16(c1, d1);
36
37 err_v = vpx_dotq_s16(err_v, diff0, diff0);
38 err_v = vpx_dotq_s16(err_v, diff1, diff1);
39
40 ssz_v = vpx_dotq_s16(ssz_v, c0, c0);
41 ssz_v = vpx_dotq_s16(ssz_v, c1, c1);
42
43 coeff += 16;
44 dqcoeff += 16;
45 block_size -= 16;
46 } while (block_size != 0);
47
48 *ssz = horizontal_add_int64x2(ssz_v);
49 return horizontal_add_int64x2(err_v);
50 }
51
vp9_block_error_fp_sve(const tran_low_t * coeff,const tran_low_t * dqcoeff,int block_size)52 int64_t vp9_block_error_fp_sve(const tran_low_t *coeff,
53 const tran_low_t *dqcoeff, int block_size) {
54 int64x2_t err = vdupq_n_s64(0);
55
56 assert(block_size >= 16);
57 assert((block_size % 16) == 0);
58
59 do {
60 const int16x8_t c0 = load_tran_low_to_s16q(coeff);
61 const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
62
63 const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
64 const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
65
66 const int16x8_t diff0 = vabdq_s16(c0, d0);
67 const int16x8_t diff1 = vabdq_s16(c1, d1);
68
69 err = vpx_dotq_s16(err, diff0, diff0);
70 err = vpx_dotq_s16(err, diff1, diff1);
71
72 coeff += 16;
73 dqcoeff += 16;
74 block_size -= 16;
75 } while (block_size != 0);
76
77 return horizontal_add_int64x2(err);
78 }
79