xref: /aosp_15_r20/external/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include <assert.h>
13 
14 #include "./vp9_rtcd.h"
15 #include "vpx_dsp/arm/mem_neon.h"
16 #include "vpx_dsp/arm/sum_neon.h"
17 
// Computes the block error (sum of squared differences between coeff and
// dqcoeff) and the sum of squared coefficients (*ssz), both rounded and
// right-shifted by 2*(bd-8) so results are normalized to 8-bit depth.
// Returns the shifted error; block_size must be a positive multiple of 16.
int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff,
                                    const tran_low_t *dqcoeff,
                                    intptr_t block_size, int64_t *ssz, int bd) {
  // Round-to-nearest down-shift by 2*(bd-8); for bd == 8 this is a no-op
  // (shift == 0, rounding == 0).
  const int shift = 2 * (bd - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;

  uint64x2_t sse_acc = vdupq_n_u64(0);     // accumulates |coeff - dqcoeff|^2
  int64x2_t sqcoeff_acc = vdupq_n_s64(0);  // accumulates coeff^2

  assert(block_size >= 16);
  assert((block_size % 16) == 0);

  do {
    const int32x4_t co = load_tran_low_to_s32q(coeff);
    const int32x4_t dq = load_tran_low_to_s32q(dqcoeff);

    // The absolute difference is non-negative, so it can be reinterpreted as
    // unsigned and squared with the unsigned widening multiply-accumulate
    // without risk of sign issues.
    const uint32x4_t abs_diff = vreinterpretq_u32_s32(vabdq_s32(co, dq));

    sse_acc =
        vmlal_u32(sse_acc, vget_high_u32(abs_diff), vget_high_u32(abs_diff));
    sse_acc =
        vmlal_u32(sse_acc, vget_low_u32(abs_diff), vget_low_u32(abs_diff));

    sqcoeff_acc = vmlal_s32(sqcoeff_acc, vget_high_s32(co), vget_high_s32(co));
    sqcoeff_acc = vmlal_s32(sqcoeff_acc, vget_low_s32(co), vget_low_s32(co));

    coeff += 4;
    dqcoeff += 4;
    block_size -= 4;
  } while (block_size != 0);

  *ssz = (horizontal_add_int64x2(sqcoeff_acc) + rounding) >> shift;
  return ((int64_t)horizontal_add_uint64x2(sse_acc) + rounding) >> shift;
}
50