xref: /aosp_15_r20/external/libaom/av1/encoder/arm/encodetxb_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker  *
4*77c1e3ccSAndroid Build Coastguard Worker  * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker  * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker  * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker  */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
13*77c1e3ccSAndroid Build Coastguard Worker #include <assert.h>
14*77c1e3ccSAndroid Build Coastguard Worker #include <math.h>
15*77c1e3ccSAndroid Build Coastguard Worker 
16*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
17*77c1e3ccSAndroid Build Coastguard Worker 
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/mem_neon.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/txb_common.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "av1/encoder/encodetxb.h"
21*77c1e3ccSAndroid Build Coastguard Worker 
av1_txb_init_levels_neon(const tran_low_t * const coeff,const int width,const int height,uint8_t * const levels)22*77c1e3ccSAndroid Build Coastguard Worker void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
23*77c1e3ccSAndroid Build Coastguard Worker                               const int height, uint8_t *const levels) {
24*77c1e3ccSAndroid Build Coastguard Worker   const int stride = height + TX_PAD_HOR;
25*77c1e3ccSAndroid Build Coastguard Worker   memset(levels - TX_PAD_TOP * stride, 0,
26*77c1e3ccSAndroid Build Coastguard Worker          sizeof(*levels) * TX_PAD_TOP * stride);
27*77c1e3ccSAndroid Build Coastguard Worker   memset(levels + stride * width, 0,
28*77c1e3ccSAndroid Build Coastguard Worker          sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
29*77c1e3ccSAndroid Build Coastguard Worker 
30*77c1e3ccSAndroid Build Coastguard Worker   const int32x4_t zeros = vdupq_n_s32(0);
31*77c1e3ccSAndroid Build Coastguard Worker   int i = 0;
32*77c1e3ccSAndroid Build Coastguard Worker   uint8_t *ls = levels;
33*77c1e3ccSAndroid Build Coastguard Worker   const tran_low_t *cf = coeff;
34*77c1e3ccSAndroid Build Coastguard Worker   if (height == 4) {
35*77c1e3ccSAndroid Build Coastguard Worker     do {
36*77c1e3ccSAndroid Build Coastguard Worker       const int32x4_t coeffA = vld1q_s32(cf);
37*77c1e3ccSAndroid Build Coastguard Worker       const int32x4_t coeffB = vld1q_s32(cf + height);
38*77c1e3ccSAndroid Build Coastguard Worker       const int16x8_t coeffAB =
39*77c1e3ccSAndroid Build Coastguard Worker           vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
40*77c1e3ccSAndroid Build Coastguard Worker       const int16x8_t absAB = vqabsq_s16(coeffAB);
41*77c1e3ccSAndroid Build Coastguard Worker       const int8x8_t absABs = vqmovn_s16(absAB);
42*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
43*77c1e3ccSAndroid Build Coastguard Worker       const int8x16_t absAB8 =
44*77c1e3ccSAndroid Build Coastguard Worker           vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
45*77c1e3ccSAndroid Build Coastguard Worker       const uint8x16_t lsAB =
46*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
47*77c1e3ccSAndroid Build Coastguard Worker #else
48*77c1e3ccSAndroid Build Coastguard Worker       const int32x2x2_t absAB8 =
49*77c1e3ccSAndroid Build Coastguard Worker           vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
50*77c1e3ccSAndroid Build Coastguard Worker       const uint8x16_t lsAB =
51*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
52*77c1e3ccSAndroid Build Coastguard Worker #endif
53*77c1e3ccSAndroid Build Coastguard Worker       vst1q_u8(ls, lsAB);
54*77c1e3ccSAndroid Build Coastguard Worker       ls += (stride << 1);
55*77c1e3ccSAndroid Build Coastguard Worker       cf += (height << 1);
56*77c1e3ccSAndroid Build Coastguard Worker       i += 2;
57*77c1e3ccSAndroid Build Coastguard Worker     } while (i < width);
58*77c1e3ccSAndroid Build Coastguard Worker   } else if (height == 8) {
59*77c1e3ccSAndroid Build Coastguard Worker     do {
60*77c1e3ccSAndroid Build Coastguard Worker       const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
61*77c1e3ccSAndroid Build Coastguard Worker       const int16x8_t absAB = vqabsq_s16(coeffAB);
62*77c1e3ccSAndroid Build Coastguard Worker       const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
63*77c1e3ccSAndroid Build Coastguard Worker           vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
64*77c1e3ccSAndroid Build Coastguard Worker       vst1q_u8(ls, absAB8);
65*77c1e3ccSAndroid Build Coastguard Worker       ls += stride;
66*77c1e3ccSAndroid Build Coastguard Worker       cf += height;
67*77c1e3ccSAndroid Build Coastguard Worker       i += 1;
68*77c1e3ccSAndroid Build Coastguard Worker     } while (i < width);
69*77c1e3ccSAndroid Build Coastguard Worker   } else {
70*77c1e3ccSAndroid Build Coastguard Worker     do {
71*77c1e3ccSAndroid Build Coastguard Worker       int j = 0;
72*77c1e3ccSAndroid Build Coastguard Worker       do {
73*77c1e3ccSAndroid Build Coastguard Worker         const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
74*77c1e3ccSAndroid Build Coastguard Worker         const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8);
75*77c1e3ccSAndroid Build Coastguard Worker         const int16x8_t absAB = vqabsq_s16(coeffAB);
76*77c1e3ccSAndroid Build Coastguard Worker         const int16x8_t absCD = vqabsq_s16(coeffCD);
77*77c1e3ccSAndroid Build Coastguard Worker         const uint8x16_t absABCD = vreinterpretq_u8_s8(
78*77c1e3ccSAndroid Build Coastguard Worker             vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
79*77c1e3ccSAndroid Build Coastguard Worker         vst1q_u8((ls + j), absABCD);
80*77c1e3ccSAndroid Build Coastguard Worker         j += 16;
81*77c1e3ccSAndroid Build Coastguard Worker         cf += 16;
82*77c1e3ccSAndroid Build Coastguard Worker       } while (j < height);
83*77c1e3ccSAndroid Build Coastguard Worker       *(int32_t *)(ls + height) = 0;
84*77c1e3ccSAndroid Build Coastguard Worker       ls += stride;
85*77c1e3ccSAndroid Build Coastguard Worker       i += 1;
86*77c1e3ccSAndroid Build Coastguard Worker     } while (i < width);
87*77c1e3ccSAndroid Build Coastguard Worker   }
88*77c1e3ccSAndroid Build Coastguard Worker }
89*77c1e3ccSAndroid Build Coastguard Worker 
90*77c1e3ccSAndroid Build Coastguard Worker // get_4_nz_map_contexts_2d coefficients:
91*77c1e3ccSAndroid Build Coastguard Worker static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
92*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
93*77c1e3ccSAndroid Build Coastguard Worker   { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21 }
94*77c1e3ccSAndroid Build Coastguard Worker };
95*77c1e3ccSAndroid Build Coastguard Worker 
// get_4_nz_map_contexts_ver coefficients:
/* clang-format off */
// Four context offsets packed into one 32-bit constant:
//   bits  0-7 : SIG_COEF_CONTEXTS_2D + 0
//   bits  8-15: SIG_COEF_CONTEXTS_2D + 5
//   bits 16-23: SIG_COEF_CONTEXTS_2D + 10
//   bits 24-31: SIG_COEF_CONTEXTS_2D + 10
// get_4_nz_map_contexts_ver() broadcasts it to every 32-bit lane.
#define SIG_COEF_CONTEXTS_2D_X4_051010     \
  (SIG_COEF_CONTEXTS_2D +                  \
   ((SIG_COEF_CONTEXTS_2D + 5) << 8) +     \
   ((SIG_COEF_CONTEXTS_2D + 10) << 16) +   \
   ((SIG_COEF_CONTEXTS_2D + 10) << 24))
/* clang-format on */
102*77c1e3ccSAndroid Build Coastguard Worker 
// get_4_nz_map_contexts_hor coefficients:
104*77c1e3ccSAndroid Build Coastguard Worker static const DECLARE_ALIGNED(16, uint8_t, c_4_po_hor[16]) = {
105*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
106*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
107*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
108*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
109*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
110*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
111*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
112*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
113*77c1e3ccSAndroid Build Coastguard Worker };
114*77c1e3ccSAndroid Build Coastguard Worker 
115*77c1e3ccSAndroid Build Coastguard Worker // get_8_coeff_contexts_2d coefficients:
116*77c1e3ccSAndroid Build Coastguard Worker // if (width == 8)
117*77c1e3ccSAndroid Build Coastguard Worker static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
118*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
119*77c1e3ccSAndroid Build Coastguard Worker   { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
120*77c1e3ccSAndroid Build Coastguard Worker };
121*77c1e3ccSAndroid Build Coastguard Worker // if (width < 8)
122*77c1e3ccSAndroid Build Coastguard Worker static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
123*77c1e3ccSAndroid Build Coastguard Worker   { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21 },
124*77c1e3ccSAndroid Build Coastguard Worker   { 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21 }
125*77c1e3ccSAndroid Build Coastguard Worker };
126*77c1e3ccSAndroid Build Coastguard Worker 
127*77c1e3ccSAndroid Build Coastguard Worker // if (width > 8)
128*77c1e3ccSAndroid Build Coastguard Worker static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
129*77c1e3ccSAndroid Build Coastguard Worker   { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
130*77c1e3ccSAndroid Build Coastguard Worker   { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
131*77c1e3ccSAndroid Build Coastguard Worker };
132*77c1e3ccSAndroid Build Coastguard Worker 
// get_8_coeff_contexts_ver coefficients:
134*77c1e3ccSAndroid Build Coastguard Worker static const DECLARE_ALIGNED(16, uint8_t, c_8_po_ver[16]) = {
135*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
136*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
137*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
138*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
139*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
140*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
141*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
142*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
143*77c1e3ccSAndroid Build Coastguard Worker };
144*77c1e3ccSAndroid Build Coastguard Worker 
145*77c1e3ccSAndroid Build Coastguard Worker // get_16n_coeff_contexts_2d coefficients:
146*77c1e3ccSAndroid Build Coastguard Worker // real_width == real_height
147*77c1e3ccSAndroid Build Coastguard Worker static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
148*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
149*77c1e3ccSAndroid Build Coastguard Worker   { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
150*77c1e3ccSAndroid Build Coastguard Worker   { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
151*77c1e3ccSAndroid Build Coastguard Worker   { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
152*77c1e3ccSAndroid Build Coastguard Worker };
153*77c1e3ccSAndroid Build Coastguard Worker 
154*77c1e3ccSAndroid Build Coastguard Worker // real_width < real_height
155*77c1e3ccSAndroid Build Coastguard Worker static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
156*77c1e3ccSAndroid Build Coastguard Worker   { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
157*77c1e3ccSAndroid Build Coastguard Worker   { 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
158*77c1e3ccSAndroid Build Coastguard Worker   { 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
159*77c1e3ccSAndroid Build Coastguard Worker };
160*77c1e3ccSAndroid Build Coastguard Worker 
161*77c1e3ccSAndroid Build Coastguard Worker // real_width > real_height
162*77c1e3ccSAndroid Build Coastguard Worker static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
163*77c1e3ccSAndroid Build Coastguard Worker   { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
164*77c1e3ccSAndroid Build Coastguard Worker   { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
165*77c1e3ccSAndroid Build Coastguard Worker   { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
166*77c1e3ccSAndroid Build Coastguard Worker };
167*77c1e3ccSAndroid Build Coastguard Worker 
168*77c1e3ccSAndroid Build Coastguard Worker // get_16n_coeff_contexts_hor coefficients:
169*77c1e3ccSAndroid Build Coastguard Worker static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = {
170*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
171*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
172*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
173*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
174*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
175*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
176*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
177*77c1e3ccSAndroid Build Coastguard Worker   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
178*77c1e3ccSAndroid Build Coastguard Worker };
179*77c1e3ccSAndroid Build Coastguard Worker 
180*77c1e3ccSAndroid Build Coastguard Worker // end of coefficients declaration area
181*77c1e3ccSAndroid Build Coastguard Worker 
load_8bit_4x4_to_1_reg(const uint8_t * const src,const int byte_stride)182*77c1e3ccSAndroid Build Coastguard Worker static inline uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
183*77c1e3ccSAndroid Build Coastguard Worker                                                 const int byte_stride) {
184*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
185*77c1e3ccSAndroid Build Coastguard Worker   uint32x4_t v_data = vld1q_u32((uint32_t *)src);
186*77c1e3ccSAndroid Build Coastguard Worker   v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
187*77c1e3ccSAndroid Build Coastguard Worker   v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
188*77c1e3ccSAndroid Build Coastguard Worker   v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3);
189*77c1e3ccSAndroid Build Coastguard Worker 
190*77c1e3ccSAndroid Build Coastguard Worker   return vreinterpretq_u8_u32(v_data);
191*77c1e3ccSAndroid Build Coastguard Worker #else
192*77c1e3ccSAndroid Build Coastguard Worker   return load_unaligned_u8q(src, byte_stride);
193*77c1e3ccSAndroid Build Coastguard Worker #endif
194*77c1e3ccSAndroid Build Coastguard Worker }
195*77c1e3ccSAndroid Build Coastguard Worker 
load_8bit_8x2_to_1_reg(const uint8_t * const src,const int byte_stride)196*77c1e3ccSAndroid Build Coastguard Worker static inline uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
197*77c1e3ccSAndroid Build Coastguard Worker                                                 const int byte_stride) {
198*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
199*77c1e3ccSAndroid Build Coastguard Worker   uint64x2_t v_data = vld1q_u64((uint64_t *)src);
200*77c1e3ccSAndroid Build Coastguard Worker   v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1);
201*77c1e3ccSAndroid Build Coastguard Worker 
202*77c1e3ccSAndroid Build Coastguard Worker   return vreinterpretq_u8_u64(v_data);
203*77c1e3ccSAndroid Build Coastguard Worker #else
204*77c1e3ccSAndroid Build Coastguard Worker   uint8x8_t v_data_low = vld1_u8(src);
205*77c1e3ccSAndroid Build Coastguard Worker   uint8x8_t v_data_high = vld1_u8(src + byte_stride);
206*77c1e3ccSAndroid Build Coastguard Worker 
207*77c1e3ccSAndroid Build Coastguard Worker   return vcombine_u8(v_data_low, v_data_high);
208*77c1e3ccSAndroid Build Coastguard Worker #endif
209*77c1e3ccSAndroid Build Coastguard Worker }
210*77c1e3ccSAndroid Build Coastguard Worker 
load_8bit_16x1_to_1_reg(const uint8_t * const src,const int byte_stride)211*77c1e3ccSAndroid Build Coastguard Worker static inline uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
212*77c1e3ccSAndroid Build Coastguard Worker                                                  const int byte_stride) {
213*77c1e3ccSAndroid Build Coastguard Worker   (void)byte_stride;
214*77c1e3ccSAndroid Build Coastguard Worker   return vld1q_u8(src);
215*77c1e3ccSAndroid Build Coastguard Worker }
216*77c1e3ccSAndroid Build Coastguard Worker 
// Loads the five neighbour registers for a 4x4 group of positions.
// level[0] starts at src + 1, level[1] at src + stride, and level[2..4] at
// src + offsets[0..2]; each is gathered by load_8bit_4x4_to_1_reg().
static inline void load_levels_4x4x5(const uint8_t *const src, const int stride,
                                     const ptrdiff_t *const offsets,
                                     uint8x16_t *const level) {
  const uint8_t *const starts[5] = { src + 1, src + stride, src + offsets[0],
                                     src + offsets[1], src + offsets[2] };
  for (int k = 0; k < 5; ++k) {
    level[k] = load_8bit_4x4_to_1_reg(starts[k], stride);
  }
}
226*77c1e3ccSAndroid Build Coastguard Worker 
// Loads the five neighbour registers for an 8x2 group of positions.
// level[0] starts at src + 1, level[1] at src + stride, and level[2..4] at
// src + offsets[0..2]; each is gathered by load_8bit_8x2_to_1_reg().
static inline void load_levels_8x2x5(const uint8_t *const src, const int stride,
                                     const ptrdiff_t *const offsets,
                                     uint8x16_t *const level) {
  const uint8_t *const starts[5] = { src + 1, src + stride, src + offsets[0],
                                     src + offsets[1], src + offsets[2] };
  for (int k = 0; k < 5; ++k) {
    level[k] = load_8bit_8x2_to_1_reg(starts[k], stride);
  }
}
236*77c1e3ccSAndroid Build Coastguard Worker 
// Loads the five neighbour registers for a run of 16 positions.
// level[0] starts at src + 1, level[1] at src + stride, and level[2..4] at
// src + offsets[0..2]; each is a contiguous 16-byte load.
static inline void load_levels_16x1x5(const uint8_t *const src,
                                      const int stride,
                                      const ptrdiff_t *const offsets,
                                      uint8x16_t *const level) {
  const uint8_t *const starts[5] = { src + 1, src + stride, src + offsets[0],
                                     src + offsets[1], src + offsets[2] };
  for (int k = 0; k < 5; ++k) {
    level[k] = load_8bit_16x1_to_1_reg(starts[k], stride);
  }
}
247*77c1e3ccSAndroid Build Coastguard Worker 
get_coeff_contexts_kernel(uint8x16_t * const level)248*77c1e3ccSAndroid Build Coastguard Worker static inline uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
249*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t const_3 = vdupq_n_u8(3);
250*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t const_4 = vdupq_n_u8(4);
251*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t count;
252*77c1e3ccSAndroid Build Coastguard Worker 
253*77c1e3ccSAndroid Build Coastguard Worker   count = vminq_u8(level[0], const_3);
254*77c1e3ccSAndroid Build Coastguard Worker   level[1] = vminq_u8(level[1], const_3);
255*77c1e3ccSAndroid Build Coastguard Worker   level[2] = vminq_u8(level[2], const_3);
256*77c1e3ccSAndroid Build Coastguard Worker   level[3] = vminq_u8(level[3], const_3);
257*77c1e3ccSAndroid Build Coastguard Worker   level[4] = vminq_u8(level[4], const_3);
258*77c1e3ccSAndroid Build Coastguard Worker   count = vaddq_u8(count, level[1]);
259*77c1e3ccSAndroid Build Coastguard Worker   count = vaddq_u8(count, level[2]);
260*77c1e3ccSAndroid Build Coastguard Worker   count = vaddq_u8(count, level[3]);
261*77c1e3ccSAndroid Build Coastguard Worker   count = vaddq_u8(count, level[4]);
262*77c1e3ccSAndroid Build Coastguard Worker 
263*77c1e3ccSAndroid Build Coastguard Worker   count = vrshrq_n_u8(count, 1);
264*77c1e3ccSAndroid Build Coastguard Worker   count = vminq_u8(count, const_4);
265*77c1e3ccSAndroid Build Coastguard Worker   return count;
266*77c1e3ccSAndroid Build Coastguard Worker }
267*77c1e3ccSAndroid Build Coastguard Worker 
get_4_nz_map_contexts_2d(const uint8_t * levels,const int width,const ptrdiff_t * const offsets,uint8_t * const coeff_contexts)268*77c1e3ccSAndroid Build Coastguard Worker static inline void get_4_nz_map_contexts_2d(const uint8_t *levels,
269*77c1e3ccSAndroid Build Coastguard Worker                                             const int width,
270*77c1e3ccSAndroid Build Coastguard Worker                                             const ptrdiff_t *const offsets,
271*77c1e3ccSAndroid Build Coastguard Worker                                             uint8_t *const coeff_contexts) {
272*77c1e3ccSAndroid Build Coastguard Worker   const int stride = 4 + TX_PAD_HOR;
273*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);
274*77c1e3ccSAndroid Build Coastguard Worker 
275*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t pos_to_offset =
276*77c1e3ccSAndroid Build Coastguard Worker       (width == 4) ? vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]);
277*77c1e3ccSAndroid Build Coastguard Worker 
278*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t count;
279*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t level[5];
280*77c1e3ccSAndroid Build Coastguard Worker   uint8_t *cc = coeff_contexts;
281*77c1e3ccSAndroid Build Coastguard Worker 
282*77c1e3ccSAndroid Build Coastguard Worker   assert(!(width % 4));
283*77c1e3ccSAndroid Build Coastguard Worker 
284*77c1e3ccSAndroid Build Coastguard Worker   int col = width;
285*77c1e3ccSAndroid Build Coastguard Worker   do {
286*77c1e3ccSAndroid Build Coastguard Worker     load_levels_4x4x5(levels, stride, offsets, level);
287*77c1e3ccSAndroid Build Coastguard Worker     count = get_coeff_contexts_kernel(level);
288*77c1e3ccSAndroid Build Coastguard Worker     count = vaddq_u8(count, pos_to_offset);
289*77c1e3ccSAndroid Build Coastguard Worker     vst1q_u8(cc, count);
290*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset = pos_to_offset_large;
291*77c1e3ccSAndroid Build Coastguard Worker     levels += 4 * stride;
292*77c1e3ccSAndroid Build Coastguard Worker     cc += 16;
293*77c1e3ccSAndroid Build Coastguard Worker     col -= 4;
294*77c1e3ccSAndroid Build Coastguard Worker   } while (col);
295*77c1e3ccSAndroid Build Coastguard Worker 
296*77c1e3ccSAndroid Build Coastguard Worker   coeff_contexts[0] = 0;
297*77c1e3ccSAndroid Build Coastguard Worker }
298*77c1e3ccSAndroid Build Coastguard Worker 
get_4_nz_map_contexts_ver(const uint8_t * levels,const int width,const ptrdiff_t * const offsets,uint8_t * coeff_contexts)299*77c1e3ccSAndroid Build Coastguard Worker static inline void get_4_nz_map_contexts_ver(const uint8_t *levels,
300*77c1e3ccSAndroid Build Coastguard Worker                                              const int width,
301*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t *const offsets,
302*77c1e3ccSAndroid Build Coastguard Worker                                              uint8_t *coeff_contexts) {
303*77c1e3ccSAndroid Build Coastguard Worker   const int stride = 4 + TX_PAD_HOR;
304*77c1e3ccSAndroid Build Coastguard Worker 
305*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t pos_to_offset =
306*77c1e3ccSAndroid Build Coastguard Worker       vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));
307*77c1e3ccSAndroid Build Coastguard Worker 
308*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t count;
309*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t level[5];
310*77c1e3ccSAndroid Build Coastguard Worker 
311*77c1e3ccSAndroid Build Coastguard Worker   assert(!(width % 4));
312*77c1e3ccSAndroid Build Coastguard Worker 
313*77c1e3ccSAndroid Build Coastguard Worker   int col = width;
314*77c1e3ccSAndroid Build Coastguard Worker   do {
315*77c1e3ccSAndroid Build Coastguard Worker     load_levels_4x4x5(levels, stride, offsets, level);
316*77c1e3ccSAndroid Build Coastguard Worker     count = get_coeff_contexts_kernel(level);
317*77c1e3ccSAndroid Build Coastguard Worker     count = vaddq_u8(count, pos_to_offset);
318*77c1e3ccSAndroid Build Coastguard Worker     vst1q_u8(coeff_contexts, count);
319*77c1e3ccSAndroid Build Coastguard Worker     levels += 4 * stride;
320*77c1e3ccSAndroid Build Coastguard Worker     coeff_contexts += 16;
321*77c1e3ccSAndroid Build Coastguard Worker     col -= 4;
322*77c1e3ccSAndroid Build Coastguard Worker   } while (col);
323*77c1e3ccSAndroid Build Coastguard Worker }
324*77c1e3ccSAndroid Build Coastguard Worker 
get_4_nz_map_contexts_hor(const uint8_t * levels,const int width,const ptrdiff_t * const offsets,uint8_t * coeff_contexts)325*77c1e3ccSAndroid Build Coastguard Worker static inline void get_4_nz_map_contexts_hor(const uint8_t *levels,
326*77c1e3ccSAndroid Build Coastguard Worker                                              const int width,
327*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t *const offsets,
328*77c1e3ccSAndroid Build Coastguard Worker                                              uint8_t *coeff_contexts) {
329*77c1e3ccSAndroid Build Coastguard Worker   const int stride = 4 + TX_PAD_HOR;
330*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
331*77c1e3ccSAndroid Build Coastguard Worker 
332*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t pos_to_offset = vld1q_u8(c_4_po_hor);
333*77c1e3ccSAndroid Build Coastguard Worker 
334*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t count;
335*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t level[5];
336*77c1e3ccSAndroid Build Coastguard Worker 
337*77c1e3ccSAndroid Build Coastguard Worker   assert(!(width % 4));
338*77c1e3ccSAndroid Build Coastguard Worker 
339*77c1e3ccSAndroid Build Coastguard Worker   int col = width;
340*77c1e3ccSAndroid Build Coastguard Worker   do {
341*77c1e3ccSAndroid Build Coastguard Worker     load_levels_4x4x5(levels, stride, offsets, level);
342*77c1e3ccSAndroid Build Coastguard Worker     count = get_coeff_contexts_kernel(level);
343*77c1e3ccSAndroid Build Coastguard Worker     count = vaddq_u8(count, pos_to_offset);
344*77c1e3ccSAndroid Build Coastguard Worker     vst1q_u8(coeff_contexts, count);
345*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset = pos_to_offset_large;
346*77c1e3ccSAndroid Build Coastguard Worker     levels += 4 * stride;
347*77c1e3ccSAndroid Build Coastguard Worker     coeff_contexts += 16;
348*77c1e3ccSAndroid Build Coastguard Worker     col -= 4;
349*77c1e3ccSAndroid Build Coastguard Worker   } while (col);
350*77c1e3ccSAndroid Build Coastguard Worker }
351*77c1e3ccSAndroid Build Coastguard Worker 
get_8_coeff_contexts_2d(const uint8_t * levels,const int width,const ptrdiff_t * const offsets,uint8_t * coeff_contexts)352*77c1e3ccSAndroid Build Coastguard Worker static inline void get_8_coeff_contexts_2d(const uint8_t *levels,
353*77c1e3ccSAndroid Build Coastguard Worker                                            const int width,
354*77c1e3ccSAndroid Build Coastguard Worker                                            const ptrdiff_t *const offsets,
355*77c1e3ccSAndroid Build Coastguard Worker                                            uint8_t *coeff_contexts) {
356*77c1e3ccSAndroid Build Coastguard Worker   const int stride = 8 + TX_PAD_HOR;
357*77c1e3ccSAndroid Build Coastguard Worker   uint8_t *cc = coeff_contexts;
358*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t count;
359*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t level[5];
360*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t pos_to_offset[3];
361*77c1e3ccSAndroid Build Coastguard Worker 
362*77c1e3ccSAndroid Build Coastguard Worker   assert(!(width % 2));
363*77c1e3ccSAndroid Build Coastguard Worker 
364*77c1e3ccSAndroid Build Coastguard Worker   if (width == 8) {
365*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]);
366*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]);
367*77c1e3ccSAndroid Build Coastguard Worker   } else if (width < 8) {
368*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]);
369*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]);
370*77c1e3ccSAndroid Build Coastguard Worker   } else {
371*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]);
372*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]);
373*77c1e3ccSAndroid Build Coastguard Worker   }
374*77c1e3ccSAndroid Build Coastguard Worker   pos_to_offset[2] = vdupq_n_u8(21);
375*77c1e3ccSAndroid Build Coastguard Worker 
376*77c1e3ccSAndroid Build Coastguard Worker   int col = width;
377*77c1e3ccSAndroid Build Coastguard Worker   do {
378*77c1e3ccSAndroid Build Coastguard Worker     load_levels_8x2x5(levels, stride, offsets, level);
379*77c1e3ccSAndroid Build Coastguard Worker     count = get_coeff_contexts_kernel(level);
380*77c1e3ccSAndroid Build Coastguard Worker     count = vaddq_u8(count, pos_to_offset[0]);
381*77c1e3ccSAndroid Build Coastguard Worker     vst1q_u8(cc, count);
382*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[0] = pos_to_offset[1];
383*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[1] = pos_to_offset[2];
384*77c1e3ccSAndroid Build Coastguard Worker     levels += 2 * stride;
385*77c1e3ccSAndroid Build Coastguard Worker     cc += 16;
386*77c1e3ccSAndroid Build Coastguard Worker     col -= 2;
387*77c1e3ccSAndroid Build Coastguard Worker   } while (col);
388*77c1e3ccSAndroid Build Coastguard Worker 
389*77c1e3ccSAndroid Build Coastguard Worker   coeff_contexts[0] = 0;
390*77c1e3ccSAndroid Build Coastguard Worker }
391*77c1e3ccSAndroid Build Coastguard Worker 
get_8_coeff_contexts_ver(const uint8_t * levels,const int width,const ptrdiff_t * const offsets,uint8_t * coeff_contexts)392*77c1e3ccSAndroid Build Coastguard Worker static inline void get_8_coeff_contexts_ver(const uint8_t *levels,
393*77c1e3ccSAndroid Build Coastguard Worker                                             const int width,
394*77c1e3ccSAndroid Build Coastguard Worker                                             const ptrdiff_t *const offsets,
395*77c1e3ccSAndroid Build Coastguard Worker                                             uint8_t *coeff_contexts) {
396*77c1e3ccSAndroid Build Coastguard Worker   const int stride = 8 + TX_PAD_HOR;
397*77c1e3ccSAndroid Build Coastguard Worker 
398*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_ver);
399*77c1e3ccSAndroid Build Coastguard Worker 
400*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t count;
401*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t level[5];
402*77c1e3ccSAndroid Build Coastguard Worker 
403*77c1e3ccSAndroid Build Coastguard Worker   assert(!(width % 2));
404*77c1e3ccSAndroid Build Coastguard Worker 
405*77c1e3ccSAndroid Build Coastguard Worker   int col = width;
406*77c1e3ccSAndroid Build Coastguard Worker   do {
407*77c1e3ccSAndroid Build Coastguard Worker     load_levels_8x2x5(levels, stride, offsets, level);
408*77c1e3ccSAndroid Build Coastguard Worker     count = get_coeff_contexts_kernel(level);
409*77c1e3ccSAndroid Build Coastguard Worker     count = vaddq_u8(count, pos_to_offset);
410*77c1e3ccSAndroid Build Coastguard Worker     vst1q_u8(coeff_contexts, count);
411*77c1e3ccSAndroid Build Coastguard Worker     levels += 2 * stride;
412*77c1e3ccSAndroid Build Coastguard Worker     coeff_contexts += 16;
413*77c1e3ccSAndroid Build Coastguard Worker     col -= 2;
414*77c1e3ccSAndroid Build Coastguard Worker   } while (col);
415*77c1e3ccSAndroid Build Coastguard Worker }
416*77c1e3ccSAndroid Build Coastguard Worker 
get_8_coeff_contexts_hor(const uint8_t * levels,const int width,const ptrdiff_t * const offsets,uint8_t * coeff_contexts)417*77c1e3ccSAndroid Build Coastguard Worker static inline void get_8_coeff_contexts_hor(const uint8_t *levels,
418*77c1e3ccSAndroid Build Coastguard Worker                                             const int width,
419*77c1e3ccSAndroid Build Coastguard Worker                                             const ptrdiff_t *const offsets,
420*77c1e3ccSAndroid Build Coastguard Worker                                             uint8_t *coeff_contexts) {
421*77c1e3ccSAndroid Build Coastguard Worker   const int stride = 8 + TX_PAD_HOR;
422*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
423*77c1e3ccSAndroid Build Coastguard Worker 
424*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0),
425*77c1e3ccSAndroid Build Coastguard Worker                                          vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5));
426*77c1e3ccSAndroid Build Coastguard Worker 
427*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t count;
428*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t level[5];
429*77c1e3ccSAndroid Build Coastguard Worker 
430*77c1e3ccSAndroid Build Coastguard Worker   assert(!(width % 2));
431*77c1e3ccSAndroid Build Coastguard Worker 
432*77c1e3ccSAndroid Build Coastguard Worker   int col = width;
433*77c1e3ccSAndroid Build Coastguard Worker   do {
434*77c1e3ccSAndroid Build Coastguard Worker     load_levels_8x2x5(levels, stride, offsets, level);
435*77c1e3ccSAndroid Build Coastguard Worker     count = get_coeff_contexts_kernel(level);
436*77c1e3ccSAndroid Build Coastguard Worker     count = vaddq_u8(count, pos_to_offset);
437*77c1e3ccSAndroid Build Coastguard Worker     vst1q_u8(coeff_contexts, count);
438*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset = pos_to_offset_large;
439*77c1e3ccSAndroid Build Coastguard Worker     levels += 2 * stride;
440*77c1e3ccSAndroid Build Coastguard Worker     coeff_contexts += 16;
441*77c1e3ccSAndroid Build Coastguard Worker     col -= 2;
442*77c1e3ccSAndroid Build Coastguard Worker   } while (col);
443*77c1e3ccSAndroid Build Coastguard Worker }
444*77c1e3ccSAndroid Build Coastguard Worker 
get_16n_coeff_contexts_2d(const uint8_t * levels,const int real_width,const int real_height,const int width,const int height,const ptrdiff_t * const offsets,uint8_t * coeff_contexts)445*77c1e3ccSAndroid Build Coastguard Worker static inline void get_16n_coeff_contexts_2d(const uint8_t *levels,
446*77c1e3ccSAndroid Build Coastguard Worker                                              const int real_width,
447*77c1e3ccSAndroid Build Coastguard Worker                                              const int real_height,
448*77c1e3ccSAndroid Build Coastguard Worker                                              const int width, const int height,
449*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t *const offsets,
450*77c1e3ccSAndroid Build Coastguard Worker                                              uint8_t *coeff_contexts) {
451*77c1e3ccSAndroid Build Coastguard Worker   const int stride = height + TX_PAD_HOR;
452*77c1e3ccSAndroid Build Coastguard Worker   uint8_t *cc = coeff_contexts;
453*77c1e3ccSAndroid Build Coastguard Worker   int col = width;
454*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t pos_to_offset[5];
455*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t pos_to_offset_large[3];
456*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t count;
457*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t level[5];
458*77c1e3ccSAndroid Build Coastguard Worker 
459*77c1e3ccSAndroid Build Coastguard Worker   assert(!(height % 16));
460*77c1e3ccSAndroid Build Coastguard Worker 
461*77c1e3ccSAndroid Build Coastguard Worker   pos_to_offset_large[2] = vdupq_n_u8(21);
462*77c1e3ccSAndroid Build Coastguard Worker   if (real_width == real_height) {
463*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
464*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
465*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
466*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
467*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
468*77c1e3ccSAndroid Build Coastguard Worker         pos_to_offset_large[2];
469*77c1e3ccSAndroid Build Coastguard Worker   } else if (real_width < real_height) {
470*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
471*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
472*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
473*77c1e3ccSAndroid Build Coastguard Worker         vld1q_u8(c_16_po_2d_g[2]);
474*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
475*77c1e3ccSAndroid Build Coastguard Worker   } else {  // real_width > real_height
476*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
477*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
478*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
479*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[4] = pos_to_offset_large[2];
480*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(16);
481*77c1e3ccSAndroid Build Coastguard Worker   }
482*77c1e3ccSAndroid Build Coastguard Worker 
483*77c1e3ccSAndroid Build Coastguard Worker   do {
484*77c1e3ccSAndroid Build Coastguard Worker     int h = height;
485*77c1e3ccSAndroid Build Coastguard Worker 
486*77c1e3ccSAndroid Build Coastguard Worker     do {
487*77c1e3ccSAndroid Build Coastguard Worker       load_levels_16x1x5(levels, stride, offsets, level);
488*77c1e3ccSAndroid Build Coastguard Worker       count = get_coeff_contexts_kernel(level);
489*77c1e3ccSAndroid Build Coastguard Worker       count = vaddq_u8(count, pos_to_offset[0]);
490*77c1e3ccSAndroid Build Coastguard Worker       vst1q_u8(cc, count);
491*77c1e3ccSAndroid Build Coastguard Worker       levels += 16;
492*77c1e3ccSAndroid Build Coastguard Worker       cc += 16;
493*77c1e3ccSAndroid Build Coastguard Worker       h -= 16;
494*77c1e3ccSAndroid Build Coastguard Worker       pos_to_offset[0] = pos_to_offset_large[0];
495*77c1e3ccSAndroid Build Coastguard Worker     } while (h);
496*77c1e3ccSAndroid Build Coastguard Worker 
497*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[0] = pos_to_offset[1];
498*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[1] = pos_to_offset[2];
499*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[2] = pos_to_offset[3];
500*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[3] = pos_to_offset[4];
501*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset_large[0] = pos_to_offset_large[1];
502*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset_large[1] = pos_to_offset_large[2];
503*77c1e3ccSAndroid Build Coastguard Worker     levels += TX_PAD_HOR;
504*77c1e3ccSAndroid Build Coastguard Worker   } while (--col);
505*77c1e3ccSAndroid Build Coastguard Worker 
506*77c1e3ccSAndroid Build Coastguard Worker   coeff_contexts[0] = 0;
507*77c1e3ccSAndroid Build Coastguard Worker }
508*77c1e3ccSAndroid Build Coastguard Worker 
get_16n_coeff_contexts_ver(const uint8_t * levels,const int width,const int height,const ptrdiff_t * const offsets,uint8_t * coeff_contexts)509*77c1e3ccSAndroid Build Coastguard Worker static inline void get_16n_coeff_contexts_ver(const uint8_t *levels,
510*77c1e3ccSAndroid Build Coastguard Worker                                               const int width, const int height,
511*77c1e3ccSAndroid Build Coastguard Worker                                               const ptrdiff_t *const offsets,
512*77c1e3ccSAndroid Build Coastguard Worker                                               uint8_t *coeff_contexts) {
513*77c1e3ccSAndroid Build Coastguard Worker   const int stride = height + TX_PAD_HOR;
514*77c1e3ccSAndroid Build Coastguard Worker 
515*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
516*77c1e3ccSAndroid Build Coastguard Worker 
517*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t count;
518*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t level[5];
519*77c1e3ccSAndroid Build Coastguard Worker 
520*77c1e3ccSAndroid Build Coastguard Worker   assert(!(height % 16));
521*77c1e3ccSAndroid Build Coastguard Worker 
522*77c1e3ccSAndroid Build Coastguard Worker   int col = width;
523*77c1e3ccSAndroid Build Coastguard Worker   do {
524*77c1e3ccSAndroid Build Coastguard Worker     uint8x16_t pos_to_offset = vld1q_u8(c_16_po_ver);
525*77c1e3ccSAndroid Build Coastguard Worker 
526*77c1e3ccSAndroid Build Coastguard Worker     int h = height;
527*77c1e3ccSAndroid Build Coastguard Worker     do {
528*77c1e3ccSAndroid Build Coastguard Worker       load_levels_16x1x5(levels, stride, offsets, level);
529*77c1e3ccSAndroid Build Coastguard Worker       count = get_coeff_contexts_kernel(level);
530*77c1e3ccSAndroid Build Coastguard Worker       count = vaddq_u8(count, pos_to_offset);
531*77c1e3ccSAndroid Build Coastguard Worker       vst1q_u8(coeff_contexts, count);
532*77c1e3ccSAndroid Build Coastguard Worker       pos_to_offset = pos_to_offset_large;
533*77c1e3ccSAndroid Build Coastguard Worker       levels += 16;
534*77c1e3ccSAndroid Build Coastguard Worker       coeff_contexts += 16;
535*77c1e3ccSAndroid Build Coastguard Worker       h -= 16;
536*77c1e3ccSAndroid Build Coastguard Worker     } while (h);
537*77c1e3ccSAndroid Build Coastguard Worker 
538*77c1e3ccSAndroid Build Coastguard Worker     levels += TX_PAD_HOR;
539*77c1e3ccSAndroid Build Coastguard Worker   } while (--col);
540*77c1e3ccSAndroid Build Coastguard Worker }
541*77c1e3ccSAndroid Build Coastguard Worker 
get_16n_coeff_contexts_hor(const uint8_t * levels,const int width,const int height,const ptrdiff_t * const offsets,uint8_t * coeff_contexts)542*77c1e3ccSAndroid Build Coastguard Worker static inline void get_16n_coeff_contexts_hor(const uint8_t *levels,
543*77c1e3ccSAndroid Build Coastguard Worker                                               const int width, const int height,
544*77c1e3ccSAndroid Build Coastguard Worker                                               const ptrdiff_t *const offsets,
545*77c1e3ccSAndroid Build Coastguard Worker                                               uint8_t *coeff_contexts) {
546*77c1e3ccSAndroid Build Coastguard Worker   const int stride = height + TX_PAD_HOR;
547*77c1e3ccSAndroid Build Coastguard Worker 
548*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t pos_to_offset[3];
549*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t count;
550*77c1e3ccSAndroid Build Coastguard Worker   uint8x16_t level[5];
551*77c1e3ccSAndroid Build Coastguard Worker 
552*77c1e3ccSAndroid Build Coastguard Worker   assert(!(height % 16));
553*77c1e3ccSAndroid Build Coastguard Worker 
554*77c1e3ccSAndroid Build Coastguard Worker   pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
555*77c1e3ccSAndroid Build Coastguard Worker   pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
556*77c1e3ccSAndroid Build Coastguard Worker   pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
557*77c1e3ccSAndroid Build Coastguard Worker 
558*77c1e3ccSAndroid Build Coastguard Worker   int col = width;
559*77c1e3ccSAndroid Build Coastguard Worker   do {
560*77c1e3ccSAndroid Build Coastguard Worker     int h = height;
561*77c1e3ccSAndroid Build Coastguard Worker     do {
562*77c1e3ccSAndroid Build Coastguard Worker       load_levels_16x1x5(levels, stride, offsets, level);
563*77c1e3ccSAndroid Build Coastguard Worker       count = get_coeff_contexts_kernel(level);
564*77c1e3ccSAndroid Build Coastguard Worker       count = vaddq_u8(count, pos_to_offset[0]);
565*77c1e3ccSAndroid Build Coastguard Worker       vst1q_u8(coeff_contexts, count);
566*77c1e3ccSAndroid Build Coastguard Worker       levels += 16;
567*77c1e3ccSAndroid Build Coastguard Worker       coeff_contexts += 16;
568*77c1e3ccSAndroid Build Coastguard Worker       h -= 16;
569*77c1e3ccSAndroid Build Coastguard Worker     } while (h);
570*77c1e3ccSAndroid Build Coastguard Worker 
571*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[0] = pos_to_offset[1];
572*77c1e3ccSAndroid Build Coastguard Worker     pos_to_offset[1] = pos_to_offset[2];
573*77c1e3ccSAndroid Build Coastguard Worker     levels += TX_PAD_HOR;
574*77c1e3ccSAndroid Build Coastguard Worker   } while (--col);
575*77c1e3ccSAndroid Build Coastguard Worker }
576*77c1e3ccSAndroid Build Coastguard Worker 
577*77c1e3ccSAndroid Build Coastguard Worker // Note: levels[] must be in the range [0, 127], inclusive.
av1_get_nz_map_contexts_neon(const uint8_t * const levels,const int16_t * const scan,const uint16_t eob,const TX_SIZE tx_size,const TX_CLASS tx_class,int8_t * const coeff_contexts)578*77c1e3ccSAndroid Build Coastguard Worker void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
579*77c1e3ccSAndroid Build Coastguard Worker                                   const int16_t *const scan, const uint16_t eob,
580*77c1e3ccSAndroid Build Coastguard Worker                                   const TX_SIZE tx_size,
581*77c1e3ccSAndroid Build Coastguard Worker                                   const TX_CLASS tx_class,
582*77c1e3ccSAndroid Build Coastguard Worker                                   int8_t *const coeff_contexts) {
583*77c1e3ccSAndroid Build Coastguard Worker   const int last_idx = eob - 1;
584*77c1e3ccSAndroid Build Coastguard Worker   if (!last_idx) {
585*77c1e3ccSAndroid Build Coastguard Worker     coeff_contexts[0] = 0;
586*77c1e3ccSAndroid Build Coastguard Worker     return;
587*77c1e3ccSAndroid Build Coastguard Worker   }
588*77c1e3ccSAndroid Build Coastguard Worker 
589*77c1e3ccSAndroid Build Coastguard Worker   uint8_t *const coefficients = (uint8_t *const)coeff_contexts;
590*77c1e3ccSAndroid Build Coastguard Worker 
591*77c1e3ccSAndroid Build Coastguard Worker   const int real_width = tx_size_wide[tx_size];
592*77c1e3ccSAndroid Build Coastguard Worker   const int real_height = tx_size_high[tx_size];
593*77c1e3ccSAndroid Build Coastguard Worker   const int width = get_txb_wide(tx_size);
594*77c1e3ccSAndroid Build Coastguard Worker   const int height = get_txb_high(tx_size);
595*77c1e3ccSAndroid Build Coastguard Worker   const int stride = height + TX_PAD_HOR;
596*77c1e3ccSAndroid Build Coastguard Worker   ptrdiff_t offsets[3];
597*77c1e3ccSAndroid Build Coastguard Worker 
598*77c1e3ccSAndroid Build Coastguard Worker   /* coeff_contexts must be 16 byte aligned. */
599*77c1e3ccSAndroid Build Coastguard Worker   assert(!((intptr_t)coeff_contexts & 0xf));
600*77c1e3ccSAndroid Build Coastguard Worker 
601*77c1e3ccSAndroid Build Coastguard Worker   if (tx_class == TX_CLASS_2D) {
602*77c1e3ccSAndroid Build Coastguard Worker     offsets[0] = 0 * stride + 2;
603*77c1e3ccSAndroid Build Coastguard Worker     offsets[1] = 1 * stride + 1;
604*77c1e3ccSAndroid Build Coastguard Worker     offsets[2] = 2 * stride + 0;
605*77c1e3ccSAndroid Build Coastguard Worker 
606*77c1e3ccSAndroid Build Coastguard Worker     if (height == 4) {
607*77c1e3ccSAndroid Build Coastguard Worker       get_4_nz_map_contexts_2d(levels, width, offsets, coefficients);
608*77c1e3ccSAndroid Build Coastguard Worker     } else if (height == 8) {
609*77c1e3ccSAndroid Build Coastguard Worker       get_8_coeff_contexts_2d(levels, width, offsets, coefficients);
610*77c1e3ccSAndroid Build Coastguard Worker     } else {
611*77c1e3ccSAndroid Build Coastguard Worker       get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
612*77c1e3ccSAndroid Build Coastguard Worker                                 offsets, coefficients);
613*77c1e3ccSAndroid Build Coastguard Worker     }
614*77c1e3ccSAndroid Build Coastguard Worker   } else if (tx_class == TX_CLASS_HORIZ) {
615*77c1e3ccSAndroid Build Coastguard Worker     offsets[0] = 2 * stride;
616*77c1e3ccSAndroid Build Coastguard Worker     offsets[1] = 3 * stride;
617*77c1e3ccSAndroid Build Coastguard Worker     offsets[2] = 4 * stride;
618*77c1e3ccSAndroid Build Coastguard Worker     if (height == 4) {
619*77c1e3ccSAndroid Build Coastguard Worker       get_4_nz_map_contexts_hor(levels, width, offsets, coefficients);
620*77c1e3ccSAndroid Build Coastguard Worker     } else if (height == 8) {
621*77c1e3ccSAndroid Build Coastguard Worker       get_8_coeff_contexts_hor(levels, width, offsets, coefficients);
622*77c1e3ccSAndroid Build Coastguard Worker     } else {
623*77c1e3ccSAndroid Build Coastguard Worker       get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
624*77c1e3ccSAndroid Build Coastguard Worker     }
625*77c1e3ccSAndroid Build Coastguard Worker   } else {  // TX_CLASS_VERT
626*77c1e3ccSAndroid Build Coastguard Worker     offsets[0] = 2;
627*77c1e3ccSAndroid Build Coastguard Worker     offsets[1] = 3;
628*77c1e3ccSAndroid Build Coastguard Worker     offsets[2] = 4;
629*77c1e3ccSAndroid Build Coastguard Worker     if (height == 4) {
630*77c1e3ccSAndroid Build Coastguard Worker       get_4_nz_map_contexts_ver(levels, width, offsets, coefficients);
631*77c1e3ccSAndroid Build Coastguard Worker     } else if (height == 8) {
632*77c1e3ccSAndroid Build Coastguard Worker       get_8_coeff_contexts_ver(levels, width, offsets, coefficients);
633*77c1e3ccSAndroid Build Coastguard Worker     } else {
634*77c1e3ccSAndroid Build Coastguard Worker       get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
635*77c1e3ccSAndroid Build Coastguard Worker     }
636*77c1e3ccSAndroid Build Coastguard Worker   }
637*77c1e3ccSAndroid Build Coastguard Worker 
638*77c1e3ccSAndroid Build Coastguard Worker   const int bhl = get_txb_bhl(tx_size);
639*77c1e3ccSAndroid Build Coastguard Worker   const int pos = scan[last_idx];
640*77c1e3ccSAndroid Build Coastguard Worker   if (last_idx <= (width << bhl) / 8)
641*77c1e3ccSAndroid Build Coastguard Worker     coeff_contexts[pos] = 1;
642*77c1e3ccSAndroid Build Coastguard Worker   else if (last_idx <= (width << bhl) / 4)
643*77c1e3ccSAndroid Build Coastguard Worker     coeff_contexts[pos] = 2;
644*77c1e3ccSAndroid Build Coastguard Worker   else
645*77c1e3ccSAndroid Build Coastguard Worker     coeff_contexts[pos] = 3;
646*77c1e3ccSAndroid Build Coastguard Worker }
647