xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <emmintrin.h>
13 
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/vpx_dsp_common.h"
16 #include "vpx_mem/vpx_mem.h"
17 #include "vpx_ports/mem.h"
18 #include "vp9/common/vp9_scan.h"
19 #include "vp9/encoder/vp9_block.h"
20 
// High-bitdepth quantizer (SSE2) for a block of `count` coefficients.
//
// coeff_ptr    Input transform coefficients, processed four at a time as
//              32-bit lanes. Read with _mm_load_si128, so the buffer must be
//              16-byte aligned.
// count        Number of coefficients; the vector loops handle count / 4
//              groups of four.
// mb_plane     Supplies the zbin, round, quant and quant_shift tables. Entry
//              [0] is used for the coefficient at index 0, entry [1] for
//              every other coefficient.
// qcoeff_ptr   Output: quantized coefficients (zero-filled first).
// dqcoeff_ptr  Output: dequantized coefficients (zero-filled first).
// dequant_ptr  Dequantization factors, indexed like the mb_plane tables.
// eob_ptr      Output: end-of-block, i.e. one past the largest scan position
//              holding a non-zero quantized coefficient; 0 if there is none.
// scan_order   Only scan_order->iscan is read; it maps a coefficient index to
//              its scan position (taken to be 0-based, as in the C reference
//              quantizer).
void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                const struct macroblock_plane *mb_plane,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const struct ScanOrder *const scan_order) {
  // eob_i tracks the largest scan position with a non-zero quantized
  // coefficient. It starts at -1 ("none found") so that a lone non-zero
  // coefficient at scan position 0 still yields an end-of-block of 1.
  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
  __m128i zbins[2];
  __m128i nzbins[2];
  const int16_t *iscan = scan_order->iscan;
  const int16_t *zbin_ptr = mb_plane->zbin;
  const int16_t *round_ptr = mb_plane->round;
  const int16_t *quant_ptr = mb_plane->quant;
  const int16_t *quant_shift_ptr = mb_plane->quant_shift;

  // zbins[0] is used for the first group of four: lane 0 (coefficient 0) gets
  // zbin[0], the other lanes zbin[1]. zbins[1] serves all later groups.
  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
                           (int)zbin_ptr[0]);
  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);

  // Negated thresholds for the lower bound of the dead zone.
  nzbins[0] = _mm_setzero_si128();
  nzbins[1] = _mm_setzero_si128();
  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

  // Pre-scan pass: walk backwards from the end and drop every trailing group
  // whose four coefficients all lie strictly inside (-zbin, zbin). Stop at the
  // first group that has a coefficient outside that range.
  for (i = ((int)count / 4) - 1; i >= 0; i--) {
    __m128i coeffs, cmp1, cmp2;
    int test;
    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
    cmp1 = _mm_and_si128(cmp1, cmp2);
    test = _mm_movemask_epi8(cmp1);
    if (test == 0xffff)
      non_zero_regs--;
    else
      break;
  }

  // Quantization pass: only the groups that survived the pre-scan.
  for (i = 0; i < non_zero_regs; i++) {
    __m128i coeffs, coeffs_sign, tmp1, tmp2;
    int test;
    int abs_coeff[4];
    int coeff_sign[4];

    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
    // Sign mask is 0 or -1 per lane; (x ^ s) - s is the absolute value.
    coeffs_sign = _mm_srai_epi32(coeffs, 31);
    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
    // Lane mask for |coeff| >= zbin (SSE2 has no >= compare, so gt | eq).
    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
    tmp1 = _mm_or_si128(tmp1, tmp2);
    test = _mm_movemask_epi8(tmp1);
    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);

    for (j = 0; j < 4; j++) {
      // movemask yields four bits per 32-bit lane; bit 4*j represents lane j.
      if (test & (1 << (4 * j))) {
        int k = 4 * i + j;
        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
        const uint32_t abs_qcoeff =
            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
        // Re-apply the sign: (q ^ s) - s with s in {0, -1}.
        qcoeff_ptr[k] =
            (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
      }
    }
  }
  // Convert "last non-zero scan position" into a count (one past the last).
  *eob_ptr = eob_i + 1;
}
95 
// High-bitdepth quantizer (SSE2) for a 32x32 block (1024 coefficients).
//
// coeff_ptr    Input transform coefficients, scanned four at a time as 32-bit
//              lanes. Read with _mm_load_si128, so the buffer must be 16-byte
//              aligned.
// mb_plane     Supplies the zbin, round, quant and quant_shift tables. Entry
//              [0] is used for the coefficient at index 0, entry [1] for
//              every other coefficient. zbin and round are passed through
//              ROUND_POWER_OF_TWO(x, 1) before use.
// qcoeff_ptr   Output: quantized coefficients (zero-filled first).
// dqcoeff_ptr  Output: dequantized coefficients (zero-filled first); each is
//              qcoeff * dequant / 2.
// dequant_ptr  Dequantization factors, indexed like the mb_plane tables.
// eob_ptr      Output: end-of-block, i.e. one past the largest scan position
//              holding a non-zero quantized coefficient; 0 if there is none.
// scan_order   Only scan_order->iscan is read; it maps a coefficient index to
//              its scan position (taken to be 0-based, as in the C reference
//              quantizer).
void vpx_highbd_quantize_b_32x32_sse2(
    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
    uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
  __m128i zbins[2];
  __m128i nzbins[2];
  int idx = 0;
  // Indices of the coefficients selected by the pre-scan; at most one entry
  // per coefficient, hence 32 * 32 slots.
  int idx_arr[1024];
  // eob tracks the largest scan position with a non-zero quantized
  // coefficient. It starts at -1 ("none found") so that a lone non-zero
  // coefficient at scan position 0 still yields an end-of-block of 1.
  int i, eob = -1;
  const intptr_t n_coeffs = 32 * 32;
  const int16_t *iscan = scan_order->iscan;
  const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1);
  const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1);

  // zbins[0] is used for the first group of four: lane 0 (coefficient 0) gets
  // the zbin[0]-derived threshold, the other lanes the zbin[1]-derived one.
  // zbins[1] serves all later groups.
  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
  zbins[1] = _mm_set1_epi32(zbin1_tmp);

  // Negated thresholds for the lower bound of the dead zone.
  nzbins[0] = _mm_setzero_si128();
  nzbins[1] = _mm_setzero_si128();
  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  // Pre-scan pass: record the index of every coefficient that is NOT strictly
  // inside (-zbin, zbin). movemask yields four bits per 32-bit lane, so each
  // nibble of `test` corresponds to one coefficient of the group.
  for (i = 0; i < n_coeffs / 4; i++) {
    __m128i coeffs, cmp1, cmp2;
    int test;
    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
    cmp1 = _mm_and_si128(cmp1, cmp2);
    test = _mm_movemask_epi8(cmp1);
    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
  }

  // Quantization pass: only process the coefficients selected in
  // pre-scan pass. Note: idx can be zero.
  for (i = 0; i < idx; i++) {
    const int rc = idx_arr[i];
    const int coeff = coeff_ptr[rc];
    // Sign mask is 0 or -1; (x ^ s) - s is the absolute value.
    const int coeff_sign = (coeff >> 31);
    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    const int64_t tmp1 =
        abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1);
    const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1;
    const uint32_t abs_qcoeff =
        (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15);
    // Re-apply the sign: (q ^ s) - s with s in {0, -1}.
    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
    if (abs_qcoeff) eob = iscan[rc] > eob ? iscan[rc] : eob;
  }
  // Convert "last non-zero scan position" into a count (one past the last).
  *eob_ptr = eob + 1;
}
154