xref: /aosp_15_r20/external/libaom/av1/common/x86/convolve_2d_avx2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #include <immintrin.h>
13 
14 #include "config/av1_rtcd.h"
15 
16 #if CONFIG_SVT_AV1
17 #include "third_party/SVT-AV1/convolve_2d_avx2.h"
18 #endif
19 
20 #include "aom_dsp/x86/convolve_avx2.h"
21 #include "aom_dsp/aom_filter.h"
22 #include "aom_dsp/x86/synonyms.h"
23 
24 #include "av1/common/convolve.h"
25 
/*
 * General AVX2 2D convolution for 8-bit pixels (bd is fixed at 8 below).
 *
 * The block is processed in strips of 8 columns. For each strip a horizontal
 * filter macro runs first, followed by a vertical filter macro. The
 * CONVOLVE_SR_* macros take no arguments: they are defined outside this file
 * and expand in place, so they pick up the locals declared here (im_block,
 * im_stride, im_h, i, j, src_ptr, filt, coeffs_h, coeffs_v, the round_* and
 * sum_* constants, ...) by name. Do not rename or remove those locals without
 * checking the macro definitions.
 *
 * Parameters:
 *   src, src_stride   source pixels and row pitch.
 *   dst, dst_stride   destination pixels and row pitch (not referenced
 *                     directly here; presumably written by the vertical
 *                     macros -- confirm against their definitions).
 *   w, h              block width and height in pixels.
 *   filter_params_x/y horizontal / vertical interpolation filter descriptors.
 *   subpel_x_qn/y_qn  sub-pixel positions used to select the filter kernels.
 *   conv_params       supplies the round_0 and round_1 shift amounts.
 *
 * round_0 must be > 0 (asserted).
 */
static void convolve_2d_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    /*
     * 12-tap path. It is selected from the horizontal filter alone; both
     * passes are then run with a tap count of 12.
     */
    const int bd = 8;
    int im_stride = 8, i;
    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
    /* Shift left over once round_0 and round_1 have both been applied. */
    const int bits =
        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;

    assert(conv_params->round_0 > 0);

    /*
     * Horizontal pass constants (32-bit lanes): half of (1 << round_0) for
     * rounding, plus an offset of 1 << (bd + FILTER_BITS - 1); the shift
     * count is round_0.
     */
    const __m256i round_const_h12 = _mm256_set1_epi32(
        ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
    const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);

    /*
     * Vertical pass, first stage: offset of 1 << offset_bits plus half of
     * (1 << round_1); the shift count is round_1.
     */
    const __m256i sum_round_v = _mm256_set1_epi32(
        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);

    /*
     * Vertical pass, second stage: half of (1 << bits) for rounding, minus
     * (1 << (offset_bits - round_1)) and half of that same value; the shift
     * count is bits.
     */
    const __m256i round_const_v = _mm256_set1_epi32(
        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
        ((1 << (offset_bits - conv_params->round_1)) >> 1));
    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);

    __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };

    int horiz_tap = 12;
    int vert_tap = 12;

    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);

    int im_h = h + vert_tap - 1;
    const int fo_vert = vert_tap / 2 - 1;
    const int fo_horiz = horiz_tap / 2 - 1;
    /* Back up fo_vert rows and fo_horiz columns from src. */
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

    /* One 8-column strip per iteration: horizontal pass, then vertical. */
    for (int j = 0; j < w; j += 8) {
      CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
      CONVOLVE_SR_VERTICAL_FILTER_12TAP
    }
  } else {
    /* Path for horizontal filters with at most 8 taps. */
    const int bd = 8;
    int im_stride = 8, i;
    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
    /* Shift left over once round_0 and round_1 have both been applied. */
    const int bits =
        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;

    assert(conv_params->round_0 > 0);

    /*
     * Horizontal pass constants (16-bit lanes). Unlike the 12-tap path, both
     * the shift count and the offset exponent are reduced by one here.
     * NOTE(review): presumably because the *_lowbd coefficient helpers
     * pre-scale the taps -- confirm against their definitions.
     */
    const __m256i round_const_h =
        _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
                          (1 << (bd + FILTER_BITS - 2)));
    const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);

    /*
     * Vertical pass, first stage: offset of 1 << offset_bits plus half of
     * (1 << round_1); the shift count is round_1.
     */
    const __m256i sum_round_v = _mm256_set1_epi32(
        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);

    /*
     * Vertical pass, second stage: half of (1 << bits) for rounding, minus
     * (1 << (offset_bits - round_1)) and half of that same value; the shift
     * count is bits.
     */
    const __m256i round_const_v = _mm256_set1_epi32(
        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
        ((1 << (offset_bits - conv_params->round_1)) >> 1));
    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);

    __m256i filt[4], coeffs_h[4], coeffs_v[4];

    /*
     * NOTE(review): these two unconditional calls are repeated by the
     * else-branches further down. They look redundant, but they may be what
     * keeps every coeffs_h / coeffs_v entry initialized when a 6-tap helper
     * is chosen; verify what the 6-tap helpers write before removing them.
     */
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);

    int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
    int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

    /* 6-tap kernels use dedicated coefficient helpers. */
    if (horiz_tap == 6) {
      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    } else {
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    }

    if (vert_tap == 6) {
      prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
    } else {
      prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
    }

    int im_h = h + vert_tap - 1;
    const int fo_vert = vert_tap / 2 - 1;
    const int fo_horiz = horiz_tap / 2 - 1;
    /* Back up fo_vert rows and fo_horiz columns from src. */
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

    /* Load the four global filt*_global_avx2 tables (aligned loads). */
    filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
    filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);

    /*
     * One 8-column strip per iteration. The horizontal and vertical macros
     * are chosen independently from the respective tap counts; anything that
     * is neither 4 nor 6 taps uses the 8-tap macro.
     */
    for (int j = 0; j < w; j += 8) {
      if (horiz_tap == 4) {
        CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
      } else if (horiz_tap == 6) {
        CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
      } else {
        CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
      }

      if (vert_tap == 4) {
        CONVOLVE_SR_VERTICAL_FILTER_4TAP
      } else if (vert_tap == 6) {
        CONVOLVE_SR_VERTICAL_FILTER_6TAP
      } else {
        CONVOLVE_SR_VERTICAL_FILTER_8TAP
      }
    }
  }
}
142 
/*
 * AVX2 entry point for the 2D convolution.
 *
 * When CONFIG_SVT_AV1 is enabled, the call is forwarded to
 * av1_convolve_2d_sr_specialized_avx2() unless either filter reports 12 taps
 * via get_filter_tap(). In every other case (12 taps on either axis, or
 * CONFIG_SVT_AV1 disabled) convolve_2d_sr_general_avx2() is used. All
 * arguments are passed through unchanged.
 */
void av1_convolve_2d_sr_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
#if CONFIG_SVT_AV1
  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn);
  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn);

  /* Take the specialized implementation only when neither axis is 12-tap. */
  if (tap_x != 12 && tap_y != 12) {
    av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                        filter_params_x, filter_params_y,
                                        subpel_x_qn, subpel_y_qn, conv_params);
    return;
  }
#endif
  /* General path, shared by both build configurations. */
  convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params);
}
168