xref: /aosp_15_r20/external/libaom/aom_dsp/x86/subtract_avx2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker  * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker  *
4*77c1e3ccSAndroid Build Coastguard Worker  * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker  * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker  * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker  */
11*77c1e3ccSAndroid Build Coastguard Worker #include <immintrin.h>
12*77c1e3ccSAndroid Build Coastguard Worker 
13*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
14*77c1e3ccSAndroid Build Coastguard Worker 
// Writes diff_ptr[i] = src_ptr[i] - pred_ptr[i] for 32 consecutive pixels.
//
// Source and prediction bytes are interleaved and then multiplied by the
// signed byte pair (+1, -1), so every 16-bit lane of the multiply-add result
// holds src - pred. The results are written with aligned 256-bit stores, so
// diff_ptr has to be 32-byte aligned.
static inline void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
                                   const uint8_t *pred_ptr) {
  // Bytes of each 16-bit lane: 0x01 (low, +1) and 0xff (high, -1).
  const __m256i weights = _mm256_set1_epi32((int)0xff01ff01);

  const __m256i src = _mm256_lddqu_si256((const __m256i *)src_ptr);
  const __m256i pred = _mm256_lddqu_si256((const __m256i *)pred_ptr);

  // The unpack instructions work on each 128-bit half independently:
  // `lo` ends up with pixels 0-7 and 16-23, `hi` with pixels 8-15 and 24-31.
  const __m256i lo =
      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(src, pred), weights);
  const __m256i hi =
      _mm256_maddubs_epi16(_mm256_unpackhi_epi8(src, pred), weights);

  // Recombine the 128-bit halves so the output is in pixel order again.
  const __m256i first16 = _mm256_permute2x128_si256(lo, hi, 0x20);
  const __m256i last16 = _mm256_permute2x128_si256(lo, hi, 0x31);
  _mm256_store_si256((__m256i *)diff_ptr, first16);
  _mm256_store_si256((__m256i *)(diff_ptr + 16), last16);
}
29*77c1e3ccSAndroid Build Coastguard Worker 
// Computes diff = src - pred for a block that is 16 pixels wide and `rows`
// pixels tall. Each stride is the distance, in elements, between the starts
// of consecutive rows of the corresponding buffer.
//
// Every output row is written with one aligned 256-bit store, so the start
// of each diff row has to be 32-byte aligned.
static inline void subtract_block_16xn_avx2(
    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
  for (int row = 0; row < rows; ++row) {
    const uint8_t *const src_row = src_ptr + row * src_stride;
    const uint8_t *const pred_row = pred_ptr + row * pred_stride;
    int16_t *const diff_row = diff_ptr + row * diff_stride;

    // Zero-extend the 16 bytes of each input row to 16-bit lanes so the
    // subtraction can go negative without wrapping.
    const __m256i src16 =
        _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)src_row));
    const __m256i pred16 =
        _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)pred_row));

    _mm256_store_si256((__m256i *)diff_row, _mm256_sub_epi16(src16, pred16));
  }
}
45*77c1e3ccSAndroid Build Coastguard Worker 
// Computes diff = src - pred for a block that is 32 pixels wide and `rows`
// pixels tall, one subtract32_avx2() call per row. Each stride is the
// distance, in elements, between the starts of consecutive rows.
static inline void subtract_block_32xn_avx2(
    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
  for (int row = 0; row < rows; ++row) {
    subtract32_avx2(diff_ptr + row * diff_stride, src_ptr + row * src_stride,
                    pred_ptr + row * pred_stride);
  }
}
56*77c1e3ccSAndroid Build Coastguard Worker 
// Computes diff = src - pred for a block that is 64 pixels wide and `rows`
// pixels tall. Each row is handled as two 32-pixel segments, left to right.
// Each stride is the distance, in elements, between the starts of
// consecutive rows.
static inline void subtract_block_64xn_avx2(
    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
  for (int row = 0; row < rows; ++row) {
    int16_t *const diff_row = diff_ptr + row * diff_stride;
    const uint8_t *const src_row = src_ptr + row * src_stride;
    const uint8_t *const pred_row = pred_ptr + row * pred_stride;

    for (int col = 0; col < 64; col += 32) {
      subtract32_avx2(diff_row + col, src_row + col, pred_row + col);
    }
  }
}
68*77c1e3ccSAndroid Build Coastguard Worker 
// Computes diff = src - pred for a block that is 128 pixels wide and `rows`
// pixels tall. Each row is handled as four 32-pixel segments, left to right.
// Each stride is the distance, in elements, between the starts of
// consecutive rows.
static inline void subtract_block_128xn_avx2(
    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
  for (int row = 0; row < rows; ++row) {
    int16_t *const diff_row = diff_ptr + row * diff_stride;
    const uint8_t *const src_row = src_ptr + row * src_stride;
    const uint8_t *const pred_row = pred_ptr + row * pred_stride;

    for (int col = 0; col < 128; col += 32) {
      subtract32_avx2(diff_row + col, src_row + col, pred_row + col);
    }
  }
}
82*77c1e3ccSAndroid Build Coastguard Worker 
// Computes the residual diff = src - pred for a rows x cols block.
//
// Widths of 16, 32, 64 and 128 pixels are handled by the AVX2 kernels in
// this file; every other width is forwarded unchanged to
// aom_subtract_block_sse2(). Each stride is the distance, in elements,
// between the starts of consecutive rows of the corresponding buffer.
void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
                             ptrdiff_t pred_stride) {
  if (cols == 16) {
    subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
                             pred_ptr, pred_stride);
  } else if (cols == 32) {
    subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
                             pred_ptr, pred_stride);
  } else if (cols == 64) {
    subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
                             pred_ptr, pred_stride);
  } else if (cols == 128) {
    subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
                              src_stride, pred_ptr, pred_stride);
  } else {
    // No AVX2 kernel for this width: defer to the SSE2 implementation.
    aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
                            src_stride, pred_ptr, pred_stride);
  }
}
110