xref: /aosp_15_r20/external/libaom/aom_dsp/x86/mem_sse2.h (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
13*77c1e3ccSAndroid Build Coastguard Worker #define AOM_AOM_DSP_X86_MEM_SSE2_H_
14*77c1e3ccSAndroid Build Coastguard Worker 
15*77c1e3ccSAndroid Build Coastguard Worker #include <emmintrin.h>  // SSE2
16*77c1e3ccSAndroid Build Coastguard Worker #include <string.h>
17*77c1e3ccSAndroid Build Coastguard Worker 
18*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
19*77c1e3ccSAndroid Build Coastguard Worker 
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom/aom_integer.h"
21*77c1e3ccSAndroid Build Coastguard Worker 
// Reads a 16-bit signed integer from |src|. The bytes are copied out with
// memcpy(), so |src| is never dereferenced as an int16_t and needs no
// particular alignment.
static inline int16_t loadu_int16(const void *src) {
  int16_t value;
  memcpy(&value, src, sizeof value);
  return value;
}
27*77c1e3ccSAndroid Build Coastguard Worker 
// Reads a 32-bit signed integer from |src|. The bytes are copied out with
// memcpy(), so |src| is never dereferenced as an int32_t and needs no
// particular alignment.
static inline int32_t loadu_int32(const void *src) {
  int32_t value;
  memcpy(&value, src, sizeof value);
  return value;
}
33*77c1e3ccSAndroid Build Coastguard Worker 
// Reads a 64-bit signed integer from |src|. The bytes are copied out with
// memcpy(), so |src| is never dereferenced as an int64_t and needs no
// particular alignment.
static inline int64_t loadu_int64(const void *src) {
  int64_t value;
  memcpy(&value, src, sizeof value);
  return value;
}
39*77c1e3ccSAndroid Build Coastguard Worker 
// Stores the upper 64 bits of |s| to the 8 bytes starting at |d|. Although
// |d| is typed as __m128i *, only 8 bytes are written.
static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
  // _mm_storeh_pi() works on packed floats; the cast only reinterprets bits.
  const __m128 as_ps = _mm_castsi128_ps(s);
  __m64 *const dst = (__m64 *)d;
  _mm_storeh_pi(dst, as_ps);
}
43*77c1e3ccSAndroid Build Coastguard Worker 
// Returns |s| with its upper 64 bits replaced by the 8 bytes read from |src|.
// The lower 64 bits of |s| are passed through unchanged.
static inline __m128i loadh_epi64(const void *const src, const __m128i s) {
  const __m64 *const upper = (const __m64 *)src;
  // _mm_loadh_pi() works on packed floats; the casts only reinterpret bits.
  const __m128 merged = _mm_loadh_pi(_mm_castsi128_ps(s), upper);
  return _mm_castps_si128(merged);
}
48*77c1e3ccSAndroid Build Coastguard Worker 
// Packs a 4x4 block of bytes into one register: the 4 bytes of row r, read
// from |src| + r * |byte_stride|, become 32-bit lane r of the result.
// Rows are read with loadu_int32(), so |src| needs no particular alignment.
static inline __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
                                                  const int byte_stride) {
  // Address rows in bytes without discarding the const qualifier of |src|.
  const uint8_t *const base = (const uint8_t *)src;
  return _mm_setr_epi32(loadu_int32(base + 0 * byte_stride),
                        loadu_int32(base + 1 * byte_stride),
                        loadu_int32(base + 2 * byte_stride),
                        loadu_int32(base + 3 * byte_stride));
}
56*77c1e3ccSAndroid Build Coastguard Worker 
load_8bit_8x2_to_1_reg_sse2(const void * const src,const int byte_stride)57*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
58*77c1e3ccSAndroid Build Coastguard Worker                                                   const int byte_stride) {
59*77c1e3ccSAndroid Build Coastguard Worker   __m128i dst;
60*77c1e3ccSAndroid Build Coastguard Worker   dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
61*77c1e3ccSAndroid Build Coastguard Worker   dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
62*77c1e3ccSAndroid Build Coastguard Worker   return dst;
63*77c1e3ccSAndroid Build Coastguard Worker }
64*77c1e3ccSAndroid Build Coastguard Worker 
// Writes four 8-byte rows to |d|, |stride| bytes apart, from two registers:
// rows 0 and 1 are the lower and upper halves of s[0], rows 2 and 3 are the
// lower and upper halves of s[1].
static inline void store_8bit_8x4_from_16x2(const __m128i *const s,
                                            uint8_t *const d,
                                            const ptrdiff_t stride) {
  for (int pair = 0; pair < 2; ++pair) {
    uint8_t *const even_row = d + 2 * pair * stride;
    uint8_t *const odd_row = even_row + stride;
    _mm_storel_epi64((__m128i *)even_row, s[pair]);  // lower 64 bits
    _mm_storeh_epi64((__m128i *)odd_row, s[pair]);   // upper 64 bits
  }
}
73*77c1e3ccSAndroid Build Coastguard Worker 
// Writes four 4-byte rows to |d|, |stride| bytes apart. Row r receives the
// low 32 bits of s[r]; the remaining bits of each register are ignored.
//
// The rows are written with memcpy() rather than through an int pointer:
// |d| is a byte buffer with no alignment guarantee, so a store through
// (int *) would be a misaligned, type-punned access.
static inline void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  for (int r = 0; r < 4; ++r) {
    const int32_t row = _mm_cvtsi128_si32(s[r]);
    memcpy(d + r * stride, &row, sizeof(row));
  }
}
81*77c1e3ccSAndroid Build Coastguard Worker 
// Writes the four 32-bit lanes of |s| as four 4-byte rows at |d|, |stride|
// bytes apart (lane r goes to row r).
static inline void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
                                       const ptrdiff_t stride) {
  // Byte-shift |s| right so that each 32-bit lane in turn lands in lane 0,
  // which is the lane store_8bit_4x4() writes out.
  const __m128i rows[4] = { s, _mm_srli_si128(s, 4), _mm_srli_si128(s, 8),
                            _mm_srli_si128(s, 12) };
  store_8bit_4x4(rows, d, stride);
}
92*77c1e3ccSAndroid Build Coastguard Worker 
// Reads four 4-byte rows from |s|, |stride| bytes apart. d[r] receives row r
// in its low 32 bits; the upper 96 bits of each d[r] are zero.
//
// The rows are read with memcpy() (the same technique as loadu_int32())
// rather than through an int pointer: |s| is a byte buffer with no alignment
// guarantee, so a load through (const int *) would be a misaligned,
// type-punned access.
static inline void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  for (int r = 0; r < 4; ++r) {
    int32_t row;
    memcpy(&row, s + r * stride, sizeof(row));
    d[r] = _mm_cvtsi32_si128(row);
  }
}
100*77c1e3ccSAndroid Build Coastguard Worker 
// Reads eight 4-byte rows from |s|, |stride| bytes apart, into d[0..7] by
// running load_8bit_4x4() on rows 0-3 and then on rows 4-7.
static inline void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  for (int first_row = 0; first_row < 8; first_row += 4) {
    load_8bit_4x4(s + first_row * stride, stride, d + first_row);
  }
}
106*77c1e3ccSAndroid Build Coastguard Worker 
// Reads four 8-byte rows from |s|, |stride| bytes apart. d[r] receives row r
// in its lower 64 bits; _mm_loadl_epi64() zeroes the upper 64 bits.
static inline void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  for (int r = 0; r < 4; ++r) {
    const uint8_t *const row = s + r * stride;
    d[r] = _mm_loadl_epi64((const __m128i *)row);
  }
}
114*77c1e3ccSAndroid Build Coastguard Worker 
// Reads four 16-byte rows from |s|, |stride| bytes apart, into d[0..3].
// Uses the unaligned load, so the row addresses need no particular alignment.
static inline void loadu_8bit_16x4(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  for (int r = 0; r < 4; ++r) {
    const uint8_t *const row = s + r * stride;
    d[r] = _mm_loadu_si128((const __m128i *)row);
  }
}
122*77c1e3ccSAndroid Build Coastguard Worker 
// Reads eight 8-byte rows from |s|, |stride| bytes apart, into d[0..7] by
// running load_8bit_8x4() on rows 0-3 and then on rows 4-7.
static inline void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  for (int first_row = 0; first_row < 8; first_row += 4) {
    load_8bit_8x4(s + first_row * stride, stride, d + first_row);
  }
}
128*77c1e3ccSAndroid Build Coastguard Worker 
// Reads eight 16-byte rows from |s|, |stride| bytes apart, into d[0..7].
// Uses the aligned load (_mm_load_si128), so every row address
// |s| + r * |stride| must be 16-byte aligned; loadu_8bit_16x8() is the
// unaligned counterpart.
static inline void load_8bit_16x8(const uint8_t *const s,
                                  const ptrdiff_t stride, __m128i *const d) {
  for (int r = 0; r < 8; ++r) {
    const uint8_t *const row = s + r * stride;
    d[r] = _mm_load_si128((const __m128i *)row);
  }
}
140*77c1e3ccSAndroid Build Coastguard Worker 
// Reads eight 16-byte rows from |s|, |stride| bytes apart, into d[0..7] by
// running loadu_8bit_16x4() on rows 0-3 and then on rows 4-7.
static inline void loadu_8bit_16x8(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  for (int first_row = 0; first_row < 8; first_row += 4) {
    loadu_8bit_16x4(s + first_row * stride, stride, d + first_row);
  }
}
146*77c1e3ccSAndroid Build Coastguard Worker 
// Writes eight 8-byte rows to |d|, |stride| bytes apart. Row r receives the
// lower 64 bits of s[r]; the upper 64 bits of each register are not stored.
static inline void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  for (int r = 0; r < 8; ++r) {
    uint8_t *const row = d + r * stride;
    _mm_storel_epi64((__m128i *)row, s[r]);
  }
}
158*77c1e3ccSAndroid Build Coastguard Worker 
// Writes four 16-byte rows to |d|, |stride| bytes apart; row r is s[r].
// Uses the unaligned store, so the row addresses need no particular alignment.
static inline void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
                                    const ptrdiff_t stride) {
  for (int r = 0; r < 4; ++r) {
    uint8_t *const row = d + r * stride;
    _mm_storeu_si128((__m128i *)row, s[r]);
  }
}
166*77c1e3ccSAndroid Build Coastguard Worker 
167*77c1e3ccSAndroid Build Coastguard Worker #endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_
168