xref: /aosp_15_r20/external/libaom/aom_dsp/x86/blk_sse_sum_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <emmintrin.h>
13 
14 #include "config/aom_dsp_rtcd.h"
15 
// Computes the sum and the sum of squares of a width-4 block of int16
// pixels. bh must be even: two rows (8 pixels) are consumed per iteration.
// Writes the pixel sum to *x_sum (overwrite) and accumulates the sum of
// squares into *x2_sum (add).
static inline void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh,
                                    int *x_sum, int64_t *x2_sum) {
  const int16_t *row = data;
  const __m128i ones = _mm_set1_epi16(1);
  const __m128i zero = _mm_setzero_si128();
  __m128i acc_sum = _mm_setzero_si128();
  __m128i acc_sse = _mm_setzero_si128();

  for (int i = 0; i < (bh >> 1); ++i) {
    // Gather two 4-pixel rows into one 8x16-bit register.
    const __m128i row0 = _mm_loadl_epi64((const __m128i *)row);
    const __m128i row1 = _mm_loadl_epi64((const __m128i *)(row + stride));
    const __m128i pixels = _mm_unpacklo_epi64(row0, row1);
    // madd with ones -> pairwise sums; madd with itself -> pairwise squares.
    acc_sum = _mm_add_epi32(acc_sum, _mm_madd_epi16(pixels, ones));
    acc_sse = _mm_add_epi32(acc_sse, _mm_madd_epi16(pixels, pixels));
    row += 2 * stride;
  }

  // Horizontal reduction of the four 32-bit sum lanes to a scalar.
  acc_sum = _mm_add_epi32(acc_sum, _mm_srli_si128(acc_sum, 8));
  acc_sum = _mm_add_epi32(acc_sum, _mm_srli_si128(acc_sum, 4));
  *x_sum = _mm_cvtsi128_si32(acc_sum);

  // Widen the four 32-bit square-sum lanes to 64 bits before reducing so
  // the final accumulation is done in 64-bit arithmetic.
  const __m128i sse_lo = _mm_unpacklo_epi32(acc_sse, zero);
  const __m128i sse_hi = _mm_unpackhi_epi32(acc_sse, zero);
  __m128i sse64 = _mm_add_epi64(sse_lo, sse_hi);
  sse64 = _mm_add_epi64(sse64, _mm_srli_si128(sse64, 8));
#if AOM_ARCH_X86_64
  *x2_sum += _mm_cvtsi128_si64(sse64);
#else
  {
    // 32-bit targets have no 64-bit extract; bounce through memory.
    int64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, sse64);
    *x2_sum += tmp;
  }
#endif
}
54 
// Computes the sum and the sum of squares of a block whose width is a
// multiple of 8. loop_cycles is the number of 8-pixel columns (width >> 3);
// one 8-pixel row segment is consumed per inner iteration. Both *x_sum and
// *x2_sum are accumulated (added to), not overwritten.
static inline void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh,
                                    int *x_sum, int64_t *x2_sum,
                                    int loop_cycles) {
  const __m128i ones = _mm_set1_epi16(1);
  const __m128i zero = _mm_setzero_si128();
  __m128i acc_sum = _mm_setzero_si128();
  __m128i acc_sse = _mm_setzero_si128();

  for (int col = 0; col < loop_cycles; ++col) {
    const int16_t *row = data + 8 * col;
    for (int r = 0; r < bh; ++r) {
      // One 8-pixel row segment per iteration.
      const __m128i pixels = _mm_loadu_si128((const __m128i *)row);
      // madd with ones -> pairwise sums; madd with itself -> pairwise squares.
      acc_sum = _mm_add_epi32(acc_sum, _mm_madd_epi16(pixels, ones));
      acc_sse = _mm_add_epi32(acc_sse, _mm_madd_epi16(pixels, pixels));
      row += stride;
    }
  }

  // Horizontal reduction of the four 32-bit sum lanes to a scalar.
  acc_sum = _mm_add_epi32(acc_sum, _mm_srli_si128(acc_sum, 8));
  acc_sum = _mm_add_epi32(acc_sum, _mm_srli_si128(acc_sum, 4));
  *x_sum += _mm_cvtsi128_si32(acc_sum);

  // Widen the four 32-bit square-sum lanes to 64 bits before reducing so
  // the final accumulation is done in 64-bit arithmetic.
  const __m128i sse_lo = _mm_unpacklo_epi32(acc_sse, zero);
  const __m128i sse_hi = _mm_unpackhi_epi32(acc_sse, zero);
  __m128i sse64 = _mm_add_epi64(sse_lo, sse_hi);
  sse64 = _mm_add_epi64(sse64, _mm_srli_si128(sse64, 8));
#if AOM_ARCH_X86_64
  *x2_sum += _mm_cvtsi128_si64(sse64);
#else
  {
    // 32-bit targets have no 64-bit extract; bounce through memory.
    int64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, sse64);
    *x2_sum += tmp;
  }
#endif
}
95 
// This function adds SSE2 support for the function 'aom_get_blk_sse_sum_c'.
// Computes the sum (*x_sum) and sum of squares (*x2_sum) of a bw x bh block
// of int16 pixels, dispatching to the SSE2 kernels where the block shape
// allows it and falling back to the C reference otherwise.
void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh,
                              int *x_sum, int64_t *x2_sum) {
  *x_sum = 0;
  *x2_sum = 0;

  // The SIMD kernels require the height to be a multiple of 4.
  if (bh & 3) {
    aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
    return;
  }

  if (bw == 4) {
    sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum);
  } else if (bw == 8 || bw == 16) {
    sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
  } else if (bw == 32) {
    // For widths 32 and 64 the 32-bit accumulator registers may overflow,
    // so tall blocks are processed in horizontal slices.
    if (bh <= 32) {
      sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
    } else {
      sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3);
      sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum,
                       bw >> 3);
    }
  } else if (bw == 64) {
    if (bh <= 16) {
      sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
    } else {
      for (int i = 0; i < bh; i += 16) {
        sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum,
                         bw >> 3);
      }
    }
  } else {
    aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
  }
}
139