1 /*
2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <emmintrin.h>
13
14 #include "config/aom_dsp_rtcd.h"
15
// Computes the pixel sum and sum of squared pixels for a 4-wide block of
// int16 samples. Writes the sum to *x_sum and accumulates the squared sum
// into *x2_sum. Requires bh to be a multiple of 2 (callers ensure bh % 4 == 0).
static inline void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh,
                                    int *x_sum, int64_t *x2_sum) {
  const int16_t *row = data;
  const __m128i ones = _mm_set1_epi16(1);
  const __m128i zero = _mm_setzero_si128();
  __m128i acc_sum = _mm_setzero_si128();
  __m128i acc_sse = _mm_setzero_si128();

  for (int i = 0; i < bh; i += 2) {
    // Pack two 4-pixel rows into one 128-bit register (8 x int16).
    const __m128i r0 = _mm_loadl_epi64((__m128i const *)row);
    const __m128i r1 = _mm_loadl_epi64((__m128i const *)(row + stride));
    const __m128i pix = _mm_unpacklo_epi64(r0, r1);
    // madd against 1s gives pairwise sums; madd against itself gives
    // pairwise sums of squares. Both land as four 32-bit lanes.
    acc_sum = _mm_add_epi32(acc_sum, _mm_madd_epi16(pix, ones));
    acc_sse = _mm_add_epi32(acc_sse, _mm_madd_epi16(pix, pix));
    row += 2 * stride;
  }

  // Horizontal reduction of the four 32-bit partial sums.
  acc_sum = _mm_add_epi32(acc_sum, _mm_srli_si128(acc_sum, 8));
  acc_sum = _mm_add_epi32(acc_sum, _mm_srli_si128(acc_sum, 4));
  *x_sum = _mm_cvtsi128_si32(acc_sum);

  // Widen the (non-negative) 32-bit squared sums to 64 bits, then reduce.
  const __m128i sse_lo = _mm_unpacklo_epi32(acc_sse, zero);
  const __m128i sse_hi = _mm_unpackhi_epi32(acc_sse, zero);
  __m128i acc64 = _mm_add_epi64(sse_lo, sse_hi);
  acc64 = _mm_add_epi64(acc64, _mm_srli_si128(acc64, 8));
#if AOM_ARCH_X86_64
  *x2_sum += _mm_cvtsi128_si64(acc64);
#else
  {
    int64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, acc64);
    *x2_sum += tmp;
  }
#endif
}
54
// Computes the pixel sum and sum of squared pixels over an 8*loop_cycles-wide
// block of int16 samples, processing one 8-pixel column strip per outer
// iteration. Accumulates into both *x_sum and *x2_sum.
static inline void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh,
                                    int *x_sum, int64_t *x2_sum,
                                    int loop_cycles) {
  const __m128i ones = _mm_set1_epi16(1);
  const __m128i zero = _mm_setzero_si128();
  __m128i acc_sum = _mm_setzero_si128();
  __m128i acc_sse = _mm_setzero_si128();

  for (int col = 0; col < loop_cycles; ++col) {
    const int16_t *row = data + 8 * col;
    for (int r = 0; r < bh; ++r) {
      // One 8-pixel row per iteration.
      const __m128i pix = _mm_loadu_si128((__m128i const *)row);
      // Pairwise sums and pairwise sums of squares as four 32-bit lanes.
      acc_sum = _mm_add_epi32(acc_sum, _mm_madd_epi16(pix, ones));
      acc_sse = _mm_add_epi32(acc_sse, _mm_madd_epi16(pix, pix));
      row += stride;
    }
  }

  // Horizontal reduction of the four 32-bit partial sums.
  acc_sum = _mm_add_epi32(acc_sum, _mm_srli_si128(acc_sum, 8));
  acc_sum = _mm_add_epi32(acc_sum, _mm_srli_si128(acc_sum, 4));
  *x_sum += _mm_cvtsi128_si32(acc_sum);

  // Widen the (non-negative) 32-bit squared sums to 64 bits, then reduce.
  const __m128i sse_lo = _mm_unpacklo_epi32(acc_sse, zero);
  const __m128i sse_hi = _mm_unpackhi_epi32(acc_sse, zero);
  __m128i acc64 = _mm_add_epi64(sse_lo, sse_hi);
  acc64 = _mm_add_epi64(acc64, _mm_srli_si128(acc64, 8));
#if AOM_ARCH_X86_64
  *x2_sum += _mm_cvtsi128_si64(acc64);
#else
  {
    int64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, acc64);
    *x2_sum += tmp;
  }
#endif
}
95
// This function adds SSE2 support for the C function 'get_blk_sse_sum_c'.
// SSE2 dispatch for computing the pixel sum (*x_sum) and sum of squared
// pixels (*x2_sum) of a bw x bh block of int16 samples. Falls back to the C
// implementation for heights that are not a multiple of 4 and for
// unrecognized widths.
void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh,
                              int *x_sum, int64_t *x2_sum) {
  *x_sum = 0;
  *x2_sum = 0;

  // The vector kernels process rows in groups; odd heights go to C.
  if ((bh & 3) != 0) {
    aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
    return;
  }

  switch (bw) {
    case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break;
    case 8:
    case 16: sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); break;
    case 32:
      // For widths 32 and 64 the 32-bit accumulators may overflow on tall
      // blocks, so split the block into partial heights.
      if (bh <= 32) {
        sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
      } else {
        sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3);
        sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum,
                         bw >> 3);
      }
      break;
    case 64:
      if (bh <= 16) {
        sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
      } else {
        // Accumulate in 16-row strips to keep the accumulators in range.
        for (int i = 0; i < bh; i += 16) {
          sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum,
                           bw >> 3);
        }
      }
      break;
    default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); break;
  }
}
139