/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <smmintrin.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

// Smooth the sz intra-prediction edge samples p[0..sz-1] in place.
// strength selects the low-pass kernel (all normalized by /16):
//   1: 3-tap (4,8,4)   2: 3-tap (5,6,5)   3: 5-tap (2,4,4,4,2)
// p[0] is preserved (filtering starts at p[1]).
// NOTE(review): this reads p[-1] and writes 16 bytes of scratch padding
// starting at p[sz] — the caller's edge buffer must provide that room;
// confirm at call sites.
void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
  if (!strength) return;

  // Filter taps replicated 4x so one _mm_maddubs_epi16 covers 4 outputs.
  // The 5-tap kernel is split into two 4-byte groups (2,4,4,4 | 2,0,0,0)
  // per output; the extra hadd in the 5-tap path recombines them.
  DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
    { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
    { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
    { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
  };

  // Byte-shuffle tables:
  //   [0]/[1]: overlapping 4-byte input windows producing outputs 0-3 / 4-7
  //   [2]:     two overlapping 8-byte windows (base pattern for the 5-tap path)
  //   [3]:     identity 0..15, compared against n_out to mask partial rows
  // (Declared [5][16] with only four initializers; the fifth row is unused.)
  DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  };

  // Extend the first and last samples to simplify the loop for the 5-tap case
  p[-1] = p[0];
  __m128i last = _mm_set1_epi8((char)p[sz - 1]);
  _mm_storeu_si128((__m128i *)&p[sz], last);

  // Adjust input pointer for filter support area
  uint8_t *in = (strength == 3) ? p - 1 : p;

  // Avoid modifying first sample
  uint8_t *out = p + 1;
  int len = sz - 1;

  const int use_3tap_filter = (strength < 3);

  if (use_3tap_filter) {
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
    __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
    while (len > 0) {
      int n_out = (len < 8) ? len : 8;  // up to 8 output pixels per iteration
      // Gather the 3-sample windows for outputs 0-3 (d0) and 4-7 (d1),
      // multiply-accumulate against the taps, and reduce to 8 sums.
      __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
      __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
      d0 = _mm_maddubs_epi16(d0, coef0);
      d1 = _mm_maddubs_epi16(d1, coef0);
      d0 = _mm_hadd_epi16(d0, d1);
      // Round to nearest: (sum + 8) >> 4, then saturate back to bytes.
      __m128i eight = _mm_set1_epi16(8);
      d0 = _mm_add_epi16(d0, eight);
      d0 = _mm_srai_epi16(d0, 4);
      d0 = _mm_packus_epi16(d0, d0);
      // Blend only the n_out valid lanes over the existing pixels so a
      // partial final row leaves trailing bytes untouched; store low 8 bytes.
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      __m128i n0 = _mm_set1_epi8(n_out);
      __m128i mask = _mm_cmpgt_epi8(n0, iden);
      out0 = _mm_blendv_epi8(out0, d0, mask);
      _mm_storel_epi64((__m128i *)out, out0);
      // Slide the 16-byte input window forward by 8 samples.
      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
      in0 = _mm_alignr_epi8(in1, in0, 8);
      in += 8;
      out += 8;
      len -= n_out;
    }
  } else {  // 5-tap filter
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    // Four copies of the window pattern, each shifted 2 input bytes, give
    // the four 4-byte groups needed to cover 5 taps for 8 outputs.
    __m128i two = _mm_set1_epi8(2);
    __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
    __m128i shuf_b = _mm_add_epi8(shuf_a, two);
    __m128i shuf_c = _mm_add_epi8(shuf_b, two);
    __m128i shuf_d = _mm_add_epi8(shuf_c, two);
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
    while (len > 0) {
      int n_out = (len < 8) ? len : 8;
      __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
      __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
      __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
      __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
      d0 = _mm_maddubs_epi16(d0, coef0);
      d1 = _mm_maddubs_epi16(d1, coef0);
      d2 = _mm_maddubs_epi16(d2, coef0);
      d3 = _mm_maddubs_epi16(d3, coef0);
      // Two hadd stages fold the split 5-tap partial sums into 8 results.
      d0 = _mm_hadd_epi16(d0, d1);
      d2 = _mm_hadd_epi16(d2, d3);
      d0 = _mm_hadd_epi16(d0, d2);
      // Round to nearest: (sum + 8) >> 4, then saturate back to bytes.
      __m128i eight = _mm_set1_epi16(8);
      d0 = _mm_add_epi16(d0, eight);
      d0 = _mm_srai_epi16(d0, 4);
      d0 = _mm_packus_epi16(d0, d0);
      // Masked partial store, same as the 3-tap path.
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      __m128i n0 = _mm_set1_epi8(n_out);
      __m128i mask = _mm_cmpgt_epi8(n0, iden);
      out0 = _mm_blendv_epi8(out0, d0, mask);
      _mm_storel_epi64((__m128i *)out, out0);
      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
      in0 = _mm_alignr_epi8(in1, in0, 8);
      in += 8;
      out += 8;
      len -= n_out;
    }
  }
}
// Upsample the sz edge samples to half-sample resolution in place using
// the 4-tap (-1, 9, 9, -1) / 16 interpolation filter, writing the
// interleaved (original, interpolated) stream starting at p[-2].
// NOTE(review): reads p[-2] and stores 32 output bytes per iteration —
// relies on the edge buffer providing that padding; confirm at call sites.
void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
  // interpolate half-sample positions
  assert(sz <= 24);

  // (-1,9,9,-1) taps replicated so one maddubs handles 4 outputs.
  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
  };

  // Overlapping 4-byte input windows for outputs 0-3 and 4-7.
  DECLARE_ALIGNED(
      16, static const int8_t,
      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };

  // Extend first/last samples (upper-left p[-1], last p[sz-1])
  // to support 4-tap filter
  p[-2] = p[-1];
  p[sz] = p[sz - 1];

  uint8_t *in = &p[-2];
  uint8_t *out = &p[-2];

  int n = sz + 1;  // Input length including upper-left sample

  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);

  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);

  while (n > 0) {
    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
    // Interpolate 16 half-sample values: d0/d1 from the low 8 inputs,
    // d2/d3 from the next 8.
    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
    d0 = _mm_maddubs_epi16(d0, coef0);
    d1 = _mm_maddubs_epi16(d1, coef0);
    d2 = _mm_maddubs_epi16(d2, coef0);
    d3 = _mm_maddubs_epi16(d3, coef0);
    d0 = _mm_hadd_epi16(d0, d1);
    d2 = _mm_hadd_epi16(d2, d3);
    // Round: (sum + 8) >> 4; packus saturates to [0, 255].
    __m128i eight = _mm_set1_epi16(8);
    d0 = _mm_add_epi16(d0, eight);
    d2 = _mm_add_epi16(d2, eight);
    d0 = _mm_srai_epi16(d0, 4);
    d2 = _mm_srai_epi16(d2, 4);
    d0 = _mm_packus_epi16(d0, d2);
    // Interleave the original samples (shifted by 1 so each precedes its
    // interpolated half-sample) with the filtered values.
    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
    _mm_storeu_si128((__m128i *)&out[0], out0);
    _mm_storeu_si128((__m128i *)&out[16], out1);
    in0 = in16;
    in16 = _mm_setzero_si128();
    out += 32;
    n -= 16;
  }
}

#if CONFIG_AV1_HIGHBITDEPTH

// High bit-depth variant of av1_filter_intra_edge_sse4_1: smooth the sz
// 16-bit edge samples p[0..sz-1] in place.  strength selects the kernel
// (all normalized by /16):
//   1: 3-tap (4,8,4)   2: 3-tap (5,6,5)   3: 5-tap (2,4,4,4,2)
// p[0] is preserved.  NOTE(review): reads p[-1] and writes 8 samples of
// scratch padding at p[sz] — confirm the buffer provides that room.
void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
  if (!strength) return;

  // Interleaved (outer, center) multiplier pairs.  The filter is computed
  // as outer*(sum of outer taps) + center*(sum of center taps), so only
  // two distinct coefficients are needed per kernel.
  DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
    { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
    { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
    { 2, 4, 2, 4, 2, 4, 2, 4 }  // strength 3: 2,4,4,4,2
  };

  // Identity lane indices 0..7, compared against n_out to build the
  // partial-row blend mask.
  DECLARE_ALIGNED(16, static const int16_t,
                  v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };

  // Extend the first and last samples to simplify the loop for the 5-tap case
  p[-1] = p[0];
  __m128i last = _mm_set1_epi16(p[sz - 1]);
  _mm_storeu_si128((__m128i *)&p[sz], last);

  // Adjust input pointer for filter support area
  uint16_t *in = (strength == 3) ? p - 1 : p;

  // Avoid modifying first sample
  uint16_t *out = p + 1;
  int len = sz - 1;

  const int use_3tap_filter = (strength < 3);

  if (use_3tap_filter) {
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    while (len > 0) {
      int n_out = (len < 8) ? len : 8;  // up to 8 outputs per iteration
      // in1/in2 are the input window advanced by 1 and 2 samples, so
      // in02 holds the outer-tap sums and in1 the center samples.
      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
      __m128i in02 = _mm_add_epi16(in0, in2);
      // Interleave (outer sum, center) pairs, scale by (outer, center)
      // coefficients, and fold pairs back into 8 filtered sums.
      __m128i d0 = _mm_unpacklo_epi16(in02, in1);
      __m128i d1 = _mm_unpackhi_epi16(in02, in1);
      d0 = _mm_mullo_epi16(d0, coef0);
      d1 = _mm_mullo_epi16(d1, coef0);
      d0 = _mm_hadd_epi16(d0, d1);
      // Round to nearest: (sum + 8) >> 4 (logical shift; sums are non-negative).
      __m128i eight = _mm_set1_epi16(8);
      d0 = _mm_add_epi16(d0, eight);
      d0 = _mm_srli_epi16(d0, 4);
      // Blend only the n_out valid lanes over the existing samples.
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      __m128i n0 = _mm_set1_epi16(n_out);
      __m128i mask = _mm_cmpgt_epi16(n0, iden);
      out0 = _mm_blendv_epi8(out0, d0, mask);
      _mm_storeu_si128((__m128i *)out, out0);
      // Advance the 16-sample window by 8.
      in += 8;
      in0 = in8;
      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
      out += 8;
      len -= n_out;
    }
  } else {  // 5-tap filter
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    while (len > 0) {
      int n_out = (len < 8) ? len : 8;
      // Windows advanced by 1..4 samples: in04 sums the two outer taps
      // (weight 2), in123 sums the three middle taps (weight 4).
      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
      __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
      __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
      __m128i in04 = _mm_add_epi16(in0, in4);
      __m128i in123 = _mm_add_epi16(in1, in2);
      in123 = _mm_add_epi16(in123, in3);
      __m128i d0 = _mm_unpacklo_epi16(in04, in123);
      __m128i d1 = _mm_unpackhi_epi16(in04, in123);
      d0 = _mm_mullo_epi16(d0, coef0);
      d1 = _mm_mullo_epi16(d1, coef0);
      d0 = _mm_hadd_epi16(d0, d1);
      // Round to nearest: (sum + 8) >> 4.
      __m128i eight = _mm_set1_epi16(8);
      d0 = _mm_add_epi16(d0, eight);
      d0 = _mm_srli_epi16(d0, 4);
      // Masked partial store, same as the 3-tap path.
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      __m128i n0 = _mm_set1_epi16(n_out);
      __m128i mask = _mm_cmpgt_epi16(n0, iden);
      out0 = _mm_blendv_epi8(out0, d0, mask);
      _mm_storeu_si128((__m128i *)out, out0);
      in += 8;
      in0 = in8;
      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
      out += 8;
      len -= n_out;
    }
  }
}
// High bit-depth variant of av1_upsample_intra_edge_sse4_1: upsample the
// sz 16-bit edge samples to half-sample resolution in place with the
// 4-tap (-1, 9, 9, -1) / 16 filter, clamping results to [0, (1 << bd) - 1].
// Output (original, interpolated) pairs are written starting at p[-2].
// NOTE(review): reads p[-2] and stores 16 samples per iteration — relies
// on the edge buffer's padding; confirm at call sites.
void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
  // interpolate half-sample positions
  assert(sz <= 24);

  // Interleaved (-1, 9) pairs: _mm_madd_epi16 on interleaved
  // (p[-1]+p[2], p[0]+p[1]) pairs yields the full 4-tap sum per output.
  DECLARE_ALIGNED(16, static const int16_t,
                  kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };

  // Extend first/last samples (upper-left p[-1], last p[sz-1])
  // to support 4-tap filter
  p[-2] = p[-1];
  p[sz] = p[sz - 1];

  uint16_t *in = &p[-2];
  uint16_t *out = in;
  int n = sz + 1;  // Input length including upper-left sample

  // Preload the whole (sz <= 24, so at most 32-sample) input window.
  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
  __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);

  while (n > 0) {
    // Windows advanced by 1..3 samples; sum the outer (in0+in3) and
    // inner (in1+in2) tap pairs.
    __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
    __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
    __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
    __m128i sum0 = _mm_add_epi16(in0, in3);
    __m128i sum1 = _mm_add_epi16(in1, in2);
    __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
    __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
    __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
    // 32-bit accumulation avoids overflow for large bit depths.
    d0 = _mm_madd_epi16(d0, coef0);
    d1 = _mm_madd_epi16(d1, coef0);
    // Round: (sum + 8) >> 4; packus clamps the low end at 0, min_epi16
    // clamps the high end to the bit-depth maximum.
    __m128i eight = _mm_set1_epi32(8);
    d0 = _mm_add_epi32(d0, eight);
    d1 = _mm_add_epi32(d1, eight);
    d0 = _mm_srai_epi32(d0, 4);
    d1 = _mm_srai_epi32(d1, 4);
    d0 = _mm_packus_epi32(d0, d1);
    __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
    d0 = _mm_min_epi16(d0, max0);
    // Interleave the original samples (shifted by 1) with the
    // interpolated half-samples and store 16 outputs.
    __m128i out0 = _mm_unpacklo_epi16(in1, d0);
    __m128i out1 = _mm_unpackhi_epi16(in1, d0);
    _mm_storeu_si128((__m128i *)&out[0], out0);
    _mm_storeu_si128((__m128i *)&out[8], out1);
    // Rotate the preloaded window forward by 8 samples.
    in0 = in8;
    in8 = in16;
    in16 = in24;
    in24 = _mm_setzero_si128();
    out += 16;
    n -= 8;
  }
}

#endif  // CONFIG_AV1_HIGHBITDEPTH