xref: /aosp_15_r20/external/libaom/av1/common/x86/intra_edge_sse4.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <smmintrin.h>
14 
15 #include "config/aom_config.h"
16 #include "config/av1_rtcd.h"
17 
av1_filter_intra_edge_sse4_1(uint8_t * p,int sz,int strength)18 void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
19   if (!strength) return;
20 
21   DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
22     { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
23     { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
24     { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
25   };
26 
27   DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
28     { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
29     { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
30     { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
31     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
32   };
33 
34   // Extend the first and last samples to simplify the loop for the 5-tap case
35   p[-1] = p[0];
36   __m128i last = _mm_set1_epi8((char)p[sz - 1]);
37   _mm_storeu_si128((__m128i *)&p[sz], last);
38 
39   // Adjust input pointer for filter support area
40   uint8_t *in = (strength == 3) ? p - 1 : p;
41 
42   // Avoid modifying first sample
43   uint8_t *out = p + 1;
44   int len = sz - 1;
45 
46   const int use_3tap_filter = (strength < 3);
47 
48   if (use_3tap_filter) {
49     __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
50     __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
51     __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
52     __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
53     __m128i in0 = _mm_lddqu_si128((__m128i *)in);
54     while (len > 0) {
55       int n_out = (len < 8) ? len : 8;
56       __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
57       __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
58       d0 = _mm_maddubs_epi16(d0, coef0);
59       d1 = _mm_maddubs_epi16(d1, coef0);
60       d0 = _mm_hadd_epi16(d0, d1);
61       __m128i eight = _mm_set1_epi16(8);
62       d0 = _mm_add_epi16(d0, eight);
63       d0 = _mm_srai_epi16(d0, 4);
64       d0 = _mm_packus_epi16(d0, d0);
65       __m128i out0 = _mm_lddqu_si128((__m128i *)out);
66       __m128i n0 = _mm_set1_epi8(n_out);
67       __m128i mask = _mm_cmpgt_epi8(n0, iden);
68       out0 = _mm_blendv_epi8(out0, d0, mask);
69       _mm_storel_epi64((__m128i *)out, out0);
70       __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
71       in0 = _mm_alignr_epi8(in1, in0, 8);
72       in += 8;
73       out += 8;
74       len -= n_out;
75     }
76   } else {  // 5-tap filter
77     __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
78     __m128i two = _mm_set1_epi8(2);
79     __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
80     __m128i shuf_b = _mm_add_epi8(shuf_a, two);
81     __m128i shuf_c = _mm_add_epi8(shuf_b, two);
82     __m128i shuf_d = _mm_add_epi8(shuf_c, two);
83     __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
84     __m128i in0 = _mm_lddqu_si128((__m128i *)in);
85     while (len > 0) {
86       int n_out = (len < 8) ? len : 8;
87       __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
88       __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
89       __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
90       __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
91       d0 = _mm_maddubs_epi16(d0, coef0);
92       d1 = _mm_maddubs_epi16(d1, coef0);
93       d2 = _mm_maddubs_epi16(d2, coef0);
94       d3 = _mm_maddubs_epi16(d3, coef0);
95       d0 = _mm_hadd_epi16(d0, d1);
96       d2 = _mm_hadd_epi16(d2, d3);
97       d0 = _mm_hadd_epi16(d0, d2);
98       __m128i eight = _mm_set1_epi16(8);
99       d0 = _mm_add_epi16(d0, eight);
100       d0 = _mm_srai_epi16(d0, 4);
101       d0 = _mm_packus_epi16(d0, d0);
102       __m128i out0 = _mm_lddqu_si128((__m128i *)out);
103       __m128i n0 = _mm_set1_epi8(n_out);
104       __m128i mask = _mm_cmpgt_epi8(n0, iden);
105       out0 = _mm_blendv_epi8(out0, d0, mask);
106       _mm_storel_epi64((__m128i *)out, out0);
107       __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
108       in0 = _mm_alignr_epi8(in1, in0, 8);
109       in += 8;
110       out += 8;
111       len -= n_out;
112     }
113   }
114 }
115 
av1_upsample_intra_edge_sse4_1(uint8_t * p,int sz)116 void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
117   // interpolate half-sample positions
118   assert(sz <= 24);
119 
120   DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
121     { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
122   };
123 
124   DECLARE_ALIGNED(
125       16, static const int8_t,
126       v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
127                           { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
128 
129   // Extend first/last samples (upper-left p[-1], last p[sz-1])
130   // to support 4-tap filter
131   p[-2] = p[-1];
132   p[sz] = p[sz - 1];
133 
134   uint8_t *in = &p[-2];
135   uint8_t *out = &p[-2];
136 
137   int n = sz + 1;  // Input length including upper-left sample
138 
139   __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
140   __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
141 
142   __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
143   __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
144   __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
145 
146   while (n > 0) {
147     __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
148     __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
149     __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
150     __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
151     __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
152     d0 = _mm_maddubs_epi16(d0, coef0);
153     d1 = _mm_maddubs_epi16(d1, coef0);
154     d2 = _mm_maddubs_epi16(d2, coef0);
155     d3 = _mm_maddubs_epi16(d3, coef0);
156     d0 = _mm_hadd_epi16(d0, d1);
157     d2 = _mm_hadd_epi16(d2, d3);
158     __m128i eight = _mm_set1_epi16(8);
159     d0 = _mm_add_epi16(d0, eight);
160     d2 = _mm_add_epi16(d2, eight);
161     d0 = _mm_srai_epi16(d0, 4);
162     d2 = _mm_srai_epi16(d2, 4);
163     d0 = _mm_packus_epi16(d0, d2);
164     __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
165     __m128i out0 = _mm_unpacklo_epi8(in1, d0);
166     __m128i out1 = _mm_unpackhi_epi8(in1, d0);
167     _mm_storeu_si128((__m128i *)&out[0], out0);
168     _mm_storeu_si128((__m128i *)&out[16], out1);
169     in0 = in16;
170     in16 = _mm_setzero_si128();
171     out += 32;
172     n -= 16;
173   }
174 }
175 
176 #if CONFIG_AV1_HIGHBITDEPTH
177 
av1_highbd_filter_intra_edge_sse4_1(uint16_t * p,int sz,int strength)178 void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
179   if (!strength) return;
180 
181   DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
182     { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
183     { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
184     { 2, 4, 2, 4, 2, 4, 2, 4 }   // strength 3: 2,4,4,4,2
185   };
186 
187   DECLARE_ALIGNED(16, static const int16_t,
188                   v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
189 
190   // Extend the first and last samples to simplify the loop for the 5-tap case
191   p[-1] = p[0];
192   __m128i last = _mm_set1_epi16(p[sz - 1]);
193   _mm_storeu_si128((__m128i *)&p[sz], last);
194 
195   // Adjust input pointer for filter support area
196   uint16_t *in = (strength == 3) ? p - 1 : p;
197 
198   // Avoid modifying first sample
199   uint16_t *out = p + 1;
200   int len = sz - 1;
201 
202   const int use_3tap_filter = (strength < 3);
203 
204   if (use_3tap_filter) {
205     __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
206     __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
207     __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
208     __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
209     while (len > 0) {
210       int n_out = (len < 8) ? len : 8;
211       __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
212       __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
213       __m128i in02 = _mm_add_epi16(in0, in2);
214       __m128i d0 = _mm_unpacklo_epi16(in02, in1);
215       __m128i d1 = _mm_unpackhi_epi16(in02, in1);
216       d0 = _mm_mullo_epi16(d0, coef0);
217       d1 = _mm_mullo_epi16(d1, coef0);
218       d0 = _mm_hadd_epi16(d0, d1);
219       __m128i eight = _mm_set1_epi16(8);
220       d0 = _mm_add_epi16(d0, eight);
221       d0 = _mm_srli_epi16(d0, 4);
222       __m128i out0 = _mm_lddqu_si128((__m128i *)out);
223       __m128i n0 = _mm_set1_epi16(n_out);
224       __m128i mask = _mm_cmpgt_epi16(n0, iden);
225       out0 = _mm_blendv_epi8(out0, d0, mask);
226       _mm_storeu_si128((__m128i *)out, out0);
227       in += 8;
228       in0 = in8;
229       in8 = _mm_lddqu_si128((__m128i *)&in[8]);
230       out += 8;
231       len -= n_out;
232     }
233   } else {  // 5-tap filter
234     __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
235     __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
236     __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
237     __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
238     while (len > 0) {
239       int n_out = (len < 8) ? len : 8;
240       __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
241       __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
242       __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
243       __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
244       __m128i in04 = _mm_add_epi16(in0, in4);
245       __m128i in123 = _mm_add_epi16(in1, in2);
246       in123 = _mm_add_epi16(in123, in3);
247       __m128i d0 = _mm_unpacklo_epi16(in04, in123);
248       __m128i d1 = _mm_unpackhi_epi16(in04, in123);
249       d0 = _mm_mullo_epi16(d0, coef0);
250       d1 = _mm_mullo_epi16(d1, coef0);
251       d0 = _mm_hadd_epi16(d0, d1);
252       __m128i eight = _mm_set1_epi16(8);
253       d0 = _mm_add_epi16(d0, eight);
254       d0 = _mm_srli_epi16(d0, 4);
255       __m128i out0 = _mm_lddqu_si128((__m128i *)out);
256       __m128i n0 = _mm_set1_epi16(n_out);
257       __m128i mask = _mm_cmpgt_epi16(n0, iden);
258       out0 = _mm_blendv_epi8(out0, d0, mask);
259       _mm_storeu_si128((__m128i *)out, out0);
260       in += 8;
261       in0 = in8;
262       in8 = _mm_lddqu_si128((__m128i *)&in[8]);
263       out += 8;
264       len -= n_out;
265     }
266   }
267 }
268 
av1_highbd_upsample_intra_edge_sse4_1(uint16_t * p,int sz,int bd)269 void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
270   // interpolate half-sample positions
271   assert(sz <= 24);
272 
273   DECLARE_ALIGNED(16, static const int16_t,
274                   kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };
275 
276   // Extend first/last samples (upper-left p[-1], last p[sz-1])
277   // to support 4-tap filter
278   p[-2] = p[-1];
279   p[sz] = p[sz - 1];
280 
281   uint16_t *in = &p[-2];
282   uint16_t *out = in;
283   int n = sz + 1;
284 
285   __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
286   __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
287   __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
288   __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
289 
290   while (n > 0) {
291     __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
292     __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
293     __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
294     __m128i sum0 = _mm_add_epi16(in0, in3);
295     __m128i sum1 = _mm_add_epi16(in1, in2);
296     __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
297     __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
298     __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
299     d0 = _mm_madd_epi16(d0, coef0);
300     d1 = _mm_madd_epi16(d1, coef0);
301     __m128i eight = _mm_set1_epi32(8);
302     d0 = _mm_add_epi32(d0, eight);
303     d1 = _mm_add_epi32(d1, eight);
304     d0 = _mm_srai_epi32(d0, 4);
305     d1 = _mm_srai_epi32(d1, 4);
306     d0 = _mm_packus_epi32(d0, d1);
307     __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
308     d0 = _mm_min_epi16(d0, max0);
309     __m128i out0 = _mm_unpacklo_epi16(in1, d0);
310     __m128i out1 = _mm_unpackhi_epi16(in1, d0);
311     _mm_storeu_si128((__m128i *)&out[0], out0);
312     _mm_storeu_si128((__m128i *)&out[8], out1);
313     in0 = in8;
314     in8 = in16;
315     in16 = in24;
316     in24 = _mm_setzero_si128();
317     out += 16;
318     n -= 8;
319   }
320 }
321 
322 #endif  // CONFIG_AV1_HIGHBITDEPTH
323