1 /*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <stdint.h>
18 #include <x86intrin.h>
19
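/* SSSE3/SSE4.1 code paths for the RenderScript CPU intrinsics on x86. Each helper
 * below prefers the SSE4.1 instruction and falls back to an SSSE3 sequence. */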
20 namespace renderscript {
21
/* Zero-extend the low four packed 8-bit integers (in the LSBs) into packed 32-bit integers */
static inline __m128i cvtepu8_epi32(__m128i x) {
24 #if defined(__SSE4_1__)
25 return _mm_cvtepu8_epi32(x);
26 #elif defined(__SSSE3__)
27 const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
28 x = _mm_shuffle_epi8(x, M8to32);
29 return x;
30 #else
31 # error "Require at least SSSE3"
32 #endif
33 }
34
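/* Pack signed 32-bit integers into unsigned 16-bit integers with saturation
 * (_mm_packus_epi32 equivalent for SSSE3-only targets) */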
static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
36 #if defined(__SSE4_1__)
37 return _mm_packus_epi32(lo, hi);
38 #elif defined(__SSSE3__)
39 const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
40 const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
41 const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
42 const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
43 lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
44 lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
45 hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
46 hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
47 return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
48 _mm_shuffle_epi8(hi, M32to16H));
49 #else
50 # error "Require at least SSSE3"
51 #endif
52 }
53
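/* Multiply packed 32-bit integers, keeping the low 32 bits of each product
 * (_mm_mullo_epi32 equivalent for SSSE3-only targets) */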
static inline __m128i mullo_epi32(__m128i x, __m128i y) {
55 #if defined(__SSE4_1__)
56 return _mm_mullo_epi32(x, y);
57 #elif defined(__SSSE3__)
58 const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
59 __m128i even = _mm_mul_epu32(x, y);
60 __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
61 _mm_srli_si128(y, 4));
62 even = _mm_and_si128(even, Meven);
63 odd = _mm_and_si128(odd, Meven);
64 return _mm_or_si128(even, _mm_slli_si128(odd, 4));
65 #else
66 # error "Require at least SSSE3"
67 #endif
68 }
69
/* 'mask' must be packed 8-bit values, each either 0x00 or 0xff */
static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
72 #if defined(__SSE4_1__)
73 return _mm_blendv_epi8(x, y, mask);
74 #elif defined(__SSSE3__)
75 return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
76 #else
77 # error "Require at least SSSE3"
78 #endif
79 }
80
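/* 3x3 convolution over 4-byte-per-pixel data: y0..y2 are the three source rows,
 * coef holds the nine 16-bit kernel weights in Q8 fixed point (the accumulated
 * sum is shifted right by 8), and each iteration emits two output pixels. */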
extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const short *coef, uint32_t count) {
84 __m128i x;
85 __m128i c0, c2, c4, c6, c8;
86 __m128i r0, r1, r2;
87 __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
88 __m128i o0, o1;
89 uint32_t i;
90
91 x = _mm_loadl_epi64((const __m128i *)(coef+0));
92 c0 = _mm_shuffle_epi32(x, 0x00);
93 c2 = _mm_shuffle_epi32(x, 0x55);
94 x = _mm_loadl_epi64((const __m128i *)(coef+4));
95 c4 = _mm_shuffle_epi32(x, 0x00);
96 c6 = _mm_shuffle_epi32(x, 0x55);
97 x = _mm_loadl_epi64((const __m128i *)(coef+8));
98 c8 = _mm_shuffle_epi32(x, 0x00);
99
100 for (i = 0; i < count; ++i) {
101
102 p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
103 p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
104 p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
105 p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
106 p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
107 p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
108 p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
109 p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
110 p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
111 p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
112 p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
113 p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
114
115 o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
116 o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
117
118 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
119 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
120
121 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
122 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
123
124 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
125 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
126
127 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
128 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
129
130 o0 = _mm_srai_epi32(o0, 8);
131 o1 = _mm_srai_epi32(o1, 8);
132
133 o0 = packus_epi32(o0, o1);
134 o0 = _mm_packus_epi16(o0, o0);
135 _mm_storel_epi64((__m128i *)dst, o0);
136
137 y0 = (const char *)y0 + 8;
138 y1 = (const char *)y1 + 8;
139 y2 = (const char *)y2 + 8;
140 dst = (char *)dst + 8;
141 }
142 }
143
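/* Apply a 4x4 color matrix of 16-bit Q8 coefficients to 4-byte pixels,
 * four pixels per iteration. */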
void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
146 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
147 14, 10, 6, 2,
148 13, 9, 5, 1,
149 12, 8, 4, 0);
150
151 const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
152 const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
153 __m128i c0, c1, c2, c3;
154 __m128i i4, o4;
155 __m128i xy, zw;
156 __m128i x2, y2, z2, w2;
157 uint32_t i;
158
159 c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
160 c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
161 c0 = _mm_unpacklo_epi16(c0, c1);
162
163 c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
164 c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
165 c2 = _mm_unpacklo_epi16(c2, c3);
166
167 for (i = 0; i < count; ++i) {
168 i4 = _mm_load_si128((const __m128i *)src);
169 xy = _mm_shuffle_epi8(i4, Mxy);
170 zw = _mm_shuffle_epi8(i4, Mzw);
171
172 x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
173 y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
174 z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
175 w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
176
177 x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
178 y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
179 z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
180 w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
181
182 x2 = _mm_srai_epi32(x2, 8);
183 y2 = _mm_srai_epi32(y2, 8);
184 z2 = _mm_srai_epi32(z2, 8);
185 w2 = _mm_srai_epi32(w2, 8);
186
187 x2 = packus_epi32(x2, y2);
188 z2 = packus_epi32(z2, w2);
189 o4 = _mm_packus_epi16(x2, z2);
190
191 o4 = _mm_shuffle_epi8(o4, T4x4);
192 _mm_storeu_si128((__m128i *)dst, o4);
193
194 src = (const char *)src + 16;
195 dst = (char *)dst + 16;
196 }
197 }
198
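/* 3x3 color matrix variant: only the first three matrix rows are applied and
 * the fourth channel (alpha) is passed through unchanged. */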
void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
201 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
202 14, 10, 6, 2,
203 13, 9, 5, 1,
204 12, 8, 4, 0);
205
206 const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
207 const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
208
209 __m128i c0, c1, c2, c3;
210 __m128i i4, o4;
211 __m128i xy, zw;
212 __m128i x2, y2, z2, w2;
213 uint32_t i;
214
215 c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
216 c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
217 c0 = _mm_unpacklo_epi16(c0, c1);
218
219 c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
220 c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
221 c2 = _mm_unpacklo_epi16(c2, c3);
222
223 for (i = 0; i < count; ++i) {
224 i4 = _mm_loadu_si128((const __m128i *)src);
225 xy = _mm_shuffle_epi8(i4, Mxy);
226 zw = _mm_shuffle_epi8(i4, Mzw);
227
228 x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
229 y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
230 z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
231
232 x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
233 y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
234 z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
235
236 x2 = _mm_srai_epi32(x2, 8);
237 y2 = _mm_srai_epi32(y2, 8);
238 z2 = _mm_srai_epi32(z2, 8);
239 w2 = _mm_srli_epi32(zw, 16);
240
241 x2 = packus_epi32(x2, y2);
242 z2 = packus_epi32(z2, w2);
243 o4 = _mm_packus_epi16(x2, z2);
244
245 o4 = _mm_shuffle_epi8(o4, T4x4);
246 _mm_storeu_si128((__m128i *)dst, o4);
247
248 src = (const char *)src + 16;
249 dst = (char *)dst + 16;
250 }
251 }
252
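/* Dot-product color matrix variant: a single matrix row is applied and the
 * result is replicated into the first three channels; alpha is passed through. */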
void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
255 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
256 14, 10, 6, 2,
257 13, 9, 5, 1,
258 12, 8, 4, 0);
259 const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
260 const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
261 __m128i c0, c1, c2, c3;
262 __m128i i4, o4;
263 __m128i xy, zw;
264 __m128i x2, y2, z2, w2;
265 uint32_t i;
266
267 c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
268 c0 = _mm_shufflelo_epi16(c0, 0);
269 c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
270 c1 = _mm_shufflelo_epi16(c1, 0);
271 c0 = _mm_unpacklo_epi16(c0, c1);
272
273 c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
274 c2 = _mm_shufflelo_epi16(c2, 0);
275 c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
276 c3 = _mm_shufflelo_epi16(c3, 0);
277 c2 = _mm_unpacklo_epi16(c2, c3);
278
279 for (i = 0; i < count; ++i) {
280 i4 = _mm_loadu_si128((const __m128i *)src);
281
282 xy = _mm_shuffle_epi8(i4, Mxy);
283 zw = _mm_shuffle_epi8(i4, Mzw);
284
285 x2 = _mm_madd_epi16(xy, c0);
286 x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
287
288 x2 = _mm_srai_epi32(x2, 8);
289 y2 = x2;
290 z2 = x2;
291 w2 = _mm_srli_epi32(zw, 16);
292
293 x2 = packus_epi32(x2, y2);
294 z2 = packus_epi32(z2, w2);
295 o4 = _mm_packus_epi16(x2, z2);
296
297 o4 = _mm_shuffle_epi8(o4, T4x4);
298 _mm_storeu_si128((__m128i *)dst, o4);
299
300 src = (const char *)src + 16;
301 dst = (char *)dst + 16;
302 }
303 }
304
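/* Vertical blur pass: for two adjacent 4-channel pixels, accumulate rct source
 * rows (spaced 'stride' bytes apart) weighted by the float kernel in gptr, and
 * store the sums as two float4 values per iteration. */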
void rsdIntrinsicBlurVFU4_K(void *dst,
                            const void *pin, int stride, const void *gptr,
                            int rct, int x1, int x2) {
308 const char *pi;
309 __m128i pi0, pi1;
310 __m128 pf0, pf1;
311 __m128 bp0, bp1;
312 __m128 x;
313 int r;
314
315 for (; x1 < x2; x1 += 2) {
316 pi = (const char *)pin + (x1 << 2);
317 bp0 = _mm_setzero_ps();
318 bp1 = _mm_setzero_ps();
319
320 for (r = 0; r < rct; ++r) {
321 x = _mm_load_ss((const float *)gptr + r);
322 x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
323
324 pi0 = _mm_cvtsi32_si128(*(const int *)pi);
325 pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
326
327 pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
328 pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
329
330 bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
331 bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
332
333 pi += stride;
334 }
335
336 _mm_storeu_ps((float *)dst, bp0);
337 _mm_storeu_ps((float *)dst + 4, bp1);
338 dst = (char *)dst + 32;
339 }
340 }
341
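/* Horizontal blur pass over float4 intermediates: accumulate rct weighted
 * samples per output pixel, convert to integers and pack the low byte of each
 * channel into one 4-byte output pixel per iteration. */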
void rsdIntrinsicBlurHFU4_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
345 const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
346 const float *pi;
347 __m128 pf, x, y;
348 __m128i o;
349 int r;
350
351 for (; x1 < x2; ++x1) {
        /* rct is defined as 2*r+1 by the caller */
353 x = _mm_load_ss((const float *)gptr);
354 x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
355
356 pi = (const float *)pin + (x1 << 2);
357 pf = _mm_mul_ps(x, _mm_load_ps(pi));
358
359 for (r = 1; r < rct; r += 2) {
360 x = _mm_load_ss((const float *)gptr + r);
361 y = _mm_load_ss((const float *)gptr + r + 1);
362 x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
363 y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
364
365 pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
366 pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
367 }
368
369 o = _mm_cvtps_epi32(pf);
370 *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
371 dst = (char *)dst + 4;
372 }
373 }
374
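/* Single-channel horizontal blur pass: processes four float samples at a time,
 * using _mm_alignr_epi8 to form the shifted windows, and emits four output
 * bytes per iteration. */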
void rsdIntrinsicBlurHFU1_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
378 const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
379 const float *pi;
380 __m128 pf, g0, g1, g2, g3, gx, p0, p1;
381 __m128i o;
382 int r;
383
384 for (; x1 < x2; x1+=4) {
385 g0 = _mm_load_ss((const float *)gptr);
386 g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
387
388 pi = (const float *)pin + x1;
389 pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
390
391 for (r = 1; r < rct; r += 4) {
392 gx = _mm_loadu_ps((const float *)gptr + r);
393 p0 = _mm_loadu_ps(pi + r);
394 p1 = _mm_loadu_ps(pi + r + 4);
395
396 g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
397 pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
398 g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
399 pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_alignr_epi8(p1, p0, 4)));
400 g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
401 pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_alignr_epi8(p1, p0, 8)));
402 g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
403 pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_alignr_epi8(p1, p0, 12)));
404 }
405
406 o = _mm_cvtps_epi32(pf);
407 *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
408 dst = (char *)dst + 4;
409 }
410 }
411
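/* YUV to RGBA conversion for a semi-planar source with interleaved chroma
 * bytes (V sample first in each pair). 'param' carries the integer
 * coefficients and biases (the commented values are the usual
 * 298/409/-100/516/-208 set); four pixels are produced per iteration and
 * alpha is forced to 255. */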
void rsdIntrinsicYuv_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
415 __m128i biasY, biasUV;
416 __m128i c0, c1, c2, c3, c4;
417
418 biasY = _mm_set1_epi32(param[8]); /* 16 */
419 biasUV = _mm_set1_epi32(param[16]); /* 128 */
420
421 c0 = _mm_set1_epi32(param[0]); /* 298 */
422 c1 = _mm_set1_epi32(param[1]); /* 409 */
423 c2 = _mm_set1_epi32(param[2]); /* -100 */
424 c3 = _mm_set1_epi32(param[3]); /* 516 */
425 c4 = _mm_set1_epi32(param[4]); /* -208 */
426
427 __m128i Y, UV, U, V, R, G, B, A;
428
429 A = _mm_set1_epi32(255);
430 uint32_t i;
431
432 for (i = 0; i < (count << 1); ++i) {
433 Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
434 UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
435
436 Y = _mm_sub_epi32(Y, biasY);
437 UV = _mm_sub_epi32(UV, biasUV);
438
439 U = _mm_shuffle_epi32(UV, 0xf5);
440 V = _mm_shuffle_epi32(UV, 0xa0);
441
442 Y = mullo_epi32(Y, c0);
443
444 R = _mm_add_epi32(Y, mullo_epi32(V, c1));
445 R = _mm_add_epi32(R, biasUV);
446 R = _mm_srai_epi32(R, 8);
447
448 G = _mm_add_epi32(Y, mullo_epi32(U, c2));
449 G = _mm_add_epi32(G, mullo_epi32(V, c4));
450 G = _mm_add_epi32(G, biasUV);
451 G = _mm_srai_epi32(G, 8);
452
453 B = _mm_add_epi32(Y, mullo_epi32(U, c3));
454 B = _mm_add_epi32(B, biasUV);
455 B = _mm_srai_epi32(B, 8);
456
457 __m128i y1, y2, y3, y4;
458
459 y1 = packus_epi32(R, G);
460 y2 = packus_epi32(B, A);
461 y3 = _mm_packus_epi16(y1, y2);
462 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
463 14, 10, 6, 2,
464 13, 9, 5, 1,
465 12, 8, 4, 0);
466 y4 = _mm_shuffle_epi8(y3, T4x4);
467 _mm_storeu_si128((__m128i *)dst, y4);
468 pY += 4;
469 pUV += 4;
470 dst = (__m128i *)dst + 1;
471 }
472 }
473
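/* Same conversion as rsdIntrinsicYuv_K, but with the U and V samples in the
 * opposite order within the interleaved chroma plane. */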
void rsdIntrinsicYuvR_K(void *dst,
                        const unsigned char *pY, const unsigned char *pUV,
                        uint32_t count, const short *param) {
477 __m128i biasY, biasUV;
478 __m128i c0, c1, c2, c3, c4;
479
480 biasY = _mm_set1_epi32(param[8]); /* 16 */
481 biasUV = _mm_set1_epi32(param[16]); /* 128 */
482
483 c0 = _mm_set1_epi32(param[0]); /* 298 */
484 c1 = _mm_set1_epi32(param[1]); /* 409 */
485 c2 = _mm_set1_epi32(param[2]); /* -100 */
486 c3 = _mm_set1_epi32(param[3]); /* 516 */
487 c4 = _mm_set1_epi32(param[4]); /* -208 */
488
489 __m128i Y, UV, U, V, R, G, B, A;
490
491 A = _mm_set1_epi32(255);
492 uint32_t i;
493
494 for (i = 0; i < (count << 1); ++i) {
495 Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
496 UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
497
498 Y = _mm_sub_epi32(Y, biasY);
499 UV = _mm_sub_epi32(UV, biasUV);
500
501 V = _mm_shuffle_epi32(UV, 0xf5);
502 U = _mm_shuffle_epi32(UV, 0xa0);
503
504 Y = mullo_epi32(Y, c0);
505
506 R = _mm_add_epi32(Y, mullo_epi32(V, c1));
507 R = _mm_add_epi32(R, biasUV);
508 R = _mm_srai_epi32(R, 8);
509
510 G = _mm_add_epi32(Y, mullo_epi32(U, c2));
511 G = _mm_add_epi32(G, mullo_epi32(V, c4));
512 G = _mm_add_epi32(G, biasUV);
513 G = _mm_srai_epi32(G, 8);
514
515 B = _mm_add_epi32(Y, mullo_epi32(U, c3));
516 B = _mm_add_epi32(B, biasUV);
517 B = _mm_srai_epi32(B, 8);
518
519 __m128i y1, y2, y3, y4;
520
521 y1 = packus_epi32(R, G);
522 y2 = packus_epi32(B, A);
523 y3 = _mm_packus_epi16(y1, y2);
524 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
525 14, 10, 6, 2,
526 13, 9, 5, 1,
527 12, 8, 4, 0);
528 y4 = _mm_shuffle_epi8(y3, T4x4);
529 _mm_storeu_si128((__m128i *)dst, y4);
530 pY += 4;
531 pUV += 4;
532 dst = (__m128i *)dst + 1;
533 }
534 }
535
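/* YUV to RGBA conversion for fully planar sources: pY, pU and pV are separate
 * planes; otherwise identical to the kernels above. */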
void rsdIntrinsicYuv2_K(void *dst,
                        const unsigned char *pY, const unsigned char *pU,
                        const unsigned char *pV, uint32_t count, const short *param) {
539 __m128i biasY, biasUV;
540 __m128i c0, c1, c2, c3, c4;
541
542 biasY = _mm_set1_epi32(param[8]); /* 16 */
543 biasUV = _mm_set1_epi32(param[16]); /* 128 */
544
545 c0 = _mm_set1_epi32(param[0]); /* 298 */
546 c1 = _mm_set1_epi32(param[1]); /* 409 */
547 c2 = _mm_set1_epi32(param[2]); /* -100 */
548 c3 = _mm_set1_epi32(param[3]); /* 516 */
549 c4 = _mm_set1_epi32(param[4]); /* -208 */
550
551 __m128i Y, U, V, R, G, B, A;
552
553 A = _mm_set1_epi32(255);
554 uint32_t i;
555
556 for (i = 0; i < (count << 1); ++i) {
557 Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
558 U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
559 V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
560
561 Y = _mm_sub_epi32(Y, biasY);
562 U = _mm_sub_epi32(U, biasUV);
563 V = _mm_sub_epi32(V, biasUV);
564
565 Y = mullo_epi32(Y, c0);
566
567 R = _mm_add_epi32(Y, mullo_epi32(V, c1));
568 R = _mm_add_epi32(R, biasUV);
569 R = _mm_srai_epi32(R, 8);
570
571 G = _mm_add_epi32(Y, mullo_epi32(U, c2));
572 G = _mm_add_epi32(G, mullo_epi32(V, c4));
573 G = _mm_add_epi32(G, biasUV);
574 G = _mm_srai_epi32(G, 8);
575
576 B = _mm_add_epi32(Y, mullo_epi32(U, c3));
577 B = _mm_add_epi32(B, biasUV);
578 B = _mm_srai_epi32(B, 8);
579
580 __m128i y1, y2, y3, y4;
581
582 y1 = packus_epi32(R, G);
583 y2 = packus_epi32(B, A);
584 y3 = _mm_packus_epi16(y1, y2);
585 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
586 14, 10, 6, 2,
587 13, 9, 5, 1,
588 12, 8, 4, 0);
589 y4 = _mm_shuffle_epi8(y3, T4x4);
590 _mm_storeu_si128((__m128i *)dst, y4);
591 pY += 4;
592 pU += 4;
593 pV += 4;
594 dst = (__m128i *)dst + 1;
595 }
596 }
597
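/* 5x5 convolution over 4-byte-per-pixel data: y0..y4 are the five source rows,
 * coef holds the 25 16-bit kernel weights in Q8 fixed point, and each iteration
 * emits four output pixels. */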
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const void *y3, const void *y4,
                                          const short *coef, uint32_t count) {
602 __m128i x;
603 __m128i c0, c2, c4, c6, c8, c10, c12;
604 __m128i c14, c16, c18, c20, c22, c24;
605 __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
606 __m128i p0, p1, p2, p3, p4, p5, p6, p7;
607 __m128i p8, p9, p10, p11, p12, p13, p14, p15;
608 __m128i p16, p17, p18, p19, p20, p21, p22, p23;
609 __m128i p24, p25, p26, p27, p28, p29, p30, p31;
610 __m128i p32, p33, p34, p35, p36, p37, p38, p39;
611 __m128i o0, o1, o2, o3;
612 uint32_t i;
613
614 x = _mm_loadl_epi64((const __m128i *)(coef+0));
615 c0 = _mm_shuffle_epi32(x, 0x00);
616 c2 = _mm_shuffle_epi32(x, 0x55);
617
618 x = _mm_loadl_epi64((const __m128i *)(coef+4));
619 c4 = _mm_shuffle_epi32(x, 0x00);
620 c6 = _mm_shuffle_epi32(x, 0x55);
621
622 x = _mm_loadl_epi64((const __m128i *)(coef+8));
623 c8 = _mm_shuffle_epi32(x, 0x00);
624 c10 = _mm_shuffle_epi32(x, 0x55);
625
626 x = _mm_loadl_epi64((const __m128i *)(coef+12));
627 c12 = _mm_shuffle_epi32(x, 0x00);
628 c14 = _mm_shuffle_epi32(x, 0x55);
629
630 x = _mm_loadl_epi64((const __m128i *)(coef+16));
631 c16 = _mm_shuffle_epi32(x, 0x00);
632 c18 = _mm_shuffle_epi32(x, 0x55);
633
634 x = _mm_loadl_epi64((const __m128i *)(coef+20));
635 c20 = _mm_shuffle_epi32(x, 0x00);
636 c22 = _mm_shuffle_epi32(x, 0x55);
637
638 x = _mm_loadl_epi64((const __m128i *)(coef+24));
639 c24 = _mm_shuffle_epi32(x, 0x00);
640
641 for (i = 0; i < count; ++i) {
642
643 p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
644 p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
645 p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
646 p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
647 p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
648 p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
649 p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
650 p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
651
652 p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
653 p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
654 p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
655 p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
656 p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
657 p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
658 p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
659 p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
660
661 p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
662 p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
663 p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
664 p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
665 p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
666 p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
667 p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
668 p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
669
670 p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
671 p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
672 p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
673 p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
674 p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
675 p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
676 p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
677 p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
678
679 p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
680 p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
681 p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
682 p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
683 p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
684 p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
685 p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
686 p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
687
688 o0 = _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1), c0);
689 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c2));
690 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8), c4));
691 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10), c6));
692 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c8));
693 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
694 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
695 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
696 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
697 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
698 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
699 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
700 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
701 o0 = _mm_srai_epi32(o0, 8);
702
703 o1 = _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2), c0);
704 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c2));
705 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9), c4));
706 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11), c6));
707 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13), c8));
708 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
709 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
710 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
711 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
712 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
713 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
714 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
715 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
716 o1 = _mm_srai_epi32(o1, 8);
717
718 o2 = _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3), c0);
719 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5), c2));
720 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10), c4));
721 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c6));
722 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14), c8));
723 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
724 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
725 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
726 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
727 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
728 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
729 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
730 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
731 o2 = _mm_srai_epi32(o2, 8);
732
733 o3 = _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c0);
734 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6), c2));
735 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11), c4));
736 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c6));
737 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15), c8));
738 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
739 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
740 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
741 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
742 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
743 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
744 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
745 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
746 o3 = _mm_srai_epi32(o3, 8);
747
748 o0 = packus_epi32(o0, o1);
749 o2 = packus_epi32(o2, o3);
750 o0 = _mm_packus_epi16(o0, o2);
751 _mm_storeu_si128((__m128i *)dst, o0);
752
753 y0 = (const char *)y0 + 16;
754 y1 = (const char *)y1 + 16;
755 y2 = (const char *)y2 + 16;
756 y3 = (const char *)y3 + 16;
757 y4 = (const char *)y4 + 16;
758 dst = (char *)dst + 16;
759 }
760 }
761
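/* The blend kernels below each process eight 4-byte pixels per loop iteration
 * (count8 counts pairs of 16-byte blocks) and approximate division by 255 with
 * a shift right by 8. */
/* Source-over: dst = src + (dst * (255 - src.a)) >> 8 */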
void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
763 __m128i all1s, ina, ins;
764 __m128i in0, in1, out0, out1;
765 __m128i t0, t1, t2, t3;
766 uint32_t i;
767
768 all1s = _mm_set1_epi16(255);
769
770 for (i = 0; i < count8; ++i) {
771 in0 = _mm_loadu_si128((const __m128i *)src);
772 in1 = _mm_loadu_si128((const __m128i *)src + 1);
773 out0 = _mm_loadu_si128((const __m128i *)dst);
774 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
775
776 ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
777 ina = _mm_shufflelo_epi16(ins, 0xFF);
778 ina = _mm_shufflehi_epi16(ina, 0xFF);
779 t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
780 t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
781 t0 = _mm_srli_epi16(t0, 8);
782 t0 = _mm_add_epi16(t0, ins);
783
784 ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
785 ina = _mm_shufflelo_epi16(ins, 0xFF);
786 ina = _mm_shufflehi_epi16(ina, 0xFF);
787 t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
788 t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
789 t1 = _mm_srli_epi16(t1, 8);
790 t1 = _mm_add_epi16(t1, ins);
791
792 ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
793 ina = _mm_shufflelo_epi16(ins, 0xFF);
794 ina = _mm_shufflehi_epi16(ina, 0xFF);
795 t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
796 t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
797 t2 = _mm_srli_epi16(t2, 8);
798 t2 = _mm_add_epi16(t2, ins);
799
800 ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
801 ina = _mm_shufflelo_epi16(ins, 0xFF);
802 ina = _mm_shufflehi_epi16(ina, 0xFF);
803 t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
804 t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
805 t3 = _mm_srli_epi16(t3, 8);
806 t3 = _mm_add_epi16(t3, ins);
807
808 t0 = _mm_packus_epi16(t0, t1);
809 t2 = _mm_packus_epi16(t2, t3);
810 _mm_storeu_si128((__m128i *)dst, t0);
811 _mm_storeu_si128((__m128i *)dst + 1, t2);
812
813 src = (const __m128i *)src + 2;
814 dst = (__m128i *)dst + 2;
815 }
816 }
817
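/* Destination-over: dst = dst + (src * (255 - dst.a)) >> 8 */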
void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
819 __m128i all1s, outa, outs;
820 __m128i in0, in1, out0, out1;
821 __m128i t0, t1, t2, t3;
822 uint32_t i;
823
824 all1s = _mm_set1_epi16(255);
825
826 for (i = 0; i < count8; ++i) {
827 in0 = _mm_loadu_si128((const __m128i *)src);
828 in1 = _mm_loadu_si128((const __m128i *)src + 1);
829 out0 = _mm_loadu_si128((const __m128i *)dst);
830 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
831
832
833 outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
834 outa = _mm_shufflelo_epi16(outs, 0xFF);
835 outa = _mm_shufflehi_epi16(outa, 0xFF);
836 t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
837 t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
838 t0 = _mm_srli_epi16(t0, 8);
839 t0 = _mm_add_epi16(t0, outs);
840
841 outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
842 outa = _mm_shufflelo_epi16(outs, 0xFF);
843 outa = _mm_shufflehi_epi16(outa, 0xFF);
844 t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
845 t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
846 t1 = _mm_srli_epi16(t1, 8);
847 t1 = _mm_add_epi16(t1, outs);
848
849 outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
850 outa = _mm_shufflelo_epi16(outs, 0xFF);
851 outa = _mm_shufflehi_epi16(outa, 0xFF);
852 t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
853 t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
854 t2 = _mm_srli_epi16(t2, 8);
855 t2 = _mm_add_epi16(t2, outs);
856
857 outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
858 outa = _mm_shufflelo_epi16(outs, 0xFF);
859 outa = _mm_shufflehi_epi16(outa, 0xFF);
860 t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
861 t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
862 t3 = _mm_srli_epi16(t3, 8);
863 t3 = _mm_add_epi16(t3, outs);
864
865 t0 = _mm_packus_epi16(t0, t1);
866 t2 = _mm_packus_epi16(t2, t3);
867 _mm_storeu_si128((__m128i *)dst, t0);
868 _mm_storeu_si128((__m128i *)dst + 1, t2);
869
870 src = (const __m128i *)src + 2;
871 dst = (__m128i *)dst + 2;
872 }
873 }
874
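/* Source-in: dst = (src * dst.a) >> 8 */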
void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
876 __m128i outa;
877 __m128i in0, in1, out0, out1;
878 __m128i t0, t1, t2, t3;
879 uint32_t i;
880
881 for (i = 0; i < count8; ++i) {
882 in0 = _mm_loadu_si128((const __m128i *)src);
883 in1 = _mm_loadu_si128((const __m128i *)src + 1);
884 out0 = _mm_loadu_si128((const __m128i *)dst);
885 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
886
887 outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
888 outa = _mm_shufflelo_epi16(outa, 0xFF);
889 outa = _mm_shufflehi_epi16(outa, 0xFF);
890 t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
891 t0 = _mm_mullo_epi16(t0, outa);
892 t0 = _mm_srli_epi16(t0, 8);
893
894 outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
895 outa = _mm_shufflelo_epi16(outa, 0xFF);
896 outa = _mm_shufflehi_epi16(outa, 0xFF);
897 t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
898 t1 = _mm_mullo_epi16(t1, outa);
899 t1 = _mm_srli_epi16(t1, 8);
900
901 outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
902 outa = _mm_shufflelo_epi16(outa, 0xFF);
903 outa = _mm_shufflehi_epi16(outa, 0xFF);
904 t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
905 t2 = _mm_mullo_epi16(t2, outa);
906 t2 = _mm_srli_epi16(t2, 8);
907
908 outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
909 outa = _mm_shufflelo_epi16(outa, 0xFF);
910 outa = _mm_shufflehi_epi16(outa, 0xFF);
911 t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
912 t3 = _mm_mullo_epi16(t3, outa);
913 t3 = _mm_srli_epi16(t3, 8);
914
915 t0 = _mm_packus_epi16(t0, t1);
916 t2 = _mm_packus_epi16(t2, t3);
917 _mm_storeu_si128((__m128i *)dst, t0);
918 _mm_storeu_si128((__m128i *)dst + 1, t2);
919
920 src = (const __m128i *)src + 2;
921 dst = (__m128i *)dst + 2;
922 }
923 }
924
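/* Destination-in: dst = (dst * src.a) >> 8 */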
void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
926 __m128i ina;
927 __m128i in0, in1, out0, out1;
928 __m128i t0, t1, t2, t3;
929 uint32_t i;
930
931 for (i = 0; i < count8; ++i) {
932 in0 = _mm_loadu_si128((const __m128i *)src);
933 in1 = _mm_loadu_si128((const __m128i *)src + 1);
934 out0 = _mm_loadu_si128((const __m128i *)dst);
935 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
936
937 ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
938 ina = _mm_shufflelo_epi16(ina, 0xFF);
939 ina = _mm_shufflehi_epi16(ina, 0xFF);
940 t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
941 t0 = _mm_mullo_epi16(t0, ina);
942 t0 = _mm_srli_epi16(t0, 8);
943
944 ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
945 ina = _mm_shufflelo_epi16(ina, 0xFF);
946 ina = _mm_shufflehi_epi16(ina, 0xFF);
947 t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
948 t1 = _mm_mullo_epi16(t1, ina);
949 t1 = _mm_srli_epi16(t1, 8);
950
951 ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
952 ina = _mm_shufflelo_epi16(ina, 0xFF);
953 ina = _mm_shufflehi_epi16(ina, 0xFF);
954 t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
955 t2 = _mm_mullo_epi16(t2, ina);
956 t2 = _mm_srli_epi16(t2, 8);
957
958 ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
959 ina = _mm_shufflelo_epi16(ina, 0xFF);
960 ina = _mm_shufflehi_epi16(ina, 0xFF);
961 t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
962 t3 = _mm_mullo_epi16(t3, ina);
963 t3 = _mm_srli_epi16(t3, 8);
964
965 t0 = _mm_packus_epi16(t0, t1);
966 t2 = _mm_packus_epi16(t2, t3);
967 _mm_storeu_si128((__m128i *)dst, t0);
968 _mm_storeu_si128((__m128i *)dst + 1, t2);
969
970 src = (const __m128i *)src + 2;
971 dst = (__m128i *)dst + 2;
972 }
973 }
974
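/* Source-out: dst = (src * (255 - dst.a)) >> 8 */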
void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
976 __m128i all1s, outa;
977 __m128i in0, in1, out0, out1;
978 __m128i t0, t1, t2, t3;
979 uint32_t i;
980
981 all1s = _mm_set1_epi16(255);
982
983 for (i = 0; i < count8; ++i) {
984 in0 = _mm_loadu_si128((const __m128i *)src);
985 in1 = _mm_loadu_si128((const __m128i *)src + 1);
986 out0 = _mm_loadu_si128((const __m128i *)dst);
987 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
988
989 outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
990 outa = _mm_shufflelo_epi16(outa, 0xFF);
991 outa = _mm_shufflehi_epi16(outa, 0xFF);
992 t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
993 t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
994 t0 = _mm_srli_epi16(t0, 8);
995
996 outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
997 outa = _mm_shufflelo_epi16(outa, 0xFF);
998 outa = _mm_shufflehi_epi16(outa, 0xFF);
999 t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1000 t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
1001 t1 = _mm_srli_epi16(t1, 8);
1002
1003 outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1004 outa = _mm_shufflelo_epi16(outa, 0xFF);
1005 outa = _mm_shufflehi_epi16(outa, 0xFF);
1006 t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1007 t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
1008 t2 = _mm_srli_epi16(t2, 8);
1009
1010 outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1011 outa = _mm_shufflelo_epi16(outa, 0xFF);
1012 outa = _mm_shufflehi_epi16(outa, 0xFF);
1013 t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1014 t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
1015 t3 = _mm_srli_epi16(t3, 8);
1016
1017 t0 = _mm_packus_epi16(t0, t1);
1018 t2 = _mm_packus_epi16(t2, t3);
1019 _mm_storeu_si128((__m128i *)dst, t0);
1020 _mm_storeu_si128((__m128i *)dst + 1, t2);
1021
1022 src = (const __m128i *)src + 2;
1023 dst = (__m128i *)dst + 2;
1024 }
1025 }
1026
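/* Destination-out: dst = (dst * (255 - src.a)) >> 8 */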
void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
1028 __m128i all1s, ina;
1029 __m128i in0, in1, out0, out1;
1030 __m128i t0, t1, t2, t3;
1031 uint32_t i;
1032
1033 all1s = _mm_set1_epi16(255);
1034
1035 for (i = 0; i < count8; ++i) {
1036 in0 = _mm_loadu_si128((const __m128i *)src);
1037 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1038 out0 = _mm_loadu_si128((const __m128i *)dst);
1039 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1040
1041 ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1042 ina = _mm_shufflelo_epi16(ina, 0xFF);
1043 ina = _mm_shufflehi_epi16(ina, 0xFF);
1044 t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1045 t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
1046 t0 = _mm_srli_epi16(t0, 8);
1047
1048 ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1049 ina = _mm_shufflelo_epi16(ina, 0xFF);
1050 ina = _mm_shufflehi_epi16(ina, 0xFF);
1051 t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1052 t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
1053 t1 = _mm_srli_epi16(t1, 8);
1054
1055 ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1056 ina = _mm_shufflelo_epi16(ina, 0xFF);
1057 ina = _mm_shufflehi_epi16(ina, 0xFF);
1058 t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1059 t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
1060 t2 = _mm_srli_epi16(t2, 8);
1061
1062 ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1063 ina = _mm_shufflelo_epi16(ina, 0xFF);
1064 ina = _mm_shufflehi_epi16(ina, 0xFF);
1065 t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1066 t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
1067 t3 = _mm_srli_epi16(t3, 8);
1068
1069 t0 = _mm_packus_epi16(t0, t1);
1070 t2 = _mm_packus_epi16(t2, t3);
1071 _mm_storeu_si128((__m128i *)dst, t0);
1072 _mm_storeu_si128((__m128i *)dst + 1, t2);
1073
1074 src = (const __m128i *)src + 2;
1075 dst = (__m128i *)dst + 2;
1076 }
1077 }
1078
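/* Source-atop: dst.rgb = (src.rgb * dst.a + dst.rgb * (255 - src.a)) >> 8,
 * with the destination alpha byte kept via the M0001 blend mask. */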
void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
1080 const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1081 __m128i all1s, ina, outa, ins, outs;
1082 __m128i in0, in1, out0, out1;
1083 __m128i t0, t1, t2, t3;
1084 uint32_t i;
1085
1086 all1s = _mm_set1_epi16(255);
1087
1088 for (i = 0; i < count8; ++i) {
1089 in0 = _mm_loadu_si128((const __m128i *)src);
1090 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1091 out0 = _mm_loadu_si128((const __m128i *)dst);
1092 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1093
1094 ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1095 ina = _mm_shufflelo_epi16(ins, 0xFF);
1096 ina = _mm_shufflehi_epi16(ina, 0xFF);
1097 outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1098 outa = _mm_shufflelo_epi16(outs, 0xFF);
1099 outa = _mm_shufflehi_epi16(outa, 0xFF);
1100 t0 = _mm_sub_epi16(all1s, ina);
1101 t0 = _mm_mullo_epi16(t0, outs);
1102 t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
1103 t0 = _mm_srli_epi16(t0, 8);
1104
1105 ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1106 ina = _mm_shufflelo_epi16(ins, 0xFF);
1107 ina = _mm_shufflehi_epi16(ina, 0xFF);
1108 outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1109 outa = _mm_shufflelo_epi16(outs, 0xFF);
1110 outa = _mm_shufflehi_epi16(outa, 0xFF);
1111 t1 = _mm_sub_epi16(all1s, ina);
1112 t1 = _mm_mullo_epi16(t1, outs);
1113 t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
1114 t1 = _mm_srli_epi16(t1, 8);
1115
1116 ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1117 ina = _mm_shufflelo_epi16(ins, 0xFF);
1118 ina = _mm_shufflehi_epi16(ina, 0xFF);
1119 outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1120 outa = _mm_shufflelo_epi16(outs, 0xFF);
1121 outa = _mm_shufflehi_epi16(outa, 0xFF);
1122 t2 = _mm_sub_epi16(all1s, ina);
1123 t2 = _mm_mullo_epi16(t2, outs);
1124 t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
1125 t2 = _mm_srli_epi16(t2, 8);
1126
1127 ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1128 ina = _mm_shufflelo_epi16(ins, 0xFF);
1129 ina = _mm_shufflehi_epi16(ina, 0xFF);
1130 outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1131 outa = _mm_shufflelo_epi16(outs, 0xFF);
1132 outa = _mm_shufflehi_epi16(outa, 0xFF);
1133 t3 = _mm_sub_epi16(all1s, ina);
1134 t3 = _mm_mullo_epi16(t3, outs);
1135 t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
1136 t3 = _mm_srli_epi16(t3, 8);
1137
1138 t0 = _mm_packus_epi16(t0, t1);
1139 t0 = blendv_epi8(t0, out0, M0001);
1140 t2 = _mm_packus_epi16(t2, t3);
1141 t2 = blendv_epi8(t2, out1, M0001);
1142 _mm_storeu_si128((__m128i *)dst, t0);
1143 _mm_storeu_si128((__m128i *)dst + 1, t2);
1144
1145 src = (const __m128i *)src + 2;
1146 dst = (__m128i *)dst + 2;
1147 }
1148 }
1149
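/* Destination-atop: dst.rgb = (dst.rgb * src.a + src.rgb * (255 - dst.a)) >> 8,
 * with the alpha byte taken from the source via the M0001 blend mask. */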
void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
1151 const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1152 __m128i all1s, ina, ins, outa, outs;
1153 __m128i in0, in1, out0, out1;
1154 __m128i t0, t1, t2, t3;
1155 uint32_t i;
1156
1157 all1s = _mm_set1_epi16(255);
1158
1159 for (i = 0; i < count8; ++i) {
1160 in0 = _mm_loadu_si128((const __m128i *)src);
1161 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1162 out0 = _mm_loadu_si128((const __m128i *)dst);
1163 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1164
1165 ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1166 ina = _mm_shufflelo_epi16(ins, 0xFF);
1167 ina = _mm_shufflehi_epi16(ina, 0xFF);
1168 outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1169 outa = _mm_shufflelo_epi16(outs, 0xFF);
1170 outa = _mm_shufflehi_epi16(outa, 0xFF);
1171 t0 = _mm_sub_epi16(all1s, outa);
1172 t0 = _mm_mullo_epi16(t0, ins);
1173 t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
1174 t0 = _mm_srli_epi16(t0, 8);
1175
1176 ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1177 ina = _mm_shufflelo_epi16(ins, 0xFF);
1178 ina = _mm_shufflehi_epi16(ina, 0xFF);
1179 outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1180 outa = _mm_shufflelo_epi16(outs, 0xFF);
1181 outa = _mm_shufflehi_epi16(outa, 0xFF);
1182 t1 = _mm_sub_epi16(all1s, outa);
1183 t1 = _mm_mullo_epi16(t1, ins);
1184 t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
1185 t1 = _mm_srli_epi16(t1, 8);
1186
1187 ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1188 ina = _mm_shufflelo_epi16(ins, 0xFF);
1189 ina = _mm_shufflehi_epi16(ina, 0xFF);
1190 outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1191 outa = _mm_shufflelo_epi16(outs, 0xFF);
1192 outa = _mm_shufflehi_epi16(outa, 0xFF);
1193 t2 = _mm_sub_epi16(all1s, outa);
1194 t2 = _mm_mullo_epi16(t2, ins);
1195 t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
1196 t2 = _mm_srli_epi16(t2, 8);
1197
1198 ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1199 ina = _mm_shufflelo_epi16(ins, 0xFF);
1200 ina = _mm_shufflehi_epi16(ina, 0xFF);
1201 outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1202 outa = _mm_shufflelo_epi16(outs, 0xFF);
1203 outa = _mm_shufflehi_epi16(outa, 0xFF);
1204 t3 = _mm_sub_epi16(all1s, outa);
1205 t3 = _mm_mullo_epi16(t3, ins);
1206 t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
1207 t3 = _mm_srli_epi16(t3, 8);
1208
1209 t0 = _mm_packus_epi16(t0, t1);
1210 t0 = blendv_epi8(t0, in0, M0001);
1211 t2 = _mm_packus_epi16(t2, t3);
1212 t2 = blendv_epi8(t2, in1, M0001);
1213 _mm_storeu_si128((__m128i *)dst, t0);
1214 _mm_storeu_si128((__m128i *)dst + 1, t2);
1215
1216 src = (const __m128i *)src + 2;
1217 dst = (__m128i *)dst + 2;
1218 }
1219 }
1220
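/* Bitwise XOR of source and destination pixels. */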
void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
1222 __m128i in0, in1, out0, out1;
1223 uint32_t i;
1224
1225 for (i = 0; i < count8; ++i) {
1226 in0 = _mm_loadu_si128((const __m128i *)src);
1227 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1228 out0 = _mm_loadu_si128((const __m128i *)dst);
1229 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1230
1231 out0 = _mm_xor_si128(out0, in0);
1232 out1 = _mm_xor_si128(out1, in1);
1233
1234 _mm_storeu_si128((__m128i *)dst, out0);
1235 _mm_storeu_si128((__m128i *)dst + 1, out1);
1236
1237 src = (const __m128i *)src + 2;
1238 dst = (__m128i *)dst + 2;
1239 }
1240 }
1241
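/* Multiply: dst = (dst * src) >> 8, per channel. */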
void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
1243 __m128i in0, in1, out0, out1;
1244 __m128i t0, t1, t2, t3;
1245 uint32_t i;
1246
1247 for (i = 0; i < count8; ++i) {
1248 in0 = _mm_loadu_si128((const __m128i *)src);
1249 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1250 out0 = _mm_loadu_si128((const __m128i *)dst);
1251 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1252
1253 t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1254 t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
1255 t0 = _mm_srli_epi16(t0, 8);
1256
1257 t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1258 t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
1259 t1 = _mm_srli_epi16(t1, 8);
1260
1261 t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1262 t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
1263 t2 = _mm_srli_epi16(t2, 8);
1264
1265 t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1266 t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
1267 t3 = _mm_srli_epi16(t3, 8);
1268
1269 t0 = _mm_packus_epi16(t0, t1);
1270 t2 = _mm_packus_epi16(t2, t3);
1271 _mm_storeu_si128((__m128i *)dst, t0);
1272 _mm_storeu_si128((__m128i *)dst + 1, t2);
1273
1274 src = (const __m128i *)src + 2;
1275 dst = (__m128i *)dst + 2;
1276 }
1277 }
1278
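/* Add: per-channel saturating add of source into destination. */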
void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
1280 __m128i in0, in1, out0, out1;
1281 uint32_t i;
1282
1283 for (i = 0; i < count8; ++i) {
1284 in0 = _mm_loadu_si128((const __m128i *)src);
1285 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1286 out0 = _mm_loadu_si128((const __m128i *)dst);
1287 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1288
1289 out0 = _mm_adds_epu8(out0, in0);
1290 out1 = _mm_adds_epu8(out1, in1);
1291
1292 _mm_storeu_si128((__m128i *)dst, out0);
1293 _mm_storeu_si128((__m128i *)dst + 1, out1);
1294
1295 src = (const __m128i *)src + 2;
1296 dst = (__m128i *)dst + 2;
1297 }
1298 }
1299
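/* Subtract: per-channel saturating subtract of source from destination. */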
void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
1301 __m128i in0, in1, out0, out1;
1302 uint32_t i;
1303
1304 for (i = 0; i < count8; ++i) {
1305 in0 = _mm_loadu_si128((const __m128i *)src);
1306 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1307 out0 = _mm_loadu_si128((const __m128i *)dst);
1308 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1309
1310 out0 = _mm_subs_epu8(out0, in0);
1311 out1 = _mm_subs_epu8(out1, in1);
1312
1313 _mm_storeu_si128((__m128i *)dst, out0);
1314 _mm_storeu_si128((__m128i *)dst + 1, out1);
1315
1316 src = (const __m128i *)src + 2;
1317 dst = (__m128i *)dst + 2;
1318 }
1319 }
1320
1321 } // namespace renderscript
1322