/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdint.h>
#include <x86intrin.h>

namespace renderscript {

/* Zero-extend the low packed 8-bit integers (in the LSBs) into packed 32-bit integers */
static inline __m128i cvtepu8_epi32(__m128i x) {
#if defined(__SSE4_1__)
    return _mm_cvtepu8_epi32(x);
#elif defined(__SSSE3__)
    const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
    x = _mm_shuffle_epi8(x, M8to32);
    return x;
#else
#   error "Require at least SSSE3"
#endif
}

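/* Pack signed 32-bit integers to unsigned 16-bit with saturation (SSE4.1 _mm_packus_epi32 fallback) */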
static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
#if defined(__SSE4_1__)
    return _mm_packus_epi32(lo, hi);
#elif defined(__SSSE3__)
    const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
    const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
    const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
    const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
    lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
    lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
    hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
    hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
    return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
                        _mm_shuffle_epi8(hi, M32to16H));
#else
#   error "Require at least SSSE3"
#endif
}

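/* Multiply packed 32-bit integers and keep the low 32 bits of each product (SSE4.1 _mm_mullo_epi32 fallback) */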
static inline __m128i mullo_epi32(__m128i x, __m128i y) {
#if defined(__SSE4_1__)
    return _mm_mullo_epi32(x, y);
#elif defined(__SSSE3__)
    const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
    __m128i even = _mm_mul_epu32(x, y);
    __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
                                _mm_srli_si128(y, 4));
    even = _mm_and_si128(even, Meven);
    odd = _mm_and_si128(odd, Meven);
    return _mm_or_si128(even, _mm_slli_si128(odd, 4));
#else
#   error "Require at least SSSE3"
#endif
}

/* 'mask' must be packed 8-bit values of 0x00 or 0xff */
static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
#if defined(__SSE4_1__)
    return _mm_blendv_epi8(x, y, mask);
#elif defined(__SSSE3__)
    return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
#else
#   error "Require at least SSSE3"
#endif
}

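/* 3x3 convolution: y0, y1 and y2 point at the three input rows, coef holds the nine
 * 16-bit coefficients, and each iteration produces two RGBA pixels (8 bytes of output). */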
extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const short *coef, uint32_t count) {
    __m128i x;
    __m128i c0, c2, c4, c6, c8;
    __m128i r0, r1, r2;
    __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
    __m128i o0, o1;
    uint32_t i;

    x = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shuffle_epi32(x, 0x00);
    c2 = _mm_shuffle_epi32(x, 0x55);
    x = _mm_loadl_epi64((const __m128i *)(coef+4));
    c4 = _mm_shuffle_epi32(x, 0x00);
    c6 = _mm_shuffle_epi32(x, 0x55);
    x = _mm_loadl_epi64((const __m128i *)(coef+8));
    c8 = _mm_shuffle_epi32(x, 0x00);

    for (i = 0; i < count; ++i) {

        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());

        o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
        o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));

        o0 = _mm_srai_epi32(o0, 8);
        o1 = _mm_srai_epi32(o1, 8);

        o0 = packus_epi32(o0, o1);
        o0 = _mm_packus_epi16(o0, o0);
        _mm_storel_epi64((__m128i *)dst, o0);

        y0 = (const char *)y0 + 8;
        y1 = (const char *)y1 + 8;
        y2 = (const char *)y2 + 8;
        dst = (char *)dst + 8;
    }
}

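/* Applies a 4x4 matrix of 16-bit fixed-point coefficients ('coef', 8 fractional bits)
 * to each RGBA pixel; four pixels (16 bytes) are processed per iteration. */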
void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);

    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_load_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
        w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
        w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srai_epi32(w2, 8);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}

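/* 3x3 variant of the color matrix kernel: only R, G and B are transformed;
 * the alpha channel is copied through from the source pixel. */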
void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);

    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);

    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srli_epi32(zw, 16);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}

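/* Dot-product variant: the same weighted sum of the four input channels is written to
 * R, G and B, and the alpha channel is copied through from the source pixel. */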
void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shufflelo_epi16(c0, 0);
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c1 = _mm_shufflelo_epi16(c1, 0);
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c2 = _mm_shufflelo_epi16(c2, 0);
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c3 = _mm_shufflelo_epi16(c3, 0);
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);

        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 =  _mm_madd_epi16(xy, c0);
        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = x2;
        z2 = x2;
        w2 = _mm_srli_epi32(zw, 16);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}

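/* Vertical blur pass: for each pair of adjacent u8x4 pixels, accumulates 'rct' rows
 * ('stride' bytes apart) weighted by the floats at 'gptr' and stores two float4 sums. */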
void rsdIntrinsicBlurVFU4_K(void *dst,
                          const void *pin, int stride, const void *gptr,
                          int rct, int x1, int x2) {
    const char *pi;
    __m128i pi0, pi1;
    __m128 pf0, pf1;
    __m128 bp0, bp1;
    __m128 x;
    int r;

    for (; x1 < x2; x1 += 2) {
        pi = (const char *)pin + (x1 << 2);
        bp0 = _mm_setzero_ps();
        bp1 = _mm_setzero_ps();

        for (r = 0; r < rct; ++r) {
            x = _mm_load_ss((const float *)gptr + r);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

            pi0 = _mm_cvtsi32_si128(*(const int *)pi);
            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));

            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));

            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));

            pi += stride;
        }

        _mm_storeu_ps((float *)dst, bp0);
        _mm_storeu_ps((float *)dst + 4, bp1);
        dst = (char *)dst + 32;
    }
}

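/* Horizontal blur pass over the float4 intermediate data: accumulates 'rct' weighted
 * float4 samples per output pixel and packs the result back to one u8x4 pixel. */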
void rsdIntrinsicBlurHFU4_K(void *dst,
                          const void *pin, const void *gptr,
                          int rct, int x1, int x2) {
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, x, y;
    __m128i o;
    int r;

    for (; x1 < x2; ++x1) {
        /* rct is defined as 2*r+1 by the caller */
        x = _mm_load_ss((const float *)gptr);
        x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + (x1 << 2);
        pf = _mm_mul_ps(x, _mm_load_ps(pi));

        for (r = 1; r < rct; r += 2) {
            x = _mm_load_ss((const float *)gptr + r);
            y = _mm_load_ss((const float *)gptr + r + 1);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
            y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));

            pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
            pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
        }

        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}

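/* Single-channel horizontal blur pass: processes four adjacent float samples per
 * iteration and packs the four rounded results into a single 32-bit store. */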
void rsdIntrinsicBlurHFU1_K(void *dst,
                          const void *pin, const void *gptr,
                          int rct, int x1, int x2) {
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, g0, g1, g2, g3, gx, p0, p1;
    __m128i o;
    int r;

    for (; x1 < x2; x1+=4) {
        g0 = _mm_load_ss((const float *)gptr);
        g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + x1;
        pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));

        for (r = 1; r < rct; r += 4) {
            gx = _mm_loadu_ps((const float *)gptr + r);
            p0 = _mm_loadu_ps(pi + r);
            p1 = _mm_loadu_ps(pi + r + 4);

            g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
            pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
            /* _mm_alignr_epi8 works on __m128i, so bridge the float vectors with
             * explicit cast intrinsics; the bit pattern is unchanged */
            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
        }

        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}

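/* YUV to RGBA conversion using the fixed-point coefficients and biases in 'param';
 * chroma is read from an interleaved two-byte-per-pair plane. Four pixels are written
 * per iteration and the loop runs 2*count times. */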
void rsdIntrinsicYuv_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        U = _mm_shuffle_epi32(UV, 0xf5);
        V = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13,  9, 5, 1,
                                          12,  8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}

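/* Same conversion as rsdIntrinsicYuv_K, but the two interleaved chroma bytes are read
 * in the opposite order (U and V swapped). */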
void rsdIntrinsicYuvR_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        V = _mm_shuffle_epi32(UV, 0xf5);
        U = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13,  9, 5, 1,
                                          12,  8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}

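/* Planar variant of the YUV conversion: U and V are read from separate planes instead
 * of an interleaved chroma plane. */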
void rsdIntrinsicYuv2_K(void *dst,
                       const unsigned char *pY, const unsigned char *pU,
                       const unsigned char *pV, uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
        V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));

        Y = _mm_sub_epi32(Y, biasY);
        U = _mm_sub_epi32(U, biasUV);
        V = _mm_sub_epi32(V, biasUV);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13,  9, 5, 1,
                                          12,  8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pU += 4;
        pV += 4;
        dst = (__m128i *)dst + 1;
    }
}

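/* 5x5 convolution: y0..y4 point at the five input rows, coef holds the 25 16-bit
 * coefficients, and each iteration produces four RGBA pixels (16 bytes of output). */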
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const void *y3, const void *y4,
                                          const short *coef, uint32_t count) {
    __m128i x;
    __m128i c0, c2, c4, c6, c8, c10, c12;
    __m128i c14, c16, c18, c20, c22, c24;
    __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
    __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
    __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
    __m128i p16, p17, p18, p19, p20, p21, p22, p23;
    __m128i p24, p25, p26, p27, p28, p29, p30, p31;
    __m128i p32, p33, p34, p35, p36, p37, p38, p39;
    __m128i o0, o1, o2, o3;
    uint32_t i;

    x = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0  = _mm_shuffle_epi32(x, 0x00);
    c2  = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+4));
    c4  = _mm_shuffle_epi32(x, 0x00);
    c6  = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+8));
    c8  = _mm_shuffle_epi32(x, 0x00);
    c10  = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+12));
    c12  = _mm_shuffle_epi32(x, 0x00);
    c14  = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+16));
    c16  = _mm_shuffle_epi32(x, 0x00);
    c18  = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+20));
    c20  = _mm_shuffle_epi32(x, 0x00);
    c22  = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+24));
    c24  = _mm_shuffle_epi32(x, 0x00);

    for (i = 0; i < count; ++i) {

        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());

        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
        p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
        p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
        p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
        p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());

        p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
        p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
        p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
        p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
        p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
        p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
        p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
        p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());

        p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
        p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
        p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
        p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
        p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
        p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
        p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
        p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());

        p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
        p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
        p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
        p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
        p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
        p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
        p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
        p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());

        o0 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1),  c0);
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3),  c2));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8),  c4));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10),  c6));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c8));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
        o0 = _mm_srai_epi32(o0, 8);

        o1 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2),  c0);
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c2));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9),  c4));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11),  c6));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13),  c8));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
        o1 = _mm_srai_epi32(o1, 8);

        o2 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3),  c0);
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5),  c2));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10),  c4));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c6));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14),  c8));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
        o2 = _mm_srai_epi32(o2, 8);

        o3 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c0);
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6),  c2));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11),  c4));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13),  c6));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15),  c8));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
        o3 = _mm_srai_epi32(o3, 8);

        o0 = packus_epi32(o0, o1);
        o2 = packus_epi32(o2, o3);
        o0 = _mm_packus_epi16(o0, o2);
        _mm_storeu_si128((__m128i *)dst, o0);

        y0 = (const char *)y0 + 16;
        y1 = (const char *)y1 + 16;
        y2 = (const char *)y2 + 16;
        y3 = (const char *)y3 + 16;
        y4 = (const char *)y4 + 16;
        dst = (char *)dst + 16;
    }
}

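/* Porter-Duff "source over": dst = src + ((dst * (255 - src.a)) >> 8). All blend kernels
 * below process eight RGBA pixels (two 16-byte vectors) per iteration; count8 is the
 * number of such 8-pixel groups. */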
void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina, ins;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srli_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, ins);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srli_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, ins);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srli_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, ins);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srli_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, ins);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

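/* Porter-Duff "destination over": dst = dst + ((src * (255 - dst.a)) >> 8) */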
void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);


        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
        t0 = _mm_srli_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, outs);

        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, outs);

        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, outs);

        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, outs);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

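/* Porter-Duff "source in": dst = (src * dst.a) >> 8 */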
void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    __m128i outa;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, outa);
        t0 = _mm_srli_epi16(t0, 8);

        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, outa);
        t1 = _mm_srli_epi16(t1, 8);

        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, outa);
        t2 = _mm_srli_epi16(t2, 8);

        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, outa);
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

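/* Porter-Duff "destination in": dst = (dst * src.a) >> 8 */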
void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    __m128i ina;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, ina);
        t0 = _mm_srli_epi16(t0, 8);

        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, ina);
        t1 = _mm_srli_epi16(t1, 8);

        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, ina);
        t2 = _mm_srli_epi16(t2, 8);

        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, ina);
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

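/* Porter-Duff "source out": dst = (src * (255 - dst.a)) >> 8 */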
void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, outa;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
        t0 = _mm_srli_epi16(t0, 8);

        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);

        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);

        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

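/* Porter-Duff "destination out": dst = (dst * (255 - src.a)) >> 8 */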
void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srli_epi16(t0, 8);

        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srli_epi16(t1, 8);

        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srli_epi16(t2, 8);

        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

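/* Porter-Duff "source atop": dst.rgb = (src.rgb * dst.a + dst.rgb * (255 - src.a)) >> 8,
 * with the destination alpha left unchanged. */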
void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, outa, ins, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, ina);
        t0 = _mm_mullo_epi16(t0, outs);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, ina);
        t1 = _mm_mullo_epi16(t1, outs);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, ina);
        t2 = _mm_mullo_epi16(t2, outs);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, ina);
        t3 = _mm_mullo_epi16(t3, outs);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, out0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, out1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

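/* Porter-Duff "destination atop": dst.rgb = (dst.rgb * src.a + src.rgb * (255 - dst.a)) >> 8,
 * with the alpha channel taken from the source. */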
void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, ins, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, outa);
        t0 = _mm_mullo_epi16(t0, ins);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, outa);
        t1 = _mm_mullo_epi16(t1, ins);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, outa);
        t2 = _mm_mullo_epi16(t2, ins);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, outa);
        t3 = _mm_mullo_epi16(t3, ins);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, in0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, in1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

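/* Bitwise XOR of source and destination pixels: dst = dst ^ src */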
void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_xor_si128(out0, in0);
        out1 = _mm_xor_si128(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

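/* Multiply blend: dst = (dst * src) >> 8 for every channel */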
void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
        t0 = _mm_srli_epi16(t0, 8);

        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
        t1 = _mm_srli_epi16(t1, 8);

        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
        t2 = _mm_srli_epi16(t2, 8);

        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

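/* Additive blend with unsigned saturation: dst = saturate(dst + src) */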
void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_adds_epu8(out0, in0);
        out1 = _mm_adds_epu8(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

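/* Subtractive blend with unsigned saturation: dst = saturate(dst - src) */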
void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_subs_epu8(out0, in0);
        out1 = _mm_subs_epu8(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

}  // namespace renderscript