xref: /aosp_15_r20/external/skia/src/opts/SkBlitRow_opts.h (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1*c8dee2aaSAndroid Build Coastguard Worker /*
2*c8dee2aaSAndroid Build Coastguard Worker  * Copyright 2015 Google Inc.
3*c8dee2aaSAndroid Build Coastguard Worker  *
4*c8dee2aaSAndroid Build Coastguard Worker  * Use of this source code is governed by a BSD-style license that can be
5*c8dee2aaSAndroid Build Coastguard Worker  * found in the LICENSE file.
6*c8dee2aaSAndroid Build Coastguard Worker  */
7*c8dee2aaSAndroid Build Coastguard Worker 
8*c8dee2aaSAndroid Build Coastguard Worker #ifndef SkBlitRow_opts_DEFINED
9*c8dee2aaSAndroid Build Coastguard Worker #define SkBlitRow_opts_DEFINED
10*c8dee2aaSAndroid Build Coastguard Worker 
11*c8dee2aaSAndroid Build Coastguard Worker #include "include/private/SkColorData.h"
12*c8dee2aaSAndroid Build Coastguard Worker #include "src/base/SkMSAN.h"
13*c8dee2aaSAndroid Build Coastguard Worker #include "src/base/SkVx.h"
14*c8dee2aaSAndroid Build Coastguard Worker 
15*c8dee2aaSAndroid Build Coastguard Worker // Helpers for blit_row_s32a_opaque(),
16*c8dee2aaSAndroid Build Coastguard Worker // then blit_row_s32a_opaque() itself,
17*c8dee2aaSAndroid Build Coastguard Worker // then unrelated blit_row_color32() at the bottom.
18*c8dee2aaSAndroid Build Coastguard Worker //
19*c8dee2aaSAndroid Build Coastguard Worker // To keep Skia resistant to timing attacks, it's important not to branch on pixel data.
20*c8dee2aaSAndroid Build Coastguard Worker // In particular, don't be tempted to [v]ptest, pmovmskb, etc. to branch on the source alpha.
21*c8dee2aaSAndroid Build Coastguard Worker 
22*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
23*c8dee2aaSAndroid Build Coastguard Worker     #include <immintrin.h>
24*c8dee2aaSAndroid Build Coastguard Worker 
// Premultiplied src-over for one 256-bit register of 32-bit pixels:
// returns src + dst * (1 - srcAlpha), computed per 8-bit channel.
//
// Abstractly srcover is
//     b = s + d*(1-srcA)
// which on unorm8 bytes is
//     b = s + (d*(255-srcA) + 127) / 255
// Here that is approximated to within a bit by
//     b = s + (d*(255-srcA) + d) / 256
//       = s + ((d*(256-srcA)) >> 8)
//
// The multiply is the bottleneck, so it is done as narrowly as possible:
// the dst channels are spread into 16-bit lanes and multiplied with 16-bit
// multiplies.  That gives twice as many products per instruction as naive
// 32-bit multiplies, and the 16-bit multiplies are also a couple of cycles
// quicker.  Two multiplies are needed, one for dst red/blue and one for dst
// green/alpha (those pairs sit conveniently 16 bits apart).  The individual
// src channels other than alpha are only needed for the final "s +" add,
// which works directly on the packed bytes.
static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
    // A negative shuffle index makes _mm256_shuffle_epi8 write a zero byte.
    const int kZero = -1;

    // Shuffle control that copies each pixel's alpha byte (offsets 3, 7, 11, 15)
    // into the low byte of both 16-bit halves of that pixel.  The pattern is
    // repeated for the upper 128 bits of the register.
    const __m256i kAlphaToHalves =
            _mm256_setr_epi8(3,kZero,3,kZero, 7,kZero,7,kZero,
                             11,kZero,11,kZero, 15,kZero,15,kZero,
                             3,kZero,3,kZero, 7,kZero,7,kZero,
                             11,kZero,11,kZero, 15,kZero,15,kZero);

    // Selects the low byte of every 16-bit lane.
    const __m256i kLowBytes = _mm256_set1_epi32(0x00ff00ff);

    // 256 - srcA in every 16-bit lane.
    const __m256i srcAlpha16 = _mm256_shuffle_epi8(src, kAlphaToHalves);
    const __m256i invAlpha16 = _mm256_sub_epi16(_mm256_set1_epi16(256), srcAlpha16);

    // Red and blue: isolate in the low bytes, scale, and bring the result back
    // down to the low byte of each 16-bit lane.
    const __m256i dstRB    = _mm256_and_si256(kLowBytes, dst);
    const __m256i scaledRB = _mm256_srli_epi16(_mm256_mullo_epi16(dstRB, invAlpha16), 8);

    // Green and alpha: shift down to the low bytes, scale, and keep only the
    // high byte of each product, which is already where the channel belongs.
    const __m256i dstGA    = _mm256_srli_epi16(dst, 8);
    const __m256i scaledGA = _mm256_andnot_si256(kLowBytes,
                                                 _mm256_mullo_epi16(dstGA, invAlpha16));

    // Recombine the scaled dst channels and add src with a saturating byte add.
    return _mm256_adds_epu8(src, _mm256_or_si256(scaledRB, scaledGA));
}
69*c8dee2aaSAndroid Build Coastguard Worker #endif
70*c8dee2aaSAndroid Build Coastguard Worker 
71*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
72*c8dee2aaSAndroid Build Coastguard Worker     #include <immintrin.h>
73*c8dee2aaSAndroid Build Coastguard Worker 
// Premultiplied src-over for one 128-bit register of 32-bit pixels, using the
// approximation  b = s + ((d * (256 - srcA)) >> 8)  on every 8-bit channel.
static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
    // Selects the low byte of every 16-bit lane.
    const __m128i kLowBytes = _mm_set1_epi32(0x00ff00ff);

    // 256 - srcA per pixel, then duplicated into both 16-bit halves of the pixel.
    const __m128i srcAlpha   = _mm_srli_epi32(src, 24);
    const __m128i invAlpha   = _mm_sub_epi32(_mm_set1_epi32(256), srcAlpha);
    const __m128i invAlpha16 = _mm_or_si128(_mm_slli_epi32(invAlpha, 16), invAlpha);

    // Channels held in the low byte of each 16-bit lane: scale, then shift the
    // product's high byte back down.
    const __m128i dstRB    = _mm_and_si128(kLowBytes, dst);
    const __m128i scaledRB = _mm_srli_epi16(_mm_mullo_epi16(dstRB, invAlpha16), 8);

    // Channels held in the high byte of each 16-bit lane: shift down, scale,
    // and keep only the product's high byte in place.
    const __m128i dstGA    = _mm_srli_epi16(dst, 8);
    const __m128i scaledGA = _mm_andnot_si128(kLowBytes, _mm_mullo_epi16(dstGA, invAlpha16));

    // Saturating byte add of src onto the scaled dst.
    return _mm_adds_epu8(src, _mm_or_si128(scaledRB, scaledGA));
}
89*c8dee2aaSAndroid Build Coastguard Worker #endif
90*c8dee2aaSAndroid Build Coastguard Worker 
91*c8dee2aaSAndroid Build Coastguard Worker #if defined(SK_ARM_HAS_NEON)
92*c8dee2aaSAndroid Build Coastguard Worker     #include <arm_neon.h>
93*c8dee2aaSAndroid Build Coastguard Worker 
94*c8dee2aaSAndroid Build Coastguard Worker     // SkMulDiv255Round() applied to each lane.
// Lane-wise equivalent of SkMulDiv255Round(): for each of the eight byte lanes,
// computes (p + ((p + 128) >> 8) + 128) >> 8 where p = x * y as a 16-bit value.
static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
    // Widening multiply: 8-bit x 8-bit -> 16-bit products.
    const uint16x8_t product = vmull_u8(x, y);
    // Rounding shift right by 8.
    const uint16x8_t productShifted = vrshrq_n_u16(product, 8);
    // Rounding add, keeping only the high byte of each 16-bit sum.
    return vraddhn_u16(product, productShifted);
}
99*c8dee2aaSAndroid Build Coastguard Worker 
// Premultiplied src-over on channel-planar data: val[0..3] each hold one
// channel for a group of pixels, with the alpha channel in val[3].
// Every channel becomes  src + SkMulDiv255Round(255 - srcA, dst),  saturating.
static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
    // Bitwise NOT of an 8-bit alpha is 255 - alpha.
    const uint8x8_t invAlpha = vmvn_u8(src.val[3]);

    uint8x8x4_t blended;
    for (int channel = 0; channel < 4; ++channel) {
        const uint8x8_t scaledDst = SkMulDiv255Round_neon8(invAlpha, dst.val[channel]);
        blended.val[channel] = vqadd_u8(src.val[channel], scaledDst);
    }
    return blended;
}
109*c8dee2aaSAndroid Build Coastguard Worker 
110*c8dee2aaSAndroid Build Coastguard Worker     // Variant assuming dst and src contain the color components of two consecutive pixels.
// Premultiplied src-over where dst and src each hold the interleaved color
// components of two consecutive 32-bit pixels (bytes 0-3 and bytes 4-7).
static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
    // Table indices that broadcast byte 3 (first pixel's alpha) across lanes 0-3
    // and byte 7 (second pixel's alpha) across lanes 4-7.
    const uint8x8_t kAlphaLanes = vcreate_u8(0x0707070703030303);

    const uint8x8_t srcAlpha  = vtbl1_u8(src, kAlphaLanes);
    const uint8x8_t invAlpha  = vmvn_u8(srcAlpha);  // 255 - alpha
    const uint8x8_t scaledDst = SkMulDiv255Round_neon8(invAlpha, dst);

    // Saturating add of src onto the scaled dst.
    return vqadd_u8(src, scaledDst);
}
116*c8dee2aaSAndroid Build Coastguard Worker 
117*c8dee2aaSAndroid Build Coastguard Worker #endif
118*c8dee2aaSAndroid Build Coastguard Worker 
119*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
120*c8dee2aaSAndroid Build Coastguard Worker     #include <lasxintrin.h>
121*c8dee2aaSAndroid Build Coastguard Worker 
// Premultiplied src-over for one 256-bit LASX register of 32-bit pixels, using
// the approximation  b = s + ((d * (256 - srcA)) >> 8)  on every 8-bit channel.
static inline __m256i SkPMSrcOver_LASX(const __m256i& src, const __m256i& dst) {
    // Selects the low byte of every 16-bit lane.
    const __m256i kLowBytes = __lasx_xvreplgr2vr_w(0x00ff00ff);

    // 256 - srcA per pixel, then duplicated into both 16-bit halves of the pixel.
    const __m256i srcAlpha   = __lasx_xvsrli_w(src, 24);
    const __m256i invAlpha   = __lasx_xvsub_w(__lasx_xvreplgr2vr_w(256), srcAlpha);
    const __m256i invAlpha16 = __lasx_xvor_v(__lasx_xvslli_w(invAlpha, 16), invAlpha);

    // Channels in the low byte of each 16-bit lane: scale, then shift the
    // product's high byte back down.
    const __m256i dstRB    = __lasx_xvand_v(kLowBytes, dst);
    const __m256i scaledRB = __lasx_xvsrli_h(__lasx_xvmul_h(dstRB, invAlpha16), 8);

    // Channels in the high byte of each 16-bit lane: shift down, scale, and
    // clear the low byte of each product (xvandn computes ~first & second).
    const __m256i dstGA    = __lasx_xvsrli_h(dst, 8);
    const __m256i scaledGA = __lasx_xvandn_v(kLowBytes, __lasx_xvmul_h(dstGA, invAlpha16));

    // Saturating unsigned byte add of src onto the scaled dst.
    return __lasx_xvsadd_bu(src, __lasx_xvor_v(scaledRB, scaledGA));
}
138*c8dee2aaSAndroid Build Coastguard Worker #endif
139*c8dee2aaSAndroid Build Coastguard Worker 
140*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
141*c8dee2aaSAndroid Build Coastguard Worker     #include <lsxintrin.h>
142*c8dee2aaSAndroid Build Coastguard Worker 
// Premultiplied src-over for one 128-bit LSX register of 32-bit pixels, using
// the approximation  b = s + ((d * (256 - srcA)) >> 8)  on every 8-bit channel.
static inline __m128i SkPMSrcOver_LSX(const __m128i& src, const __m128i& dst) {
    // Selects the low byte of every 16-bit lane.
    const __m128i kLowBytes = __lsx_vreplgr2vr_w(0x00ff00ff);

    // 256 - srcA per pixel, then duplicated into both 16-bit halves of the pixel.
    const __m128i srcAlpha   = __lsx_vsrli_w(src, 24);
    const __m128i invAlpha   = __lsx_vsub_w(__lsx_vreplgr2vr_w(256), srcAlpha);
    const __m128i invAlpha16 = __lsx_vor_v(__lsx_vslli_w(invAlpha, 16), invAlpha);

    // Channels in the low byte of each 16-bit lane: scale, then shift the
    // product's high byte back down.
    const __m128i dstRB    = __lsx_vand_v(kLowBytes, dst);
    const __m128i scaledRB = __lsx_vsrli_h(__lsx_vmul_h(dstRB, invAlpha16), 8);

    // Channels in the high byte of each 16-bit lane: shift down, scale, and
    // clear the low byte of each product (vandn computes ~first & second).
    const __m128i dstGA    = __lsx_vsrli_h(dst, 8);
    const __m128i scaledGA = __lsx_vandn_v(kLowBytes, __lsx_vmul_h(dstGA, invAlpha16));

    // Saturating unsigned byte add of src onto the scaled dst.
    return __lsx_vsadd_bu(src, __lsx_vor_v(scaledRB, scaledGA));
}
159*c8dee2aaSAndroid Build Coastguard Worker #endif
160*c8dee2aaSAndroid Build Coastguard Worker 
161*c8dee2aaSAndroid Build Coastguard Worker namespace SK_OPTS_NS {
162*c8dee2aaSAndroid Build Coastguard Worker 
163*c8dee2aaSAndroid Build Coastguard Worker /*not static*/
blit_row_s32a_opaque(SkPMColor * dst,const SkPMColor * src,int len,U8CPU alpha)164*c8dee2aaSAndroid Build Coastguard Worker inline void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
165*c8dee2aaSAndroid Build Coastguard Worker     SkASSERT(alpha == 0xFF);
166*c8dee2aaSAndroid Build Coastguard Worker     sk_msan_assert_initialized(src, src+len);
167*c8dee2aaSAndroid Build Coastguard Worker 
168*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
169*c8dee2aaSAndroid Build Coastguard Worker     while (len >= 8) {
170*c8dee2aaSAndroid Build Coastguard Worker         _mm256_storeu_si256((__m256i*)dst,
171*c8dee2aaSAndroid Build Coastguard Worker                             SkPMSrcOver_AVX2(_mm256_loadu_si256((const __m256i*)src),
172*c8dee2aaSAndroid Build Coastguard Worker                                              _mm256_loadu_si256((const __m256i*)dst)));
173*c8dee2aaSAndroid Build Coastguard Worker         src += 8;
174*c8dee2aaSAndroid Build Coastguard Worker         dst += 8;
175*c8dee2aaSAndroid Build Coastguard Worker         len -= 8;
176*c8dee2aaSAndroid Build Coastguard Worker     }
177*c8dee2aaSAndroid Build Coastguard Worker #endif
178*c8dee2aaSAndroid Build Coastguard Worker 
179*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
180*c8dee2aaSAndroid Build Coastguard Worker     while (len >= 4) {
181*c8dee2aaSAndroid Build Coastguard Worker         _mm_storeu_si128((__m128i*)dst, SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src),
182*c8dee2aaSAndroid Build Coastguard Worker                                                          _mm_loadu_si128((const __m128i*)dst)));
183*c8dee2aaSAndroid Build Coastguard Worker         src += 4;
184*c8dee2aaSAndroid Build Coastguard Worker         dst += 4;
185*c8dee2aaSAndroid Build Coastguard Worker         len -= 4;
186*c8dee2aaSAndroid Build Coastguard Worker     }
187*c8dee2aaSAndroid Build Coastguard Worker #endif
188*c8dee2aaSAndroid Build Coastguard Worker 
189*c8dee2aaSAndroid Build Coastguard Worker #if defined(SK_ARM_HAS_NEON)
190*c8dee2aaSAndroid Build Coastguard Worker     while (len >= 8) {
191*c8dee2aaSAndroid Build Coastguard Worker         vst4_u8((uint8_t*)dst, SkPMSrcOver_neon8(vld4_u8((const uint8_t*)dst),
192*c8dee2aaSAndroid Build Coastguard Worker                                                  vld4_u8((const uint8_t*)src)));
193*c8dee2aaSAndroid Build Coastguard Worker         src += 8;
194*c8dee2aaSAndroid Build Coastguard Worker         dst += 8;
195*c8dee2aaSAndroid Build Coastguard Worker         len -= 8;
196*c8dee2aaSAndroid Build Coastguard Worker     }
197*c8dee2aaSAndroid Build Coastguard Worker 
198*c8dee2aaSAndroid Build Coastguard Worker     while (len >= 2) {
199*c8dee2aaSAndroid Build Coastguard Worker         vst1_u8((uint8_t*)dst, SkPMSrcOver_neon2(vld1_u8((const uint8_t*)dst),
200*c8dee2aaSAndroid Build Coastguard Worker                                                  vld1_u8((const uint8_t*)src)));
201*c8dee2aaSAndroid Build Coastguard Worker         src += 2;
202*c8dee2aaSAndroid Build Coastguard Worker         dst += 2;
203*c8dee2aaSAndroid Build Coastguard Worker         len -= 2;
204*c8dee2aaSAndroid Build Coastguard Worker     }
205*c8dee2aaSAndroid Build Coastguard Worker 
206*c8dee2aaSAndroid Build Coastguard Worker     if (len != 0) {
207*c8dee2aaSAndroid Build Coastguard Worker         uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8((uint64_t)*dst),
208*c8dee2aaSAndroid Build Coastguard Worker                                              vcreate_u8((uint64_t)*src));
209*c8dee2aaSAndroid Build Coastguard Worker         vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
210*c8dee2aaSAndroid Build Coastguard Worker     }
211*c8dee2aaSAndroid Build Coastguard Worker     return;
212*c8dee2aaSAndroid Build Coastguard Worker #endif
213*c8dee2aaSAndroid Build Coastguard Worker 
214*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
215*c8dee2aaSAndroid Build Coastguard Worker     while (len >= 8) {
216*c8dee2aaSAndroid Build Coastguard Worker         __lasx_xvst(SkPMSrcOver_LASX(__lasx_xvld(src, 0),
217*c8dee2aaSAndroid Build Coastguard Worker                                      __lasx_xvld(dst, 0)), (__m256i*)dst, 0);
218*c8dee2aaSAndroid Build Coastguard Worker         src += 8;
219*c8dee2aaSAndroid Build Coastguard Worker         dst += 8;
220*c8dee2aaSAndroid Build Coastguard Worker         len -= 8;
221*c8dee2aaSAndroid Build Coastguard Worker     }
222*c8dee2aaSAndroid Build Coastguard Worker #endif
223*c8dee2aaSAndroid Build Coastguard Worker 
224*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
225*c8dee2aaSAndroid Build Coastguard Worker     while (len >= 4) {
226*c8dee2aaSAndroid Build Coastguard Worker         __lsx_vst(SkPMSrcOver_LSX(__lsx_vld(src, 0),
227*c8dee2aaSAndroid Build Coastguard Worker                                   __lsx_vld(dst, 0)), (__m128i*)dst, 0);
228*c8dee2aaSAndroid Build Coastguard Worker         src += 4;
229*c8dee2aaSAndroid Build Coastguard Worker         dst += 4;
230*c8dee2aaSAndroid Build Coastguard Worker         len -= 4;
231*c8dee2aaSAndroid Build Coastguard Worker     }
232*c8dee2aaSAndroid Build Coastguard Worker #endif
233*c8dee2aaSAndroid Build Coastguard Worker 
234*c8dee2aaSAndroid Build Coastguard Worker     while (len --> 0) {
235*c8dee2aaSAndroid Build Coastguard Worker         *dst = SkPMSrcOver(*src, *dst);
236*c8dee2aaSAndroid Build Coastguard Worker         src++;
237*c8dee2aaSAndroid Build Coastguard Worker         dst++;
238*c8dee2aaSAndroid Build Coastguard Worker     }
239*c8dee2aaSAndroid Build Coastguard Worker }
240*c8dee2aaSAndroid Build Coastguard Worker 
241*c8dee2aaSAndroid Build Coastguard Worker // Blend constant color over count dst pixels
242*c8dee2aaSAndroid Build Coastguard Worker /*not static*/
blit_row_color32(SkPMColor * dst,int count,SkPMColor color)243*c8dee2aaSAndroid Build Coastguard Worker inline void blit_row_color32(SkPMColor* dst, int count, SkPMColor color) {
244*c8dee2aaSAndroid Build Coastguard Worker     constexpr int N = 4;  // 8, 16 also reasonable choices
245*c8dee2aaSAndroid Build Coastguard Worker     using U32 = skvx::Vec<  N, uint32_t>;
246*c8dee2aaSAndroid Build Coastguard Worker     using U16 = skvx::Vec<4*N, uint16_t>;
247*c8dee2aaSAndroid Build Coastguard Worker     using U8  = skvx::Vec<4*N, uint8_t>;
248*c8dee2aaSAndroid Build Coastguard Worker 
249*c8dee2aaSAndroid Build Coastguard Worker     auto kernel = [color](U32 src) {
250*c8dee2aaSAndroid Build Coastguard Worker         unsigned invA = 255 - SkGetPackedA32(color);
251*c8dee2aaSAndroid Build Coastguard Worker         invA += invA >> 7;
252*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(0 < invA && invA < 256);  // We handle alpha == 0 or alpha == 255 specially.
253*c8dee2aaSAndroid Build Coastguard Worker 
254*c8dee2aaSAndroid Build Coastguard Worker         // (src * invA + (color << 8) + 128) >> 8
255*c8dee2aaSAndroid Build Coastguard Worker         // Should all fit in 16 bits.
256*c8dee2aaSAndroid Build Coastguard Worker         U8 s = sk_bit_cast<U8>(src),
257*c8dee2aaSAndroid Build Coastguard Worker            a = U8(invA);
258*c8dee2aaSAndroid Build Coastguard Worker         U16 c = skvx::cast<uint16_t>(sk_bit_cast<U8>(U32(color))),
259*c8dee2aaSAndroid Build Coastguard Worker             d = (mull(s,a) + (c << 8) + 128)>>8;
260*c8dee2aaSAndroid Build Coastguard Worker         return sk_bit_cast<U32>(skvx::cast<uint8_t>(d));
261*c8dee2aaSAndroid Build Coastguard Worker     };
262*c8dee2aaSAndroid Build Coastguard Worker 
263*c8dee2aaSAndroid Build Coastguard Worker     while (count >= N) {
264*c8dee2aaSAndroid Build Coastguard Worker         kernel(U32::Load(dst)).store(dst);
265*c8dee2aaSAndroid Build Coastguard Worker         dst   += N;
266*c8dee2aaSAndroid Build Coastguard Worker         count -= N;
267*c8dee2aaSAndroid Build Coastguard Worker     }
268*c8dee2aaSAndroid Build Coastguard Worker     while (count --> 0) {
269*c8dee2aaSAndroid Build Coastguard Worker         *dst = kernel(U32{*dst})[0];
270*c8dee2aaSAndroid Build Coastguard Worker         dst++;
271*c8dee2aaSAndroid Build Coastguard Worker     }
272*c8dee2aaSAndroid Build Coastguard Worker }
273*c8dee2aaSAndroid Build Coastguard Worker 
274*c8dee2aaSAndroid Build Coastguard Worker }  // namespace SK_OPTS_NS
275*c8dee2aaSAndroid Build Coastguard Worker 
276*c8dee2aaSAndroid Build Coastguard Worker #endif//SkBlitRow_opts_DEFINED
277