xref: /aosp_15_r20/external/skia/src/opts/SkBitmapProcState_opts.h (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7 
8 #ifndef SkBitmapProcState_opts_DEFINED
9 #define SkBitmapProcState_opts_DEFINED
10 
11 #include "src/base/SkMSAN.h"
12 #include "src/base/SkVx.h"
13 #include "src/core/SkBitmapProcState.h"
14 
// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
//
// Only S32_alpha_D32_filter_DX exploits instructions beyond
// our common baseline SSE2/NEON instruction sets, so that's
// all that lives here.
//
// The rest are scattershot at the moment but I want to get them
// all migrated to be normal code inside SkBitmapProcState.cpp.
23 
24 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
25     #include <immintrin.h>
26 #elif defined(SK_ARM_HAS_NEON)
27     #include <arm_neon.h>
28 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
29     #include <lasxintrin.h>
30 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
31     #include <lsxintrin.h>
32 #endif
33 
34 namespace SK_OPTS_NS {
35 
// Splits one packed bilerp entry into two integer coordinates and a lerp weight.
// This same basic packing scheme is used throughout the file:
//
//    bits 18 and up : v0, the first  integer coordinate (x0 or y0)
//    bits 14..17    : w,  the 4-bit lerp weight for v1 (the weight for v0 is 16-w)
//    bits  0..13    : v1, the second integer coordinate (x1 or y1)
//
// Results are written through v0, v1 and w, which are dereferenced unconditionally.
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    *v0 = (packed >> 18);       // Integer coordinate x0 or y0.
    *v1 = (packed & 0x3fff);    // Integer coordinate x1 or y1.
    *w  = (packed >> 14) & 0xf; // Lerp weight for v1; weight for v0 is 16-w.
}
43 
44 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
45 
46     /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)47     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
48                                  const uint32_t* xy, int count, uint32_t* colors) {
49         SkASSERT(count > 0 && colors != nullptr);
50         SkASSERT(s.fBilerp);
51         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
52         SkASSERT(s.fAlphaScale <= 256);
53 
54         // interpolate_in_x() is the crux of the SSSE3 implementation,
55         // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
56         auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
57                                    uint32_t B0, uint32_t B1,
58                                    __m128i interlaced_x_weights) {
59             // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
60             //
61             // It takes two arguments interlaced byte-wise:
62             //    - first  arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
63             //    - second arg: [ w,W, ... 7 more pairs of   signed 8-bit values ...]
64             // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].
65             //
66             // That's why we go to all this trouble to make interlaced_x_weights,
67             // and here we're about to interlace A0 with A1 and B0 with B1 to match.
68             //
69             // Our interlaced_x_weights are all in [0,16], and so we need not worry about
70             // the signedness of that input nor about the signedness of the output.
71 
72             __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
73                     interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));
74 
75             return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
76                                      interlaced_x_weights);
77         };
78 
79         // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
80         // Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
81         auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
82                                           uint32_t A2, uint32_t A3,
83                                           uint32_t B0, uint32_t B1,
84                                           uint32_t B2, uint32_t B3,
85                                           __m128i interlaced_x_weights,
86                                           int wy) {
87             // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
88             __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
89                     bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
90 
91             // Interpolate in Y.  As in the SSE2 code, we calculate top*(16-wy) + bot*wy
92             // as 16*top + (bot-top)*wy to save a multiply.
93             __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
94                                        _mm_mullo_epi16(_mm_sub_epi16(bot, top),
95                                                        _mm_set1_epi16(wy)));
96 
97             // Scale down by total max weight 16x16 = 256.
98             px = _mm_srli_epi16(px, 8);
99 
100             // Scale by alpha if needed.
101             if (s.fAlphaScale < 256) {
102                 px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
103             }
104             return px;
105         };
106 
107         // We're in _DX mode here, so we're only varying in X.
108         // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
109         // All the other entries in xy will be pairs of X coordinates and the X weight.
110         int y0, y1, wy;
111         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
112 
113         auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
114              row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
115 
116         while (count >= 4) {
117             // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
118             int x0[4],
119                 x1[4];
120             __m128i wx;
121 
122             // decode_packed_coordinates_and_weight(), 4x.
123             __m128i packed = _mm_loadu_si128((const __m128i*)xy);
124             _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
125             _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
126             wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));  // [0,15]
127 
128             // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
129             // and sixteen minus that as wl for pixels on the left at x0.
130             __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
131                     wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
132 
133             // We need to interlace wl and wr for _mm_maddubs_epi16().
134             __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
135                     interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);
136 
137             enum { A,B,C,D };
138 
139             // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
140             // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
141             __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
142                                                 row1[x0[A]], row1[x1[A]],
143                                                 row0[x0[B]], row0[x1[B]],
144                                                 row1[x0[B]], row1[x1[B]],
145                                                 interlaced_x_weights_AB, wy);
146 
147             // Once more with the other half of the x-weights for two more pixels C,D.
148             __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
149                                                 row1[x0[C]], row1[x1[C]],
150                                                 row0[x0[D]], row0[x1[D]],
151                                                 row1[x0[D]], row1[x1[D]],
152                                                 interlaced_x_weights_CD, wy);
153 
154             // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
155             _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
156             xy     += 4;
157             colors += 4;
158             count  -= 4;
159         }
160 
161         while (count --> 0) {
162             // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
163             int x0, x1, wx;
164             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
165 
166             // As above, splat out wx four times as wr, and sixteen minus that as wl.
167             __m128i wr = _mm_set1_epi8(wx),     // This splats it out 16 times, but that's fine.
168                     wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
169 
170             __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);
171 
172             __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
173                                                row1[x0], row1[x1],
174                                                       0,        0,
175                                                       0,        0,
176                                                interlaced_x_weights, wy);
177 
178             *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
179         }
180     }
181 
182 
183 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
184 
185     /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)186     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
187                                  const uint32_t* xy, int count, uint32_t* colors) {
188         SkASSERT(count > 0 && colors != nullptr);
189         SkASSERT(s.fBilerp);
190         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
191         SkASSERT(s.fAlphaScale <= 256);
192 
193         int y0, y1, wy;
194         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
195 
196         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
197              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
198 
199         // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
200         // and another in the upper 4 16-bit lanes to line up with 16 - wy.
201         const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
202                                                 _mm_set1_epi16(16-wy));  // Top pixel goes here.
203 
204         while (count --> 0) {
205             int x0, x1, wx;
206             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
207 
208             // Load the 4 pixels we're interpolating, in this grid:
209             //    | tl  tr |
210             //    | bl  br |
211             const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
212                           bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);
213 
214             // We want to calculate a sum of 4 pixels weighted in two directions:
215             //
216             //  sum = tl * (16-wy) * (16-wx)
217             //      + bl * (   wy) * (16-wx)
218             //      + tr * (16-wy) * (   wx)
219             //      + br * (   wy) * (   wx)
220             //
221             // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
222             //
223             // We've already prepared allY as a vector containing [wy, 16-wy] as a way
224             // to apply those y-direction weights.  So we'll start on the x-direction
225             // first, grouping into left and right halves, lined up with allY:
226             //
227             //     L = [bl, tl]
228             //     R = [br, tr]
229             //
230             //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
231             //
232             // Rewriting that one more step, we can replace a multiply with a shift:
233             //
234             //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
235             //
236             // That's how we'll actually do this math.
237 
238             __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
239                     R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());
240 
241             __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
242                                           _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));
243 
244             __m128i sum_in_x = _mm_mullo_epi16(inner, allY);
245 
246             // sum = horizontalSum( ... )
247             __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
248 
249             // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
250             sum = _mm_srli_epi16(sum, 8);
251 
252             if (s.fAlphaScale < 256) {
253                 // Scale by alpha, which is in [0,256].
254                 sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
255                 sum = _mm_srli_epi16(sum, 8);
256             }
257 
258             // Pack back into 8-bit values and store.
259             *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
260         }
261     }
262 
263 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
264     /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)265     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
266                                  const uint32_t* xy, int count, uint32_t* colors) {
267         SkASSERT(count > 0 && colors != nullptr);
268         SkASSERT(s.fBilerp);
269         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
270         SkASSERT(s.fAlphaScale <= 256);
271 
272         int y0, y1, wy;
273         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
274 
275         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
276              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
277 
278         // We'll put one pixel in the low 16 16-bit lanes to line up with wy,
279         // and another in the upper 16 16-bit lanes to line up with 16 - wy.
280         __m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));
281 
282         while (count --> 0) {
283             int x0, x1, wx;
284             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
285 
286             // Load the 4 pixels we're interpolating, in this grid:
287             //    | tl  tr |
288             //    | bl  br |
289 
290             const __m256i zeros = __lasx_xvldi(0);
291             const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0),
292                           tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0),
293                           bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0),
294                           br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0);
295 
296             // We want to calculate a sum of 8 pixels weighted in two directions:
297             //
298             //  sum = tl * (16-wy) * (16-wx)
299             //      + bl * (   wy) * (16-wx)
300             //      + tr * (16-wy) * (   wx)
301             //      + br * (   wy) * (   wx)
302             //
303             // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
304             //
305             // We've already prepared allY as a vector containing [wy, 16-wy] as a way
306             // to apply those y-direction weights.  So we'll start on the x-direction
307             // first, grouping into left and right halves, lined up with allY:
308             //
309             //     L = [bl, tl]
310             //     R = [br, tr]
311             //
312             //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
313             //
314             // Rewriting that one more step, we can replace a multiply with a shift:
315             //
316             //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
317             //
318             // That's how we'll actually do this math.
319 
320             __m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
321                     R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));
322 
323             __m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4),
324                                            __lasx_xvmul_h(__lasx_xvsub_h(R,L),
325                                                           __lasx_xvreplgr2vr_h(wx)));
326 
327             __m256i sum_in_x = __lasx_xvmul_h(inner, allY);
328 
329             // sum = horizontalSum( ... )
330             __m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));
331 
332             // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
333             sum = __lasx_xvsrli_h(sum, 8);
334 
335             if (s.fAlphaScale < 256) {
336                 // Scale by alpha, which is in [0,256].
337                 sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
338                 sum = __lasx_xvsrli_h(sum, 8);
339             }
340 
341             // Pack back into 8-bit values and store.
342             *colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
343                                                                __lasx_xvsat_hu(sum, 8)), 0);
344         }
345     }
346 
347 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
348 
349     /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)350     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
351                                  const uint32_t* xy, int count, uint32_t* colors) {
352         SkASSERT(count > 0 && colors != nullptr);
353         SkASSERT(s.fBilerp);
354         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
355         SkASSERT(s.fAlphaScale <= 256);
356 
357         int y0, y1, wy;
358         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
359 
360         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
361              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
362 
363         // We'll put one pixel in the low 8 16-bit lanes to line up with wy,
364         // and another in the upper 8 16-bit lanes to line up with 16 - wy.
365         __m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));
366 
367         while (count --> 0) {
368             int x0, x1, wx;
369             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
370 
371             // Load the 4 pixels we're interpolating, in this grid:
372             //    | tl  tr |
373             //    | bl  br |
374             const __m128i zeros = __lsx_vldi(0);
375             const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0),
376                           tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0),
377                           bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0),
378                           br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0);
379 
380             // We want to calculate a sum of 8 pixels weighted in two directions:
381             //
382             //  sum = tl * (16-wy) * (16-wx)
383             //      + bl * (   wy) * (16-wx)
384             //      + tr * (16-wy) * (   wx)
385             //      + br * (   wy) * (   wx)
386             //
387             // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
388             //
389             // We've already prepared allY as a vector containing [wy, 16-wy] as a way
390             // to apply those y-direction weights.  So we'll start on the x-direction
391             // first, grouping into left and right halves, lined up with allY:
392             //
393             //     L = [bl, tl]
394             //     R = [br, tr]
395             //
396             //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
397             //
398             // Rewriting that one more step, we can replace a multiply with a shift:
399             //
400             //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
401             //
402             // That's how we'll actually do this math.
403 
404 
405             __m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
406                     R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));
407 
408             __m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4),
409                                          __lsx_vmul_h(__lsx_vsub_h(R,L),
410                                                       __lsx_vreplgr2vr_h(wx)));
411 
412             __m128i sum_in_x = __lsx_vmul_h(inner, allY);
413 
414             // sum = horizontalSum( ... )
415             __m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));
416 
417             // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
418             sum = __lsx_vsrli_h(sum, 8);
419 
420             if (s.fAlphaScale < 256) {
421                 // Scale by alpha, which is in [0,256].
422                 sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
423                 sum = __lsx_vsrli_h(sum, 8);
424             }
425 
426             // Pack back into 8-bit values and store.
427             *colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
428                                                            __lsx_vsat_hu(sum, 8)), 0);
429         }
430     }
431 
432 #else
433 
// The NEON code only actually differs from the portable code in the
// filtering step after we've loaded all four pixels we want to bilerp.
436 
437     #if defined(SK_ARM_HAS_NEON)
filter_and_scale_by_alpha(unsigned x,unsigned y,SkPMColor a00,SkPMColor a01,SkPMColor a10,SkPMColor a11,SkPMColor * dst,uint16_t scale)438         static void filter_and_scale_by_alpha(unsigned x, unsigned y,
439                                               SkPMColor a00, SkPMColor a01,
440                                               SkPMColor a10, SkPMColor a11,
441                                               SkPMColor *dst,
442                                               uint16_t scale) {
443             uint8x8_t vy, vconst16_8, v16_y, vres;
444             uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
445             uint32x2_t va0, va1;
446             uint16x8_t tmp1, tmp2;
447 
448             vy = vdup_n_u8(y);                // duplicate y into vy
449             vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
450             v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y
451 
452             va0 = vdup_n_u32(a00);            // duplicate a00
453             va1 = vdup_n_u32(a10);            // duplicate a10
454             va0 = vset_lane_u32(a01, va0, 1); // set top to a01
455             va1 = vset_lane_u32(a11, va1, 1); // set top to a11
456 
457             tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
458             tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y
459 
460             vx = vdup_n_u16(x);                // duplicate x into vx
461             vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
462             v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
463 
464             tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
465             tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
466             tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
467             tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
468 
469             if (scale < 256) {
470                 vscale = vdup_n_u16(scale);        // duplicate scale
471                 tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
472                 tmp = vmul_u16(tmp, vscale);       // multiply result by scale
473             }
474 
475             vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
476             vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
477         }
478     #else
filter_and_scale_by_alpha(unsigned x,unsigned y,SkPMColor a00,SkPMColor a01,SkPMColor a10,SkPMColor a11,SkPMColor * dstColor,unsigned alphaScale)479         static void filter_and_scale_by_alpha(unsigned x, unsigned y,
480                                               SkPMColor a00, SkPMColor a01,
481                                               SkPMColor a10, SkPMColor a11,
482                                               SkPMColor* dstColor,
483                                               unsigned alphaScale) {
484             SkASSERT((unsigned)x <= 0xF);
485             SkASSERT((unsigned)y <= 0xF);
486             SkASSERT(alphaScale <= 256);
487 
488             int xy = x * y;
489             const uint32_t mask = 0xFF00FF;
490 
491             int scale = 256 - 16*y - 16*x + xy;
492             uint32_t lo = (a00 & mask) * scale;
493             uint32_t hi = ((a00 >> 8) & mask) * scale;
494 
495             scale = 16*x - xy;
496             lo += (a01 & mask) * scale;
497             hi += ((a01 >> 8) & mask) * scale;
498 
499             scale = 16*y - xy;
500             lo += (a10 & mask) * scale;
501             hi += ((a10 >> 8) & mask) * scale;
502 
503             lo += (a11 & mask) * xy;
504             hi += ((a11 >> 8) & mask) * xy;
505 
506             if (alphaScale < 256) {
507                 lo = ((lo >> 8) & mask) * alphaScale;
508                 hi = ((hi >> 8) & mask) * alphaScale;
509             }
510 
511             *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
512         }
513     #endif
514 
515 
516     /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,SkPMColor * colors)517     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
518                                  const uint32_t* xy, int count, SkPMColor* colors) {
519         SkASSERT(count > 0 && colors != nullptr);
520         SkASSERT(s.fBilerp);
521         SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
522         SkASSERT(s.fAlphaScale <= 256);
523 
524         int y0, y1, wy;
525         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
526 
527         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
528              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
529 
530         while (count --> 0) {
531             int x0, x1, wx;
532             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
533 
534             filter_and_scale_by_alpha(wx, wy,
535                                       row0[x0], row0[x1],
536                                       row1[x0], row1[x1],
537                                       colors++,
538                                       s.fAlphaScale);
539         }
540     }
541 
542 #endif
543 
544 #if defined(SK_ARM_HAS_NEON)
545     /*not static*/ inline
S32_alpha_D32_filter_DXDY(const SkBitmapProcState & s,const uint32_t * xy,int count,SkPMColor * colors)546     void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
547                                    const uint32_t* xy, int count, SkPMColor* colors) {
548         SkASSERT(count > 0 && colors != nullptr);
549         SkASSERT(s.fBilerp);
550         SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
551         SkASSERT(s.fAlphaScale <= 256);
552 
553         auto src = (const char*)s.fPixmap.addr();
554         size_t rb = s.fPixmap.rowBytes();
555 
556         while (count --> 0) {
557             int y0, y1, wy,
558                 x0, x1, wx;
559             decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
560             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
561 
562             auto row0 = (const uint32_t*)(src + y0*rb),
563                  row1 = (const uint32_t*)(src + y1*rb);
564 
565             filter_and_scale_by_alpha(wx, wy,
566                                       row0[x0], row0[x1],
567                                       row1[x0], row1[x1],
568                                       colors++,
569                                       s.fAlphaScale);
570         }
571     }
572 #else
573     // It's not yet clear whether it's worthwhile specializing for other architectures.
574     constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
575                                                        const uint32_t*, int, SkPMColor*) = nullptr;
576 #endif
577 
578 }  // namespace SK_OPTS_NS
579 
580 namespace sktests {
581     template <typename U32, typename Out>
decode_packed_coordinates_and_weight(U32 packed,Out * v0,Out * v1,Out * w)582     void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
583         SK_OPTS_NS::decode_packed_coordinates_and_weight<U32, Out>(packed, v0, v1, w);
584     }
585 }
586 
587 #endif
588