// xref: /aosp_15_r20/external/skia/src/opts/SkBitmapProcState_opts.h (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7*c8dee2aaSAndroid Build Coastguard Worker 
8*c8dee2aaSAndroid Build Coastguard Worker #ifndef SkBitmapProcState_opts_DEFINED
9*c8dee2aaSAndroid Build Coastguard Worker #define SkBitmapProcState_opts_DEFINED
10*c8dee2aaSAndroid Build Coastguard Worker 
11*c8dee2aaSAndroid Build Coastguard Worker #include "src/base/SkMSAN.h"
12*c8dee2aaSAndroid Build Coastguard Worker #include "src/base/SkVx.h"
13*c8dee2aaSAndroid Build Coastguard Worker #include "src/core/SkBitmapProcState.h"
14*c8dee2aaSAndroid Build Coastguard Worker 
// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
//
// Only S32_alpha_D32_filter_DX exploits instructions beyond
// our common baseline SSE2/NEON instruction sets, so that's
// all that lives here.
//
// The rest are scattershot at the moment, but I want to get them
// all migrated to be normal code inside SkBitmapProcState.cpp.
23*c8dee2aaSAndroid Build Coastguard Worker 
24*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
25*c8dee2aaSAndroid Build Coastguard Worker     #include <immintrin.h>
26*c8dee2aaSAndroid Build Coastguard Worker #elif defined(SK_ARM_HAS_NEON)
27*c8dee2aaSAndroid Build Coastguard Worker     #include <arm_neon.h>
28*c8dee2aaSAndroid Build Coastguard Worker #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
29*c8dee2aaSAndroid Build Coastguard Worker     #include <lasxintrin.h>
30*c8dee2aaSAndroid Build Coastguard Worker #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
31*c8dee2aaSAndroid Build Coastguard Worker     #include <lsxintrin.h>
32*c8dee2aaSAndroid Build Coastguard Worker #endif
33*c8dee2aaSAndroid Build Coastguard Worker 
34*c8dee2aaSAndroid Build Coastguard Worker namespace SK_OPTS_NS {
35*c8dee2aaSAndroid Build Coastguard Worker 
// Unpacks one packed bilerp entry into two integer coordinates and a lerp weight.
//
// For a 32-bit `packed` value the fields are, from most to least significant bit:
//
//    [ v0 : 14 bits | w : 4 bits | v1 : 14 bits ]
//
//   packed - the packed entry (an X entry or the leading Y entry of an xy buffer).
//   v0     - receives the first  integer coordinate (x0 or y0).
//   v1     - receives the second integer coordinate (x1 or y1).
//   w      - receives the lerp weight for v1, in [0,15]; the weight for v0 is 16-w.
//
// This same basic packing scheme is used throughout the file.
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    *v0 = (packed >> 18);       // Integer coordinate x0 or y0.
    *v1 = (packed & 0x3fff);    // Integer coordinate x1 or y1.
    *w  = (packed >> 14) & 0xf; // Lerp weight for v1; weight for v0 is 16-w.
}
43*c8dee2aaSAndroid Build Coastguard Worker 
44*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
45*c8dee2aaSAndroid Build Coastguard Worker 
46*c8dee2aaSAndroid Build Coastguard Worker     /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)47*c8dee2aaSAndroid Build Coastguard Worker     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
48*c8dee2aaSAndroid Build Coastguard Worker                                  const uint32_t* xy, int count, uint32_t* colors) {
49*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(count > 0 && colors != nullptr);
50*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fBilerp);
51*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
52*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fAlphaScale <= 256);
53*c8dee2aaSAndroid Build Coastguard Worker 
54*c8dee2aaSAndroid Build Coastguard Worker         // interpolate_in_x() is the crux of the SSSE3 implementation,
55*c8dee2aaSAndroid Build Coastguard Worker         // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
56*c8dee2aaSAndroid Build Coastguard Worker         auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
57*c8dee2aaSAndroid Build Coastguard Worker                                    uint32_t B0, uint32_t B1,
58*c8dee2aaSAndroid Build Coastguard Worker                                    __m128i interlaced_x_weights) {
59*c8dee2aaSAndroid Build Coastguard Worker             // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
60*c8dee2aaSAndroid Build Coastguard Worker             //
61*c8dee2aaSAndroid Build Coastguard Worker             // It takes two arguments interlaced byte-wise:
62*c8dee2aaSAndroid Build Coastguard Worker             //    - first  arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
63*c8dee2aaSAndroid Build Coastguard Worker             //    - second arg: [ w,W, ... 7 more pairs of   signed 8-bit values ...]
64*c8dee2aaSAndroid Build Coastguard Worker             // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].
65*c8dee2aaSAndroid Build Coastguard Worker             //
66*c8dee2aaSAndroid Build Coastguard Worker             // That's why we go to all this trouble to make interlaced_x_weights,
67*c8dee2aaSAndroid Build Coastguard Worker             // and here we're about to interlace A0 with A1 and B0 with B1 to match.
68*c8dee2aaSAndroid Build Coastguard Worker             //
69*c8dee2aaSAndroid Build Coastguard Worker             // Our interlaced_x_weights are all in [0,16], and so we need not worry about
70*c8dee2aaSAndroid Build Coastguard Worker             // the signedness of that input nor about the signedness of the output.
71*c8dee2aaSAndroid Build Coastguard Worker 
72*c8dee2aaSAndroid Build Coastguard Worker             __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
73*c8dee2aaSAndroid Build Coastguard Worker                     interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));
74*c8dee2aaSAndroid Build Coastguard Worker 
75*c8dee2aaSAndroid Build Coastguard Worker             return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
76*c8dee2aaSAndroid Build Coastguard Worker                                      interlaced_x_weights);
77*c8dee2aaSAndroid Build Coastguard Worker         };
78*c8dee2aaSAndroid Build Coastguard Worker 
79*c8dee2aaSAndroid Build Coastguard Worker         // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
80*c8dee2aaSAndroid Build Coastguard Worker         // Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
81*c8dee2aaSAndroid Build Coastguard Worker         auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
82*c8dee2aaSAndroid Build Coastguard Worker                                           uint32_t A2, uint32_t A3,
83*c8dee2aaSAndroid Build Coastguard Worker                                           uint32_t B0, uint32_t B1,
84*c8dee2aaSAndroid Build Coastguard Worker                                           uint32_t B2, uint32_t B3,
85*c8dee2aaSAndroid Build Coastguard Worker                                           __m128i interlaced_x_weights,
86*c8dee2aaSAndroid Build Coastguard Worker                                           int wy) {
87*c8dee2aaSAndroid Build Coastguard Worker             // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
88*c8dee2aaSAndroid Build Coastguard Worker             __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
89*c8dee2aaSAndroid Build Coastguard Worker                     bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
90*c8dee2aaSAndroid Build Coastguard Worker 
91*c8dee2aaSAndroid Build Coastguard Worker             // Interpolate in Y.  As in the SSE2 code, we calculate top*(16-wy) + bot*wy
92*c8dee2aaSAndroid Build Coastguard Worker             // as 16*top + (bot-top)*wy to save a multiply.
93*c8dee2aaSAndroid Build Coastguard Worker             __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
94*c8dee2aaSAndroid Build Coastguard Worker                                        _mm_mullo_epi16(_mm_sub_epi16(bot, top),
95*c8dee2aaSAndroid Build Coastguard Worker                                                        _mm_set1_epi16(wy)));
96*c8dee2aaSAndroid Build Coastguard Worker 
97*c8dee2aaSAndroid Build Coastguard Worker             // Scale down by total max weight 16x16 = 256.
98*c8dee2aaSAndroid Build Coastguard Worker             px = _mm_srli_epi16(px, 8);
99*c8dee2aaSAndroid Build Coastguard Worker 
100*c8dee2aaSAndroid Build Coastguard Worker             // Scale by alpha if needed.
101*c8dee2aaSAndroid Build Coastguard Worker             if (s.fAlphaScale < 256) {
102*c8dee2aaSAndroid Build Coastguard Worker                 px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
103*c8dee2aaSAndroid Build Coastguard Worker             }
104*c8dee2aaSAndroid Build Coastguard Worker             return px;
105*c8dee2aaSAndroid Build Coastguard Worker         };
106*c8dee2aaSAndroid Build Coastguard Worker 
107*c8dee2aaSAndroid Build Coastguard Worker         // We're in _DX mode here, so we're only varying in X.
108*c8dee2aaSAndroid Build Coastguard Worker         // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
109*c8dee2aaSAndroid Build Coastguard Worker         // All the other entries in xy will be pairs of X coordinates and the X weight.
110*c8dee2aaSAndroid Build Coastguard Worker         int y0, y1, wy;
111*c8dee2aaSAndroid Build Coastguard Worker         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
112*c8dee2aaSAndroid Build Coastguard Worker 
113*c8dee2aaSAndroid Build Coastguard Worker         auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
114*c8dee2aaSAndroid Build Coastguard Worker              row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
115*c8dee2aaSAndroid Build Coastguard Worker 
116*c8dee2aaSAndroid Build Coastguard Worker         while (count >= 4) {
117*c8dee2aaSAndroid Build Coastguard Worker             // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
118*c8dee2aaSAndroid Build Coastguard Worker             int x0[4],
119*c8dee2aaSAndroid Build Coastguard Worker                 x1[4];
120*c8dee2aaSAndroid Build Coastguard Worker             __m128i wx;
121*c8dee2aaSAndroid Build Coastguard Worker 
122*c8dee2aaSAndroid Build Coastguard Worker             // decode_packed_coordinates_and_weight(), 4x.
123*c8dee2aaSAndroid Build Coastguard Worker             __m128i packed = _mm_loadu_si128((const __m128i*)xy);
124*c8dee2aaSAndroid Build Coastguard Worker             _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
125*c8dee2aaSAndroid Build Coastguard Worker             _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
126*c8dee2aaSAndroid Build Coastguard Worker             wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));  // [0,15]
127*c8dee2aaSAndroid Build Coastguard Worker 
128*c8dee2aaSAndroid Build Coastguard Worker             // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
129*c8dee2aaSAndroid Build Coastguard Worker             // and sixteen minus that as wl for pixels on the left at x0.
130*c8dee2aaSAndroid Build Coastguard Worker             __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
131*c8dee2aaSAndroid Build Coastguard Worker                     wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
132*c8dee2aaSAndroid Build Coastguard Worker 
133*c8dee2aaSAndroid Build Coastguard Worker             // We need to interlace wl and wr for _mm_maddubs_epi16().
134*c8dee2aaSAndroid Build Coastguard Worker             __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
135*c8dee2aaSAndroid Build Coastguard Worker                     interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);
136*c8dee2aaSAndroid Build Coastguard Worker 
137*c8dee2aaSAndroid Build Coastguard Worker             enum { A,B,C,D };
138*c8dee2aaSAndroid Build Coastguard Worker 
139*c8dee2aaSAndroid Build Coastguard Worker             // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
140*c8dee2aaSAndroid Build Coastguard Worker             // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
141*c8dee2aaSAndroid Build Coastguard Worker             __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
142*c8dee2aaSAndroid Build Coastguard Worker                                                 row1[x0[A]], row1[x1[A]],
143*c8dee2aaSAndroid Build Coastguard Worker                                                 row0[x0[B]], row0[x1[B]],
144*c8dee2aaSAndroid Build Coastguard Worker                                                 row1[x0[B]], row1[x1[B]],
145*c8dee2aaSAndroid Build Coastguard Worker                                                 interlaced_x_weights_AB, wy);
146*c8dee2aaSAndroid Build Coastguard Worker 
147*c8dee2aaSAndroid Build Coastguard Worker             // Once more with the other half of the x-weights for two more pixels C,D.
148*c8dee2aaSAndroid Build Coastguard Worker             __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
149*c8dee2aaSAndroid Build Coastguard Worker                                                 row1[x0[C]], row1[x1[C]],
150*c8dee2aaSAndroid Build Coastguard Worker                                                 row0[x0[D]], row0[x1[D]],
151*c8dee2aaSAndroid Build Coastguard Worker                                                 row1[x0[D]], row1[x1[D]],
152*c8dee2aaSAndroid Build Coastguard Worker                                                 interlaced_x_weights_CD, wy);
153*c8dee2aaSAndroid Build Coastguard Worker 
154*c8dee2aaSAndroid Build Coastguard Worker             // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
155*c8dee2aaSAndroid Build Coastguard Worker             _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
156*c8dee2aaSAndroid Build Coastguard Worker             xy     += 4;
157*c8dee2aaSAndroid Build Coastguard Worker             colors += 4;
158*c8dee2aaSAndroid Build Coastguard Worker             count  -= 4;
159*c8dee2aaSAndroid Build Coastguard Worker         }
160*c8dee2aaSAndroid Build Coastguard Worker 
161*c8dee2aaSAndroid Build Coastguard Worker         while (count --> 0) {
162*c8dee2aaSAndroid Build Coastguard Worker             // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
163*c8dee2aaSAndroid Build Coastguard Worker             int x0, x1, wx;
164*c8dee2aaSAndroid Build Coastguard Worker             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
165*c8dee2aaSAndroid Build Coastguard Worker 
166*c8dee2aaSAndroid Build Coastguard Worker             // As above, splat out wx four times as wr, and sixteen minus that as wl.
167*c8dee2aaSAndroid Build Coastguard Worker             __m128i wr = _mm_set1_epi8(wx),     // This splats it out 16 times, but that's fine.
168*c8dee2aaSAndroid Build Coastguard Worker                     wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
169*c8dee2aaSAndroid Build Coastguard Worker 
170*c8dee2aaSAndroid Build Coastguard Worker             __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);
171*c8dee2aaSAndroid Build Coastguard Worker 
172*c8dee2aaSAndroid Build Coastguard Worker             __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
173*c8dee2aaSAndroid Build Coastguard Worker                                                row1[x0], row1[x1],
174*c8dee2aaSAndroid Build Coastguard Worker                                                       0,        0,
175*c8dee2aaSAndroid Build Coastguard Worker                                                       0,        0,
176*c8dee2aaSAndroid Build Coastguard Worker                                                interlaced_x_weights, wy);
177*c8dee2aaSAndroid Build Coastguard Worker 
178*c8dee2aaSAndroid Build Coastguard Worker             *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
179*c8dee2aaSAndroid Build Coastguard Worker         }
180*c8dee2aaSAndroid Build Coastguard Worker     }
181*c8dee2aaSAndroid Build Coastguard Worker 
182*c8dee2aaSAndroid Build Coastguard Worker 
183*c8dee2aaSAndroid Build Coastguard Worker #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
184*c8dee2aaSAndroid Build Coastguard Worker 
185*c8dee2aaSAndroid Build Coastguard Worker     /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)186*c8dee2aaSAndroid Build Coastguard Worker     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
187*c8dee2aaSAndroid Build Coastguard Worker                                  const uint32_t* xy, int count, uint32_t* colors) {
188*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(count > 0 && colors != nullptr);
189*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fBilerp);
190*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
191*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fAlphaScale <= 256);
192*c8dee2aaSAndroid Build Coastguard Worker 
193*c8dee2aaSAndroid Build Coastguard Worker         int y0, y1, wy;
194*c8dee2aaSAndroid Build Coastguard Worker         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
195*c8dee2aaSAndroid Build Coastguard Worker 
196*c8dee2aaSAndroid Build Coastguard Worker         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
197*c8dee2aaSAndroid Build Coastguard Worker              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
198*c8dee2aaSAndroid Build Coastguard Worker 
199*c8dee2aaSAndroid Build Coastguard Worker         // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
200*c8dee2aaSAndroid Build Coastguard Worker         // and another in the upper 4 16-bit lanes to line up with 16 - wy.
201*c8dee2aaSAndroid Build Coastguard Worker         const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
202*c8dee2aaSAndroid Build Coastguard Worker                                                 _mm_set1_epi16(16-wy));  // Top pixel goes here.
203*c8dee2aaSAndroid Build Coastguard Worker 
204*c8dee2aaSAndroid Build Coastguard Worker         while (count --> 0) {
205*c8dee2aaSAndroid Build Coastguard Worker             int x0, x1, wx;
206*c8dee2aaSAndroid Build Coastguard Worker             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
207*c8dee2aaSAndroid Build Coastguard Worker 
208*c8dee2aaSAndroid Build Coastguard Worker             // Load the 4 pixels we're interpolating, in this grid:
209*c8dee2aaSAndroid Build Coastguard Worker             //    | tl  tr |
210*c8dee2aaSAndroid Build Coastguard Worker             //    | bl  br |
211*c8dee2aaSAndroid Build Coastguard Worker             const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
212*c8dee2aaSAndroid Build Coastguard Worker                           bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);
213*c8dee2aaSAndroid Build Coastguard Worker 
214*c8dee2aaSAndroid Build Coastguard Worker             // We want to calculate a sum of 4 pixels weighted in two directions:
215*c8dee2aaSAndroid Build Coastguard Worker             //
216*c8dee2aaSAndroid Build Coastguard Worker             //  sum = tl * (16-wy) * (16-wx)
217*c8dee2aaSAndroid Build Coastguard Worker             //      + bl * (   wy) * (16-wx)
218*c8dee2aaSAndroid Build Coastguard Worker             //      + tr * (16-wy) * (   wx)
219*c8dee2aaSAndroid Build Coastguard Worker             //      + br * (   wy) * (   wx)
220*c8dee2aaSAndroid Build Coastguard Worker             //
221*c8dee2aaSAndroid Build Coastguard Worker             // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
222*c8dee2aaSAndroid Build Coastguard Worker             //
223*c8dee2aaSAndroid Build Coastguard Worker             // We've already prepared allY as a vector containing [wy, 16-wy] as a way
224*c8dee2aaSAndroid Build Coastguard Worker             // to apply those y-direction weights.  So we'll start on the x-direction
225*c8dee2aaSAndroid Build Coastguard Worker             // first, grouping into left and right halves, lined up with allY:
226*c8dee2aaSAndroid Build Coastguard Worker             //
227*c8dee2aaSAndroid Build Coastguard Worker             //     L = [bl, tl]
228*c8dee2aaSAndroid Build Coastguard Worker             //     R = [br, tr]
229*c8dee2aaSAndroid Build Coastguard Worker             //
230*c8dee2aaSAndroid Build Coastguard Worker             //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
231*c8dee2aaSAndroid Build Coastguard Worker             //
232*c8dee2aaSAndroid Build Coastguard Worker             // Rewriting that one more step, we can replace a multiply with a shift:
233*c8dee2aaSAndroid Build Coastguard Worker             //
234*c8dee2aaSAndroid Build Coastguard Worker             //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
235*c8dee2aaSAndroid Build Coastguard Worker             //
236*c8dee2aaSAndroid Build Coastguard Worker             // That's how we'll actually do this math.
237*c8dee2aaSAndroid Build Coastguard Worker 
238*c8dee2aaSAndroid Build Coastguard Worker             __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
239*c8dee2aaSAndroid Build Coastguard Worker                     R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());
240*c8dee2aaSAndroid Build Coastguard Worker 
241*c8dee2aaSAndroid Build Coastguard Worker             __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
242*c8dee2aaSAndroid Build Coastguard Worker                                           _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));
243*c8dee2aaSAndroid Build Coastguard Worker 
244*c8dee2aaSAndroid Build Coastguard Worker             __m128i sum_in_x = _mm_mullo_epi16(inner, allY);
245*c8dee2aaSAndroid Build Coastguard Worker 
246*c8dee2aaSAndroid Build Coastguard Worker             // sum = horizontalSum( ... )
247*c8dee2aaSAndroid Build Coastguard Worker             __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
248*c8dee2aaSAndroid Build Coastguard Worker 
249*c8dee2aaSAndroid Build Coastguard Worker             // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
250*c8dee2aaSAndroid Build Coastguard Worker             sum = _mm_srli_epi16(sum, 8);
251*c8dee2aaSAndroid Build Coastguard Worker 
252*c8dee2aaSAndroid Build Coastguard Worker             if (s.fAlphaScale < 256) {
253*c8dee2aaSAndroid Build Coastguard Worker                 // Scale by alpha, which is in [0,256].
254*c8dee2aaSAndroid Build Coastguard Worker                 sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
255*c8dee2aaSAndroid Build Coastguard Worker                 sum = _mm_srli_epi16(sum, 8);
256*c8dee2aaSAndroid Build Coastguard Worker             }
257*c8dee2aaSAndroid Build Coastguard Worker 
258*c8dee2aaSAndroid Build Coastguard Worker             // Pack back into 8-bit values and store.
259*c8dee2aaSAndroid Build Coastguard Worker             *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
260*c8dee2aaSAndroid Build Coastguard Worker         }
261*c8dee2aaSAndroid Build Coastguard Worker     }
262*c8dee2aaSAndroid Build Coastguard Worker 
263*c8dee2aaSAndroid Build Coastguard Worker #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
264*c8dee2aaSAndroid Build Coastguard Worker     /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)265*c8dee2aaSAndroid Build Coastguard Worker     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
266*c8dee2aaSAndroid Build Coastguard Worker                                  const uint32_t* xy, int count, uint32_t* colors) {
267*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(count > 0 && colors != nullptr);
268*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fBilerp);
269*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
270*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fAlphaScale <= 256);
271*c8dee2aaSAndroid Build Coastguard Worker 
272*c8dee2aaSAndroid Build Coastguard Worker         int y0, y1, wy;
273*c8dee2aaSAndroid Build Coastguard Worker         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
274*c8dee2aaSAndroid Build Coastguard Worker 
275*c8dee2aaSAndroid Build Coastguard Worker         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
276*c8dee2aaSAndroid Build Coastguard Worker              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
277*c8dee2aaSAndroid Build Coastguard Worker 
278*c8dee2aaSAndroid Build Coastguard Worker         // We'll put one pixel in the low 16 16-bit lanes to line up with wy,
279*c8dee2aaSAndroid Build Coastguard Worker         // and another in the upper 16 16-bit lanes to line up with 16 - wy.
280*c8dee2aaSAndroid Build Coastguard Worker         __m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));
281*c8dee2aaSAndroid Build Coastguard Worker 
282*c8dee2aaSAndroid Build Coastguard Worker         while (count --> 0) {
283*c8dee2aaSAndroid Build Coastguard Worker             int x0, x1, wx;
284*c8dee2aaSAndroid Build Coastguard Worker             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
285*c8dee2aaSAndroid Build Coastguard Worker 
286*c8dee2aaSAndroid Build Coastguard Worker             // Load the 4 pixels we're interpolating, in this grid:
287*c8dee2aaSAndroid Build Coastguard Worker             //    | tl  tr |
288*c8dee2aaSAndroid Build Coastguard Worker             //    | bl  br |
289*c8dee2aaSAndroid Build Coastguard Worker 
290*c8dee2aaSAndroid Build Coastguard Worker             const __m256i zeros = __lasx_xvldi(0);
291*c8dee2aaSAndroid Build Coastguard Worker             const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0),
292*c8dee2aaSAndroid Build Coastguard Worker                           tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0),
293*c8dee2aaSAndroid Build Coastguard Worker                           bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0),
294*c8dee2aaSAndroid Build Coastguard Worker                           br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0);
295*c8dee2aaSAndroid Build Coastguard Worker 
296*c8dee2aaSAndroid Build Coastguard Worker             // We want to calculate a sum of 8 pixels weighted in two directions:
297*c8dee2aaSAndroid Build Coastguard Worker             //
298*c8dee2aaSAndroid Build Coastguard Worker             //  sum = tl * (16-wy) * (16-wx)
299*c8dee2aaSAndroid Build Coastguard Worker             //      + bl * (   wy) * (16-wx)
300*c8dee2aaSAndroid Build Coastguard Worker             //      + tr * (16-wy) * (   wx)
301*c8dee2aaSAndroid Build Coastguard Worker             //      + br * (   wy) * (   wx)
302*c8dee2aaSAndroid Build Coastguard Worker             //
303*c8dee2aaSAndroid Build Coastguard Worker             // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
304*c8dee2aaSAndroid Build Coastguard Worker             //
305*c8dee2aaSAndroid Build Coastguard Worker             // We've already prepared allY as a vector containing [wy, 16-wy] as a way
306*c8dee2aaSAndroid Build Coastguard Worker             // to apply those y-direction weights.  So we'll start on the x-direction
307*c8dee2aaSAndroid Build Coastguard Worker             // first, grouping into left and right halves, lined up with allY:
308*c8dee2aaSAndroid Build Coastguard Worker             //
309*c8dee2aaSAndroid Build Coastguard Worker             //     L = [bl, tl]
310*c8dee2aaSAndroid Build Coastguard Worker             //     R = [br, tr]
311*c8dee2aaSAndroid Build Coastguard Worker             //
312*c8dee2aaSAndroid Build Coastguard Worker             //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
313*c8dee2aaSAndroid Build Coastguard Worker             //
314*c8dee2aaSAndroid Build Coastguard Worker             // Rewriting that one more step, we can replace a multiply with a shift:
315*c8dee2aaSAndroid Build Coastguard Worker             //
316*c8dee2aaSAndroid Build Coastguard Worker             //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
317*c8dee2aaSAndroid Build Coastguard Worker             //
318*c8dee2aaSAndroid Build Coastguard Worker             // That's how we'll actually do this math.
319*c8dee2aaSAndroid Build Coastguard Worker 
320*c8dee2aaSAndroid Build Coastguard Worker             __m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
321*c8dee2aaSAndroid Build Coastguard Worker                     R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));
322*c8dee2aaSAndroid Build Coastguard Worker 
323*c8dee2aaSAndroid Build Coastguard Worker             __m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4),
324*c8dee2aaSAndroid Build Coastguard Worker                                            __lasx_xvmul_h(__lasx_xvsub_h(R,L),
325*c8dee2aaSAndroid Build Coastguard Worker                                                           __lasx_xvreplgr2vr_h(wx)));
326*c8dee2aaSAndroid Build Coastguard Worker 
327*c8dee2aaSAndroid Build Coastguard Worker             __m256i sum_in_x = __lasx_xvmul_h(inner, allY);
328*c8dee2aaSAndroid Build Coastguard Worker 
329*c8dee2aaSAndroid Build Coastguard Worker             // sum = horizontalSum( ... )
330*c8dee2aaSAndroid Build Coastguard Worker             __m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));
331*c8dee2aaSAndroid Build Coastguard Worker 
332*c8dee2aaSAndroid Build Coastguard Worker             // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
333*c8dee2aaSAndroid Build Coastguard Worker             sum = __lasx_xvsrli_h(sum, 8);
334*c8dee2aaSAndroid Build Coastguard Worker 
335*c8dee2aaSAndroid Build Coastguard Worker             if (s.fAlphaScale < 256) {
336*c8dee2aaSAndroid Build Coastguard Worker                 // Scale by alpha, which is in [0,256].
337*c8dee2aaSAndroid Build Coastguard Worker                 sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
338*c8dee2aaSAndroid Build Coastguard Worker                 sum = __lasx_xvsrli_h(sum, 8);
339*c8dee2aaSAndroid Build Coastguard Worker             }
340*c8dee2aaSAndroid Build Coastguard Worker 
341*c8dee2aaSAndroid Build Coastguard Worker             // Pack back into 8-bit values and store.
342*c8dee2aaSAndroid Build Coastguard Worker             *colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
343*c8dee2aaSAndroid Build Coastguard Worker                                                                __lasx_xvsat_hu(sum, 8)), 0);
344*c8dee2aaSAndroid Build Coastguard Worker         }
345*c8dee2aaSAndroid Build Coastguard Worker     }
346*c8dee2aaSAndroid Build Coastguard Worker 
347*c8dee2aaSAndroid Build Coastguard Worker #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
348*c8dee2aaSAndroid Build Coastguard Worker 
349*c8dee2aaSAndroid Build Coastguard Worker     /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)350*c8dee2aaSAndroid Build Coastguard Worker     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
351*c8dee2aaSAndroid Build Coastguard Worker                                  const uint32_t* xy, int count, uint32_t* colors) {
352*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(count > 0 && colors != nullptr);
353*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fBilerp);
354*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
355*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fAlphaScale <= 256);
356*c8dee2aaSAndroid Build Coastguard Worker 
357*c8dee2aaSAndroid Build Coastguard Worker         int y0, y1, wy;
358*c8dee2aaSAndroid Build Coastguard Worker         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
359*c8dee2aaSAndroid Build Coastguard Worker 
360*c8dee2aaSAndroid Build Coastguard Worker         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
361*c8dee2aaSAndroid Build Coastguard Worker              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
362*c8dee2aaSAndroid Build Coastguard Worker 
363*c8dee2aaSAndroid Build Coastguard Worker         // We'll put one pixel in the low 8 16-bit lanes to line up with wy,
364*c8dee2aaSAndroid Build Coastguard Worker         // and another in the upper 8 16-bit lanes to line up with 16 - wy.
365*c8dee2aaSAndroid Build Coastguard Worker         __m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));
366*c8dee2aaSAndroid Build Coastguard Worker 
367*c8dee2aaSAndroid Build Coastguard Worker         while (count --> 0) {
368*c8dee2aaSAndroid Build Coastguard Worker             int x0, x1, wx;
369*c8dee2aaSAndroid Build Coastguard Worker             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
370*c8dee2aaSAndroid Build Coastguard Worker 
371*c8dee2aaSAndroid Build Coastguard Worker             // Load the 4 pixels we're interpolating, in this grid:
372*c8dee2aaSAndroid Build Coastguard Worker             //    | tl  tr |
373*c8dee2aaSAndroid Build Coastguard Worker             //    | bl  br |
374*c8dee2aaSAndroid Build Coastguard Worker             const __m128i zeros = __lsx_vldi(0);
375*c8dee2aaSAndroid Build Coastguard Worker             const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0),
376*c8dee2aaSAndroid Build Coastguard Worker                           tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0),
377*c8dee2aaSAndroid Build Coastguard Worker                           bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0),
378*c8dee2aaSAndroid Build Coastguard Worker                           br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0);
379*c8dee2aaSAndroid Build Coastguard Worker 
380*c8dee2aaSAndroid Build Coastguard Worker             // We want to calculate a sum of 8 pixels weighted in two directions:
381*c8dee2aaSAndroid Build Coastguard Worker             //
382*c8dee2aaSAndroid Build Coastguard Worker             //  sum = tl * (16-wy) * (16-wx)
383*c8dee2aaSAndroid Build Coastguard Worker             //      + bl * (   wy) * (16-wx)
384*c8dee2aaSAndroid Build Coastguard Worker             //      + tr * (16-wy) * (   wx)
385*c8dee2aaSAndroid Build Coastguard Worker             //      + br * (   wy) * (   wx)
386*c8dee2aaSAndroid Build Coastguard Worker             //
387*c8dee2aaSAndroid Build Coastguard Worker             // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
388*c8dee2aaSAndroid Build Coastguard Worker             //
389*c8dee2aaSAndroid Build Coastguard Worker             // We've already prepared allY as a vector containing [wy, 16-wy] as a way
390*c8dee2aaSAndroid Build Coastguard Worker             // to apply those y-direction weights.  So we'll start on the x-direction
391*c8dee2aaSAndroid Build Coastguard Worker             // first, grouping into left and right halves, lined up with allY:
392*c8dee2aaSAndroid Build Coastguard Worker             //
393*c8dee2aaSAndroid Build Coastguard Worker             //     L = [bl, tl]
394*c8dee2aaSAndroid Build Coastguard Worker             //     R = [br, tr]
395*c8dee2aaSAndroid Build Coastguard Worker             //
396*c8dee2aaSAndroid Build Coastguard Worker             //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
397*c8dee2aaSAndroid Build Coastguard Worker             //
398*c8dee2aaSAndroid Build Coastguard Worker             // Rewriting that one more step, we can replace a multiply with a shift:
399*c8dee2aaSAndroid Build Coastguard Worker             //
400*c8dee2aaSAndroid Build Coastguard Worker             //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
401*c8dee2aaSAndroid Build Coastguard Worker             //
402*c8dee2aaSAndroid Build Coastguard Worker             // That's how we'll actually do this math.
403*c8dee2aaSAndroid Build Coastguard Worker 
404*c8dee2aaSAndroid Build Coastguard Worker 
405*c8dee2aaSAndroid Build Coastguard Worker             __m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
406*c8dee2aaSAndroid Build Coastguard Worker                     R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));
407*c8dee2aaSAndroid Build Coastguard Worker 
408*c8dee2aaSAndroid Build Coastguard Worker             __m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4),
409*c8dee2aaSAndroid Build Coastguard Worker                                          __lsx_vmul_h(__lsx_vsub_h(R,L),
410*c8dee2aaSAndroid Build Coastguard Worker                                                       __lsx_vreplgr2vr_h(wx)));
411*c8dee2aaSAndroid Build Coastguard Worker 
412*c8dee2aaSAndroid Build Coastguard Worker             __m128i sum_in_x = __lsx_vmul_h(inner, allY);
413*c8dee2aaSAndroid Build Coastguard Worker 
414*c8dee2aaSAndroid Build Coastguard Worker             // sum = horizontalSum( ... )
415*c8dee2aaSAndroid Build Coastguard Worker             __m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));
416*c8dee2aaSAndroid Build Coastguard Worker 
417*c8dee2aaSAndroid Build Coastguard Worker             // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
418*c8dee2aaSAndroid Build Coastguard Worker             sum = __lsx_vsrli_h(sum, 8);
419*c8dee2aaSAndroid Build Coastguard Worker 
420*c8dee2aaSAndroid Build Coastguard Worker             if (s.fAlphaScale < 256) {
421*c8dee2aaSAndroid Build Coastguard Worker                 // Scale by alpha, which is in [0,256].
422*c8dee2aaSAndroid Build Coastguard Worker                 sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
423*c8dee2aaSAndroid Build Coastguard Worker                 sum = __lsx_vsrli_h(sum, 8);
424*c8dee2aaSAndroid Build Coastguard Worker             }
425*c8dee2aaSAndroid Build Coastguard Worker 
426*c8dee2aaSAndroid Build Coastguard Worker             // Pack back into 8-bit values and store.
427*c8dee2aaSAndroid Build Coastguard Worker             *colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
428*c8dee2aaSAndroid Build Coastguard Worker                                                            __lsx_vsat_hu(sum, 8)), 0);
429*c8dee2aaSAndroid Build Coastguard Worker         }
430*c8dee2aaSAndroid Build Coastguard Worker     }
431*c8dee2aaSAndroid Build Coastguard Worker 
432*c8dee2aaSAndroid Build Coastguard Worker #else
433*c8dee2aaSAndroid Build Coastguard Worker 
434*c8dee2aaSAndroid Build Coastguard Worker     // The NEON code only actually differs from the portable code in the
435*c8dee2aaSAndroid Build Coastguard Worker     // filtering step after we've loaded all four pixels we want to bilerp.
436*c8dee2aaSAndroid Build Coastguard Worker 
437*c8dee2aaSAndroid Build Coastguard Worker     #if defined(SK_ARM_HAS_NEON)
filter_and_scale_by_alpha(unsigned x,unsigned y,SkPMColor a00,SkPMColor a01,SkPMColor a10,SkPMColor a11,SkPMColor * dst,uint16_t scale)438*c8dee2aaSAndroid Build Coastguard Worker         static void filter_and_scale_by_alpha(unsigned x, unsigned y,
439*c8dee2aaSAndroid Build Coastguard Worker                                               SkPMColor a00, SkPMColor a01,
440*c8dee2aaSAndroid Build Coastguard Worker                                               SkPMColor a10, SkPMColor a11,
441*c8dee2aaSAndroid Build Coastguard Worker                                               SkPMColor *dst,
442*c8dee2aaSAndroid Build Coastguard Worker                                               uint16_t scale) {
443*c8dee2aaSAndroid Build Coastguard Worker             uint8x8_t vy, vconst16_8, v16_y, vres;
444*c8dee2aaSAndroid Build Coastguard Worker             uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
445*c8dee2aaSAndroid Build Coastguard Worker             uint32x2_t va0, va1;
446*c8dee2aaSAndroid Build Coastguard Worker             uint16x8_t tmp1, tmp2;
447*c8dee2aaSAndroid Build Coastguard Worker 
448*c8dee2aaSAndroid Build Coastguard Worker             vy = vdup_n_u8(y);                // duplicate y into vy
449*c8dee2aaSAndroid Build Coastguard Worker             vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
450*c8dee2aaSAndroid Build Coastguard Worker             v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y
451*c8dee2aaSAndroid Build Coastguard Worker 
452*c8dee2aaSAndroid Build Coastguard Worker             va0 = vdup_n_u32(a00);            // duplicate a00
453*c8dee2aaSAndroid Build Coastguard Worker             va1 = vdup_n_u32(a10);            // duplicate a10
454*c8dee2aaSAndroid Build Coastguard Worker             va0 = vset_lane_u32(a01, va0, 1); // set top to a01
455*c8dee2aaSAndroid Build Coastguard Worker             va1 = vset_lane_u32(a11, va1, 1); // set top to a11
456*c8dee2aaSAndroid Build Coastguard Worker 
457*c8dee2aaSAndroid Build Coastguard Worker             tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
458*c8dee2aaSAndroid Build Coastguard Worker             tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y
459*c8dee2aaSAndroid Build Coastguard Worker 
460*c8dee2aaSAndroid Build Coastguard Worker             vx = vdup_n_u16(x);                // duplicate x into vx
461*c8dee2aaSAndroid Build Coastguard Worker             vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
462*c8dee2aaSAndroid Build Coastguard Worker             v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
463*c8dee2aaSAndroid Build Coastguard Worker 
464*c8dee2aaSAndroid Build Coastguard Worker             tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
465*c8dee2aaSAndroid Build Coastguard Worker             tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
466*c8dee2aaSAndroid Build Coastguard Worker             tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
467*c8dee2aaSAndroid Build Coastguard Worker             tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
468*c8dee2aaSAndroid Build Coastguard Worker 
469*c8dee2aaSAndroid Build Coastguard Worker             if (scale < 256) {
470*c8dee2aaSAndroid Build Coastguard Worker                 vscale = vdup_n_u16(scale);        // duplicate scale
471*c8dee2aaSAndroid Build Coastguard Worker                 tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
472*c8dee2aaSAndroid Build Coastguard Worker                 tmp = vmul_u16(tmp, vscale);       // multiply result by scale
473*c8dee2aaSAndroid Build Coastguard Worker             }
474*c8dee2aaSAndroid Build Coastguard Worker 
475*c8dee2aaSAndroid Build Coastguard Worker             vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
476*c8dee2aaSAndroid Build Coastguard Worker             vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
477*c8dee2aaSAndroid Build Coastguard Worker         }
478*c8dee2aaSAndroid Build Coastguard Worker     #else
filter_and_scale_by_alpha(unsigned x,unsigned y,SkPMColor a00,SkPMColor a01,SkPMColor a10,SkPMColor a11,SkPMColor * dstColor,unsigned alphaScale)479*c8dee2aaSAndroid Build Coastguard Worker         static void filter_and_scale_by_alpha(unsigned x, unsigned y,
480*c8dee2aaSAndroid Build Coastguard Worker                                               SkPMColor a00, SkPMColor a01,
481*c8dee2aaSAndroid Build Coastguard Worker                                               SkPMColor a10, SkPMColor a11,
482*c8dee2aaSAndroid Build Coastguard Worker                                               SkPMColor* dstColor,
483*c8dee2aaSAndroid Build Coastguard Worker                                               unsigned alphaScale) {
484*c8dee2aaSAndroid Build Coastguard Worker             SkASSERT((unsigned)x <= 0xF);
485*c8dee2aaSAndroid Build Coastguard Worker             SkASSERT((unsigned)y <= 0xF);
486*c8dee2aaSAndroid Build Coastguard Worker             SkASSERT(alphaScale <= 256);
487*c8dee2aaSAndroid Build Coastguard Worker 
488*c8dee2aaSAndroid Build Coastguard Worker             int xy = x * y;
489*c8dee2aaSAndroid Build Coastguard Worker             const uint32_t mask = 0xFF00FF;
490*c8dee2aaSAndroid Build Coastguard Worker 
491*c8dee2aaSAndroid Build Coastguard Worker             int scale = 256 - 16*y - 16*x + xy;
492*c8dee2aaSAndroid Build Coastguard Worker             uint32_t lo = (a00 & mask) * scale;
493*c8dee2aaSAndroid Build Coastguard Worker             uint32_t hi = ((a00 >> 8) & mask) * scale;
494*c8dee2aaSAndroid Build Coastguard Worker 
495*c8dee2aaSAndroid Build Coastguard Worker             scale = 16*x - xy;
496*c8dee2aaSAndroid Build Coastguard Worker             lo += (a01 & mask) * scale;
497*c8dee2aaSAndroid Build Coastguard Worker             hi += ((a01 >> 8) & mask) * scale;
498*c8dee2aaSAndroid Build Coastguard Worker 
499*c8dee2aaSAndroid Build Coastguard Worker             scale = 16*y - xy;
500*c8dee2aaSAndroid Build Coastguard Worker             lo += (a10 & mask) * scale;
501*c8dee2aaSAndroid Build Coastguard Worker             hi += ((a10 >> 8) & mask) * scale;
502*c8dee2aaSAndroid Build Coastguard Worker 
503*c8dee2aaSAndroid Build Coastguard Worker             lo += (a11 & mask) * xy;
504*c8dee2aaSAndroid Build Coastguard Worker             hi += ((a11 >> 8) & mask) * xy;
505*c8dee2aaSAndroid Build Coastguard Worker 
506*c8dee2aaSAndroid Build Coastguard Worker             if (alphaScale < 256) {
507*c8dee2aaSAndroid Build Coastguard Worker                 lo = ((lo >> 8) & mask) * alphaScale;
508*c8dee2aaSAndroid Build Coastguard Worker                 hi = ((hi >> 8) & mask) * alphaScale;
509*c8dee2aaSAndroid Build Coastguard Worker             }
510*c8dee2aaSAndroid Build Coastguard Worker 
511*c8dee2aaSAndroid Build Coastguard Worker             *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
512*c8dee2aaSAndroid Build Coastguard Worker         }
513*c8dee2aaSAndroid Build Coastguard Worker     #endif
514*c8dee2aaSAndroid Build Coastguard Worker 
515*c8dee2aaSAndroid Build Coastguard Worker 
516*c8dee2aaSAndroid Build Coastguard Worker     /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,SkPMColor * colors)517*c8dee2aaSAndroid Build Coastguard Worker     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
518*c8dee2aaSAndroid Build Coastguard Worker                                  const uint32_t* xy, int count, SkPMColor* colors) {
519*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(count > 0 && colors != nullptr);
520*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fBilerp);
521*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
522*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fAlphaScale <= 256);
523*c8dee2aaSAndroid Build Coastguard Worker 
524*c8dee2aaSAndroid Build Coastguard Worker         int y0, y1, wy;
525*c8dee2aaSAndroid Build Coastguard Worker         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
526*c8dee2aaSAndroid Build Coastguard Worker 
527*c8dee2aaSAndroid Build Coastguard Worker         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
528*c8dee2aaSAndroid Build Coastguard Worker              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
529*c8dee2aaSAndroid Build Coastguard Worker 
530*c8dee2aaSAndroid Build Coastguard Worker         while (count --> 0) {
531*c8dee2aaSAndroid Build Coastguard Worker             int x0, x1, wx;
532*c8dee2aaSAndroid Build Coastguard Worker             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
533*c8dee2aaSAndroid Build Coastguard Worker 
534*c8dee2aaSAndroid Build Coastguard Worker             filter_and_scale_by_alpha(wx, wy,
535*c8dee2aaSAndroid Build Coastguard Worker                                       row0[x0], row0[x1],
536*c8dee2aaSAndroid Build Coastguard Worker                                       row1[x0], row1[x1],
537*c8dee2aaSAndroid Build Coastguard Worker                                       colors++,
538*c8dee2aaSAndroid Build Coastguard Worker                                       s.fAlphaScale);
539*c8dee2aaSAndroid Build Coastguard Worker         }
540*c8dee2aaSAndroid Build Coastguard Worker     }
541*c8dee2aaSAndroid Build Coastguard Worker 
542*c8dee2aaSAndroid Build Coastguard Worker #endif
543*c8dee2aaSAndroid Build Coastguard Worker 
544*c8dee2aaSAndroid Build Coastguard Worker #if defined(SK_ARM_HAS_NEON)
545*c8dee2aaSAndroid Build Coastguard Worker     /*not static*/ inline
S32_alpha_D32_filter_DXDY(const SkBitmapProcState & s,const uint32_t * xy,int count,SkPMColor * colors)546*c8dee2aaSAndroid Build Coastguard Worker     void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
547*c8dee2aaSAndroid Build Coastguard Worker                                    const uint32_t* xy, int count, SkPMColor* colors) {
548*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(count > 0 && colors != nullptr);
549*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fBilerp);
550*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
551*c8dee2aaSAndroid Build Coastguard Worker         SkASSERT(s.fAlphaScale <= 256);
552*c8dee2aaSAndroid Build Coastguard Worker 
553*c8dee2aaSAndroid Build Coastguard Worker         auto src = (const char*)s.fPixmap.addr();
554*c8dee2aaSAndroid Build Coastguard Worker         size_t rb = s.fPixmap.rowBytes();
555*c8dee2aaSAndroid Build Coastguard Worker 
556*c8dee2aaSAndroid Build Coastguard Worker         while (count --> 0) {
557*c8dee2aaSAndroid Build Coastguard Worker             int y0, y1, wy,
558*c8dee2aaSAndroid Build Coastguard Worker                 x0, x1, wx;
559*c8dee2aaSAndroid Build Coastguard Worker             decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
560*c8dee2aaSAndroid Build Coastguard Worker             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
561*c8dee2aaSAndroid Build Coastguard Worker 
562*c8dee2aaSAndroid Build Coastguard Worker             auto row0 = (const uint32_t*)(src + y0*rb),
563*c8dee2aaSAndroid Build Coastguard Worker                  row1 = (const uint32_t*)(src + y1*rb);
564*c8dee2aaSAndroid Build Coastguard Worker 
565*c8dee2aaSAndroid Build Coastguard Worker             filter_and_scale_by_alpha(wx, wy,
566*c8dee2aaSAndroid Build Coastguard Worker                                       row0[x0], row0[x1],
567*c8dee2aaSAndroid Build Coastguard Worker                                       row1[x0], row1[x1],
568*c8dee2aaSAndroid Build Coastguard Worker                                       colors++,
569*c8dee2aaSAndroid Build Coastguard Worker                                       s.fAlphaScale);
570*c8dee2aaSAndroid Build Coastguard Worker         }
571*c8dee2aaSAndroid Build Coastguard Worker     }
572*c8dee2aaSAndroid Build Coastguard Worker #else
573*c8dee2aaSAndroid Build Coastguard Worker     // It's not yet clear whether it's worthwhile specializing for other architectures.
574*c8dee2aaSAndroid Build Coastguard Worker     constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
575*c8dee2aaSAndroid Build Coastguard Worker                                                        const uint32_t*, int, SkPMColor*) = nullptr;
576*c8dee2aaSAndroid Build Coastguard Worker #endif
577*c8dee2aaSAndroid Build Coastguard Worker 
578*c8dee2aaSAndroid Build Coastguard Worker }  // namespace SK_OPTS_NS
579*c8dee2aaSAndroid Build Coastguard Worker 
580*c8dee2aaSAndroid Build Coastguard Worker namespace sktests {
581*c8dee2aaSAndroid Build Coastguard Worker     template <typename U32, typename Out>
decode_packed_coordinates_and_weight(U32 packed,Out * v0,Out * v1,Out * w)582*c8dee2aaSAndroid Build Coastguard Worker     void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
583*c8dee2aaSAndroid Build Coastguard Worker         SK_OPTS_NS::decode_packed_coordinates_and_weight<U32, Out>(packed, v0, v1, w);
584*c8dee2aaSAndroid Build Coastguard Worker     }
585*c8dee2aaSAndroid Build Coastguard Worker }
586*c8dee2aaSAndroid Build Coastguard Worker 
587*c8dee2aaSAndroid Build Coastguard Worker #endif
588