1*c8dee2aaSAndroid Build Coastguard Worker /*
2*c8dee2aaSAndroid Build Coastguard Worker * Copyright 2018 Google Inc.
3*c8dee2aaSAndroid Build Coastguard Worker *
4*c8dee2aaSAndroid Build Coastguard Worker * Use of this source code is governed by a BSD-style license that can be
5*c8dee2aaSAndroid Build Coastguard Worker * found in the LICENSE file.
6*c8dee2aaSAndroid Build Coastguard Worker */
7*c8dee2aaSAndroid Build Coastguard Worker
8*c8dee2aaSAndroid Build Coastguard Worker #ifndef SkBitmapProcState_opts_DEFINED
9*c8dee2aaSAndroid Build Coastguard Worker #define SkBitmapProcState_opts_DEFINED
10*c8dee2aaSAndroid Build Coastguard Worker
11*c8dee2aaSAndroid Build Coastguard Worker #include "src/base/SkMSAN.h"
12*c8dee2aaSAndroid Build Coastguard Worker #include "src/base/SkVx.h"
13*c8dee2aaSAndroid Build Coastguard Worker #include "src/core/SkBitmapProcState.h"
14*c8dee2aaSAndroid Build Coastguard Worker
15*c8dee2aaSAndroid Build Coastguard Worker // SkBitmapProcState optimized Shader, Sample, or Matrix procs.
16*c8dee2aaSAndroid Build Coastguard Worker //
17*c8dee2aaSAndroid Build Coastguard Worker // Only S32_alpha_D32_filter_DX exploits instructions beyond
18*c8dee2aaSAndroid Build Coastguard Worker // our common baseline SSE2/NEON instruction sets, so that's
19*c8dee2aaSAndroid Build Coastguard Worker // all that lives here.
20*c8dee2aaSAndroid Build Coastguard Worker //
21*c8dee2aaSAndroid Build Coastguard Worker // The rest are scattershot at the moment but I want to get them
22*c8dee2aaSAndroid Build Coastguard Worker // all migrated to be normal code inside SkBitmapProcState.cpp.
23*c8dee2aaSAndroid Build Coastguard Worker
24*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
25*c8dee2aaSAndroid Build Coastguard Worker #include <immintrin.h>
26*c8dee2aaSAndroid Build Coastguard Worker #elif defined(SK_ARM_HAS_NEON)
27*c8dee2aaSAndroid Build Coastguard Worker #include <arm_neon.h>
28*c8dee2aaSAndroid Build Coastguard Worker #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
29*c8dee2aaSAndroid Build Coastguard Worker #include <lasxintrin.h>
30*c8dee2aaSAndroid Build Coastguard Worker #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
31*c8dee2aaSAndroid Build Coastguard Worker #include <lsxintrin.h>
32*c8dee2aaSAndroid Build Coastguard Worker #endif
33*c8dee2aaSAndroid Build Coastguard Worker
34*c8dee2aaSAndroid Build Coastguard Worker namespace SK_OPTS_NS {
35*c8dee2aaSAndroid Build Coastguard Worker
36*c8dee2aaSAndroid Build Coastguard Worker // This same basic packing scheme is used throughout the file.
// Splits one packed bilerp sample descriptor into its three fields.
//
// Layout of `packed`, from the low bit upward:
//   bits  0..13  -> *v1  (second integer coordinate, x1 or y1)
//   bits 14..17  -> *w   (lerp weight applied to v1; v0 gets 16 - w)
//   bits 18..    -> *v0  (first integer coordinate, x0 or y0)
//
// The same packing is shared by every sampler in this file.
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    const auto first_coord  = packed >> 18;
    const auto second_coord = packed & 0x3fff;
    const auto lerp_weight  = (packed >> 14) & 0xf;

    *v0 = first_coord;
    *v1 = second_coord;
    *w  = lerp_weight;
}
43*c8dee2aaSAndroid Build Coastguard Worker
44*c8dee2aaSAndroid Build Coastguard Worker #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
45*c8dee2aaSAndroid Build Coastguard Worker
46*c8dee2aaSAndroid Build Coastguard Worker /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)47*c8dee2aaSAndroid Build Coastguard Worker void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
48*c8dee2aaSAndroid Build Coastguard Worker const uint32_t* xy, int count, uint32_t* colors) {
49*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(count > 0 && colors != nullptr);
50*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fBilerp);
51*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
52*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fAlphaScale <= 256);
53*c8dee2aaSAndroid Build Coastguard Worker
54*c8dee2aaSAndroid Build Coastguard Worker // interpolate_in_x() is the crux of the SSSE3 implementation,
55*c8dee2aaSAndroid Build Coastguard Worker // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
56*c8dee2aaSAndroid Build Coastguard Worker auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
57*c8dee2aaSAndroid Build Coastguard Worker uint32_t B0, uint32_t B1,
58*c8dee2aaSAndroid Build Coastguard Worker __m128i interlaced_x_weights) {
59*c8dee2aaSAndroid Build Coastguard Worker // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
60*c8dee2aaSAndroid Build Coastguard Worker //
61*c8dee2aaSAndroid Build Coastguard Worker // It takes two arguments interlaced byte-wise:
62*c8dee2aaSAndroid Build Coastguard Worker // - first arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
63*c8dee2aaSAndroid Build Coastguard Worker // - second arg: [ w,W, ... 7 more pairs of signed 8-bit values ...]
64*c8dee2aaSAndroid Build Coastguard Worker // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].
65*c8dee2aaSAndroid Build Coastguard Worker //
66*c8dee2aaSAndroid Build Coastguard Worker // That's why we go to all this trouble to make interlaced_x_weights,
67*c8dee2aaSAndroid Build Coastguard Worker // and here we're about to interlace A0 with A1 and B0 with B1 to match.
68*c8dee2aaSAndroid Build Coastguard Worker //
69*c8dee2aaSAndroid Build Coastguard Worker // Our interlaced_x_weights are all in [0,16], and so we need not worry about
70*c8dee2aaSAndroid Build Coastguard Worker // the signedness of that input nor about the signedness of the output.
71*c8dee2aaSAndroid Build Coastguard Worker
72*c8dee2aaSAndroid Build Coastguard Worker __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
73*c8dee2aaSAndroid Build Coastguard Worker interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));
74*c8dee2aaSAndroid Build Coastguard Worker
75*c8dee2aaSAndroid Build Coastguard Worker return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
76*c8dee2aaSAndroid Build Coastguard Worker interlaced_x_weights);
77*c8dee2aaSAndroid Build Coastguard Worker };
78*c8dee2aaSAndroid Build Coastguard Worker
79*c8dee2aaSAndroid Build Coastguard Worker // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
80*c8dee2aaSAndroid Build Coastguard Worker // Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
81*c8dee2aaSAndroid Build Coastguard Worker auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
82*c8dee2aaSAndroid Build Coastguard Worker uint32_t A2, uint32_t A3,
83*c8dee2aaSAndroid Build Coastguard Worker uint32_t B0, uint32_t B1,
84*c8dee2aaSAndroid Build Coastguard Worker uint32_t B2, uint32_t B3,
85*c8dee2aaSAndroid Build Coastguard Worker __m128i interlaced_x_weights,
86*c8dee2aaSAndroid Build Coastguard Worker int wy) {
87*c8dee2aaSAndroid Build Coastguard Worker // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
88*c8dee2aaSAndroid Build Coastguard Worker __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
89*c8dee2aaSAndroid Build Coastguard Worker bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
90*c8dee2aaSAndroid Build Coastguard Worker
91*c8dee2aaSAndroid Build Coastguard Worker // Interpolate in Y. As in the SSE2 code, we calculate top*(16-wy) + bot*wy
92*c8dee2aaSAndroid Build Coastguard Worker // as 16*top + (bot-top)*wy to save a multiply.
93*c8dee2aaSAndroid Build Coastguard Worker __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
94*c8dee2aaSAndroid Build Coastguard Worker _mm_mullo_epi16(_mm_sub_epi16(bot, top),
95*c8dee2aaSAndroid Build Coastguard Worker _mm_set1_epi16(wy)));
96*c8dee2aaSAndroid Build Coastguard Worker
97*c8dee2aaSAndroid Build Coastguard Worker // Scale down by total max weight 16x16 = 256.
98*c8dee2aaSAndroid Build Coastguard Worker px = _mm_srli_epi16(px, 8);
99*c8dee2aaSAndroid Build Coastguard Worker
100*c8dee2aaSAndroid Build Coastguard Worker // Scale by alpha if needed.
101*c8dee2aaSAndroid Build Coastguard Worker if (s.fAlphaScale < 256) {
102*c8dee2aaSAndroid Build Coastguard Worker px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
103*c8dee2aaSAndroid Build Coastguard Worker }
104*c8dee2aaSAndroid Build Coastguard Worker return px;
105*c8dee2aaSAndroid Build Coastguard Worker };
106*c8dee2aaSAndroid Build Coastguard Worker
107*c8dee2aaSAndroid Build Coastguard Worker // We're in _DX mode here, so we're only varying in X.
108*c8dee2aaSAndroid Build Coastguard Worker // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
109*c8dee2aaSAndroid Build Coastguard Worker // All the other entries in xy will be pairs of X coordinates and the X weight.
110*c8dee2aaSAndroid Build Coastguard Worker int y0, y1, wy;
111*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
112*c8dee2aaSAndroid Build Coastguard Worker
113*c8dee2aaSAndroid Build Coastguard Worker auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
114*c8dee2aaSAndroid Build Coastguard Worker row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
115*c8dee2aaSAndroid Build Coastguard Worker
116*c8dee2aaSAndroid Build Coastguard Worker while (count >= 4) {
117*c8dee2aaSAndroid Build Coastguard Worker // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
118*c8dee2aaSAndroid Build Coastguard Worker int x0[4],
119*c8dee2aaSAndroid Build Coastguard Worker x1[4];
120*c8dee2aaSAndroid Build Coastguard Worker __m128i wx;
121*c8dee2aaSAndroid Build Coastguard Worker
122*c8dee2aaSAndroid Build Coastguard Worker // decode_packed_coordinates_and_weight(), 4x.
123*c8dee2aaSAndroid Build Coastguard Worker __m128i packed = _mm_loadu_si128((const __m128i*)xy);
124*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
125*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
126*c8dee2aaSAndroid Build Coastguard Worker wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf)); // [0,15]
127*c8dee2aaSAndroid Build Coastguard Worker
128*c8dee2aaSAndroid Build Coastguard Worker // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
129*c8dee2aaSAndroid Build Coastguard Worker // and sixteen minus that as wl for pixels on the left at x0.
130*c8dee2aaSAndroid Build Coastguard Worker __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
131*c8dee2aaSAndroid Build Coastguard Worker wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
132*c8dee2aaSAndroid Build Coastguard Worker
133*c8dee2aaSAndroid Build Coastguard Worker // We need to interlace wl and wr for _mm_maddubs_epi16().
134*c8dee2aaSAndroid Build Coastguard Worker __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
135*c8dee2aaSAndroid Build Coastguard Worker interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);
136*c8dee2aaSAndroid Build Coastguard Worker
137*c8dee2aaSAndroid Build Coastguard Worker enum { A,B,C,D };
138*c8dee2aaSAndroid Build Coastguard Worker
139*c8dee2aaSAndroid Build Coastguard Worker // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
140*c8dee2aaSAndroid Build Coastguard Worker // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
141*c8dee2aaSAndroid Build Coastguard Worker __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
142*c8dee2aaSAndroid Build Coastguard Worker row1[x0[A]], row1[x1[A]],
143*c8dee2aaSAndroid Build Coastguard Worker row0[x0[B]], row0[x1[B]],
144*c8dee2aaSAndroid Build Coastguard Worker row1[x0[B]], row1[x1[B]],
145*c8dee2aaSAndroid Build Coastguard Worker interlaced_x_weights_AB, wy);
146*c8dee2aaSAndroid Build Coastguard Worker
147*c8dee2aaSAndroid Build Coastguard Worker // Once more with the other half of the x-weights for two more pixels C,D.
148*c8dee2aaSAndroid Build Coastguard Worker __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
149*c8dee2aaSAndroid Build Coastguard Worker row1[x0[C]], row1[x1[C]],
150*c8dee2aaSAndroid Build Coastguard Worker row0[x0[D]], row0[x1[D]],
151*c8dee2aaSAndroid Build Coastguard Worker row1[x0[D]], row1[x1[D]],
152*c8dee2aaSAndroid Build Coastguard Worker interlaced_x_weights_CD, wy);
153*c8dee2aaSAndroid Build Coastguard Worker
154*c8dee2aaSAndroid Build Coastguard Worker // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
155*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
156*c8dee2aaSAndroid Build Coastguard Worker xy += 4;
157*c8dee2aaSAndroid Build Coastguard Worker colors += 4;
158*c8dee2aaSAndroid Build Coastguard Worker count -= 4;
159*c8dee2aaSAndroid Build Coastguard Worker }
160*c8dee2aaSAndroid Build Coastguard Worker
161*c8dee2aaSAndroid Build Coastguard Worker while (count --> 0) {
162*c8dee2aaSAndroid Build Coastguard Worker // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
163*c8dee2aaSAndroid Build Coastguard Worker int x0, x1, wx;
164*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
165*c8dee2aaSAndroid Build Coastguard Worker
166*c8dee2aaSAndroid Build Coastguard Worker // As above, splat out wx four times as wr, and sixteen minus that as wl.
167*c8dee2aaSAndroid Build Coastguard Worker __m128i wr = _mm_set1_epi8(wx), // This splats it out 16 times, but that's fine.
168*c8dee2aaSAndroid Build Coastguard Worker wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
169*c8dee2aaSAndroid Build Coastguard Worker
170*c8dee2aaSAndroid Build Coastguard Worker __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);
171*c8dee2aaSAndroid Build Coastguard Worker
172*c8dee2aaSAndroid Build Coastguard Worker __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
173*c8dee2aaSAndroid Build Coastguard Worker row1[x0], row1[x1],
174*c8dee2aaSAndroid Build Coastguard Worker 0, 0,
175*c8dee2aaSAndroid Build Coastguard Worker 0, 0,
176*c8dee2aaSAndroid Build Coastguard Worker interlaced_x_weights, wy);
177*c8dee2aaSAndroid Build Coastguard Worker
178*c8dee2aaSAndroid Build Coastguard Worker *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
179*c8dee2aaSAndroid Build Coastguard Worker }
180*c8dee2aaSAndroid Build Coastguard Worker }
181*c8dee2aaSAndroid Build Coastguard Worker
182*c8dee2aaSAndroid Build Coastguard Worker
183*c8dee2aaSAndroid Build Coastguard Worker #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
184*c8dee2aaSAndroid Build Coastguard Worker
185*c8dee2aaSAndroid Build Coastguard Worker /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)186*c8dee2aaSAndroid Build Coastguard Worker void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
187*c8dee2aaSAndroid Build Coastguard Worker const uint32_t* xy, int count, uint32_t* colors) {
188*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(count > 0 && colors != nullptr);
189*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fBilerp);
190*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
191*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fAlphaScale <= 256);
192*c8dee2aaSAndroid Build Coastguard Worker
193*c8dee2aaSAndroid Build Coastguard Worker int y0, y1, wy;
194*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
195*c8dee2aaSAndroid Build Coastguard Worker
196*c8dee2aaSAndroid Build Coastguard Worker auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
197*c8dee2aaSAndroid Build Coastguard Worker row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
198*c8dee2aaSAndroid Build Coastguard Worker
199*c8dee2aaSAndroid Build Coastguard Worker // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
200*c8dee2aaSAndroid Build Coastguard Worker // and another in the upper 4 16-bit lanes to line up with 16 - wy.
201*c8dee2aaSAndroid Build Coastguard Worker const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy), // Bottom pixel goes here.
202*c8dee2aaSAndroid Build Coastguard Worker _mm_set1_epi16(16-wy)); // Top pixel goes here.
203*c8dee2aaSAndroid Build Coastguard Worker
204*c8dee2aaSAndroid Build Coastguard Worker while (count --> 0) {
205*c8dee2aaSAndroid Build Coastguard Worker int x0, x1, wx;
206*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
207*c8dee2aaSAndroid Build Coastguard Worker
208*c8dee2aaSAndroid Build Coastguard Worker // Load the 4 pixels we're interpolating, in this grid:
209*c8dee2aaSAndroid Build Coastguard Worker // | tl tr |
210*c8dee2aaSAndroid Build Coastguard Worker // | bl br |
211*c8dee2aaSAndroid Build Coastguard Worker const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
212*c8dee2aaSAndroid Build Coastguard Worker bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);
213*c8dee2aaSAndroid Build Coastguard Worker
214*c8dee2aaSAndroid Build Coastguard Worker // We want to calculate a sum of 4 pixels weighted in two directions:
215*c8dee2aaSAndroid Build Coastguard Worker //
216*c8dee2aaSAndroid Build Coastguard Worker // sum = tl * (16-wy) * (16-wx)
217*c8dee2aaSAndroid Build Coastguard Worker // + bl * ( wy) * (16-wx)
218*c8dee2aaSAndroid Build Coastguard Worker // + tr * (16-wy) * ( wx)
219*c8dee2aaSAndroid Build Coastguard Worker // + br * ( wy) * ( wx)
220*c8dee2aaSAndroid Build Coastguard Worker //
221*c8dee2aaSAndroid Build Coastguard Worker // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
222*c8dee2aaSAndroid Build Coastguard Worker //
223*c8dee2aaSAndroid Build Coastguard Worker // We've already prepared allY as a vector containing [wy, 16-wy] as a way
224*c8dee2aaSAndroid Build Coastguard Worker // to apply those y-direction weights. So we'll start on the x-direction
225*c8dee2aaSAndroid Build Coastguard Worker // first, grouping into left and right halves, lined up with allY:
226*c8dee2aaSAndroid Build Coastguard Worker //
227*c8dee2aaSAndroid Build Coastguard Worker // L = [bl, tl]
228*c8dee2aaSAndroid Build Coastguard Worker // R = [br, tr]
229*c8dee2aaSAndroid Build Coastguard Worker //
230*c8dee2aaSAndroid Build Coastguard Worker // sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
231*c8dee2aaSAndroid Build Coastguard Worker //
232*c8dee2aaSAndroid Build Coastguard Worker // Rewriting that one more step, we can replace a multiply with a shift:
233*c8dee2aaSAndroid Build Coastguard Worker //
234*c8dee2aaSAndroid Build Coastguard Worker // sum = horizontalSum( allY * (16*L + (R-L)*wx) )
235*c8dee2aaSAndroid Build Coastguard Worker //
236*c8dee2aaSAndroid Build Coastguard Worker // That's how we'll actually do this math.
237*c8dee2aaSAndroid Build Coastguard Worker
238*c8dee2aaSAndroid Build Coastguard Worker __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
239*c8dee2aaSAndroid Build Coastguard Worker R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());
240*c8dee2aaSAndroid Build Coastguard Worker
241*c8dee2aaSAndroid Build Coastguard Worker __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
242*c8dee2aaSAndroid Build Coastguard Worker _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));
243*c8dee2aaSAndroid Build Coastguard Worker
244*c8dee2aaSAndroid Build Coastguard Worker __m128i sum_in_x = _mm_mullo_epi16(inner, allY);
245*c8dee2aaSAndroid Build Coastguard Worker
246*c8dee2aaSAndroid Build Coastguard Worker // sum = horizontalSum( ... )
247*c8dee2aaSAndroid Build Coastguard Worker __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
248*c8dee2aaSAndroid Build Coastguard Worker
249*c8dee2aaSAndroid Build Coastguard Worker // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
250*c8dee2aaSAndroid Build Coastguard Worker sum = _mm_srli_epi16(sum, 8);
251*c8dee2aaSAndroid Build Coastguard Worker
252*c8dee2aaSAndroid Build Coastguard Worker if (s.fAlphaScale < 256) {
253*c8dee2aaSAndroid Build Coastguard Worker // Scale by alpha, which is in [0,256].
254*c8dee2aaSAndroid Build Coastguard Worker sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
255*c8dee2aaSAndroid Build Coastguard Worker sum = _mm_srli_epi16(sum, 8);
256*c8dee2aaSAndroid Build Coastguard Worker }
257*c8dee2aaSAndroid Build Coastguard Worker
258*c8dee2aaSAndroid Build Coastguard Worker // Pack back into 8-bit values and store.
259*c8dee2aaSAndroid Build Coastguard Worker *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
260*c8dee2aaSAndroid Build Coastguard Worker }
261*c8dee2aaSAndroid Build Coastguard Worker }
262*c8dee2aaSAndroid Build Coastguard Worker
263*c8dee2aaSAndroid Build Coastguard Worker #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
264*c8dee2aaSAndroid Build Coastguard Worker /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)265*c8dee2aaSAndroid Build Coastguard Worker void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
266*c8dee2aaSAndroid Build Coastguard Worker const uint32_t* xy, int count, uint32_t* colors) {
267*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(count > 0 && colors != nullptr);
268*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fBilerp);
269*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
270*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fAlphaScale <= 256);
271*c8dee2aaSAndroid Build Coastguard Worker
272*c8dee2aaSAndroid Build Coastguard Worker int y0, y1, wy;
273*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
274*c8dee2aaSAndroid Build Coastguard Worker
275*c8dee2aaSAndroid Build Coastguard Worker auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
276*c8dee2aaSAndroid Build Coastguard Worker row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
277*c8dee2aaSAndroid Build Coastguard Worker
278*c8dee2aaSAndroid Build Coastguard Worker // We'll put one pixel in the low 16 16-bit lanes to line up with wy,
279*c8dee2aaSAndroid Build Coastguard Worker // and another in the upper 16 16-bit lanes to line up with 16 - wy.
280*c8dee2aaSAndroid Build Coastguard Worker __m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));
281*c8dee2aaSAndroid Build Coastguard Worker
282*c8dee2aaSAndroid Build Coastguard Worker while (count --> 0) {
283*c8dee2aaSAndroid Build Coastguard Worker int x0, x1, wx;
284*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
285*c8dee2aaSAndroid Build Coastguard Worker
286*c8dee2aaSAndroid Build Coastguard Worker // Load the 4 pixels we're interpolating, in this grid:
287*c8dee2aaSAndroid Build Coastguard Worker // | tl tr |
288*c8dee2aaSAndroid Build Coastguard Worker // | bl br |
289*c8dee2aaSAndroid Build Coastguard Worker
290*c8dee2aaSAndroid Build Coastguard Worker const __m256i zeros = __lasx_xvldi(0);
291*c8dee2aaSAndroid Build Coastguard Worker const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0),
292*c8dee2aaSAndroid Build Coastguard Worker tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0),
293*c8dee2aaSAndroid Build Coastguard Worker bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0),
294*c8dee2aaSAndroid Build Coastguard Worker br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0);
295*c8dee2aaSAndroid Build Coastguard Worker
296*c8dee2aaSAndroid Build Coastguard Worker // We want to calculate a sum of 8 pixels weighted in two directions:
297*c8dee2aaSAndroid Build Coastguard Worker //
298*c8dee2aaSAndroid Build Coastguard Worker // sum = tl * (16-wy) * (16-wx)
299*c8dee2aaSAndroid Build Coastguard Worker // + bl * ( wy) * (16-wx)
300*c8dee2aaSAndroid Build Coastguard Worker // + tr * (16-wy) * ( wx)
301*c8dee2aaSAndroid Build Coastguard Worker // + br * ( wy) * ( wx)
302*c8dee2aaSAndroid Build Coastguard Worker //
303*c8dee2aaSAndroid Build Coastguard Worker // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
304*c8dee2aaSAndroid Build Coastguard Worker //
305*c8dee2aaSAndroid Build Coastguard Worker // We've already prepared allY as a vector containing [wy, 16-wy] as a way
306*c8dee2aaSAndroid Build Coastguard Worker // to apply those y-direction weights. So we'll start on the x-direction
307*c8dee2aaSAndroid Build Coastguard Worker // first, grouping into left and right halves, lined up with allY:
308*c8dee2aaSAndroid Build Coastguard Worker //
309*c8dee2aaSAndroid Build Coastguard Worker // L = [bl, tl]
310*c8dee2aaSAndroid Build Coastguard Worker // R = [br, tr]
311*c8dee2aaSAndroid Build Coastguard Worker //
312*c8dee2aaSAndroid Build Coastguard Worker // sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
313*c8dee2aaSAndroid Build Coastguard Worker //
314*c8dee2aaSAndroid Build Coastguard Worker // Rewriting that one more step, we can replace a multiply with a shift:
315*c8dee2aaSAndroid Build Coastguard Worker //
316*c8dee2aaSAndroid Build Coastguard Worker // sum = horizontalSum( allY * (16*L + (R-L)*wx) )
317*c8dee2aaSAndroid Build Coastguard Worker //
318*c8dee2aaSAndroid Build Coastguard Worker // That's how we'll actually do this math.
319*c8dee2aaSAndroid Build Coastguard Worker
320*c8dee2aaSAndroid Build Coastguard Worker __m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
321*c8dee2aaSAndroid Build Coastguard Worker R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));
322*c8dee2aaSAndroid Build Coastguard Worker
323*c8dee2aaSAndroid Build Coastguard Worker __m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4),
324*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvmul_h(__lasx_xvsub_h(R,L),
325*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvreplgr2vr_h(wx)));
326*c8dee2aaSAndroid Build Coastguard Worker
327*c8dee2aaSAndroid Build Coastguard Worker __m256i sum_in_x = __lasx_xvmul_h(inner, allY);
328*c8dee2aaSAndroid Build Coastguard Worker
329*c8dee2aaSAndroid Build Coastguard Worker // sum = horizontalSum( ... )
330*c8dee2aaSAndroid Build Coastguard Worker __m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));
331*c8dee2aaSAndroid Build Coastguard Worker
332*c8dee2aaSAndroid Build Coastguard Worker // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
333*c8dee2aaSAndroid Build Coastguard Worker sum = __lasx_xvsrli_h(sum, 8);
334*c8dee2aaSAndroid Build Coastguard Worker
335*c8dee2aaSAndroid Build Coastguard Worker if (s.fAlphaScale < 256) {
336*c8dee2aaSAndroid Build Coastguard Worker // Scale by alpha, which is in [0,256].
337*c8dee2aaSAndroid Build Coastguard Worker sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
338*c8dee2aaSAndroid Build Coastguard Worker sum = __lasx_xvsrli_h(sum, 8);
339*c8dee2aaSAndroid Build Coastguard Worker }
340*c8dee2aaSAndroid Build Coastguard Worker
341*c8dee2aaSAndroid Build Coastguard Worker // Pack back into 8-bit values and store.
342*c8dee2aaSAndroid Build Coastguard Worker *colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
343*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvsat_hu(sum, 8)), 0);
344*c8dee2aaSAndroid Build Coastguard Worker }
345*c8dee2aaSAndroid Build Coastguard Worker }
346*c8dee2aaSAndroid Build Coastguard Worker
347*c8dee2aaSAndroid Build Coastguard Worker #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
348*c8dee2aaSAndroid Build Coastguard Worker
349*c8dee2aaSAndroid Build Coastguard Worker /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)350*c8dee2aaSAndroid Build Coastguard Worker void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
351*c8dee2aaSAndroid Build Coastguard Worker const uint32_t* xy, int count, uint32_t* colors) {
352*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(count > 0 && colors != nullptr);
353*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fBilerp);
354*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
355*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fAlphaScale <= 256);
356*c8dee2aaSAndroid Build Coastguard Worker
357*c8dee2aaSAndroid Build Coastguard Worker int y0, y1, wy;
358*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
359*c8dee2aaSAndroid Build Coastguard Worker
360*c8dee2aaSAndroid Build Coastguard Worker auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
361*c8dee2aaSAndroid Build Coastguard Worker row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
362*c8dee2aaSAndroid Build Coastguard Worker
363*c8dee2aaSAndroid Build Coastguard Worker // We'll put one pixel in the low 8 16-bit lanes to line up with wy,
364*c8dee2aaSAndroid Build Coastguard Worker // and another in the upper 8 16-bit lanes to line up with 16 - wy.
365*c8dee2aaSAndroid Build Coastguard Worker __m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));
366*c8dee2aaSAndroid Build Coastguard Worker
367*c8dee2aaSAndroid Build Coastguard Worker while (count --> 0) {
368*c8dee2aaSAndroid Build Coastguard Worker int x0, x1, wx;
369*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
370*c8dee2aaSAndroid Build Coastguard Worker
371*c8dee2aaSAndroid Build Coastguard Worker // Load the 4 pixels we're interpolating, in this grid:
372*c8dee2aaSAndroid Build Coastguard Worker // | tl tr |
373*c8dee2aaSAndroid Build Coastguard Worker // | bl br |
374*c8dee2aaSAndroid Build Coastguard Worker const __m128i zeros = __lsx_vldi(0);
375*c8dee2aaSAndroid Build Coastguard Worker const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0),
376*c8dee2aaSAndroid Build Coastguard Worker tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0),
377*c8dee2aaSAndroid Build Coastguard Worker bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0),
378*c8dee2aaSAndroid Build Coastguard Worker br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0);
379*c8dee2aaSAndroid Build Coastguard Worker
380*c8dee2aaSAndroid Build Coastguard Worker // We want to calculate a sum of 8 pixels weighted in two directions:
381*c8dee2aaSAndroid Build Coastguard Worker //
382*c8dee2aaSAndroid Build Coastguard Worker // sum = tl * (16-wy) * (16-wx)
383*c8dee2aaSAndroid Build Coastguard Worker // + bl * ( wy) * (16-wx)
384*c8dee2aaSAndroid Build Coastguard Worker // + tr * (16-wy) * ( wx)
385*c8dee2aaSAndroid Build Coastguard Worker // + br * ( wy) * ( wx)
386*c8dee2aaSAndroid Build Coastguard Worker //
387*c8dee2aaSAndroid Build Coastguard Worker // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
388*c8dee2aaSAndroid Build Coastguard Worker //
389*c8dee2aaSAndroid Build Coastguard Worker // We've already prepared allY as a vector containing [wy, 16-wy] as a way
390*c8dee2aaSAndroid Build Coastguard Worker // to apply those y-direction weights. So we'll start on the x-direction
391*c8dee2aaSAndroid Build Coastguard Worker // first, grouping into left and right halves, lined up with allY:
392*c8dee2aaSAndroid Build Coastguard Worker //
393*c8dee2aaSAndroid Build Coastguard Worker // L = [bl, tl]
394*c8dee2aaSAndroid Build Coastguard Worker // R = [br, tr]
395*c8dee2aaSAndroid Build Coastguard Worker //
396*c8dee2aaSAndroid Build Coastguard Worker // sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
397*c8dee2aaSAndroid Build Coastguard Worker //
398*c8dee2aaSAndroid Build Coastguard Worker // Rewriting that one more step, we can replace a multiply with a shift:
399*c8dee2aaSAndroid Build Coastguard Worker //
400*c8dee2aaSAndroid Build Coastguard Worker // sum = horizontalSum( allY * (16*L + (R-L)*wx) )
401*c8dee2aaSAndroid Build Coastguard Worker //
402*c8dee2aaSAndroid Build Coastguard Worker // That's how we'll actually do this math.
403*c8dee2aaSAndroid Build Coastguard Worker
404*c8dee2aaSAndroid Build Coastguard Worker
405*c8dee2aaSAndroid Build Coastguard Worker __m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
406*c8dee2aaSAndroid Build Coastguard Worker R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));
407*c8dee2aaSAndroid Build Coastguard Worker
408*c8dee2aaSAndroid Build Coastguard Worker __m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4),
409*c8dee2aaSAndroid Build Coastguard Worker __lsx_vmul_h(__lsx_vsub_h(R,L),
410*c8dee2aaSAndroid Build Coastguard Worker __lsx_vreplgr2vr_h(wx)));
411*c8dee2aaSAndroid Build Coastguard Worker
412*c8dee2aaSAndroid Build Coastguard Worker __m128i sum_in_x = __lsx_vmul_h(inner, allY);
413*c8dee2aaSAndroid Build Coastguard Worker
414*c8dee2aaSAndroid Build Coastguard Worker // sum = horizontalSum( ... )
415*c8dee2aaSAndroid Build Coastguard Worker __m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));
416*c8dee2aaSAndroid Build Coastguard Worker
417*c8dee2aaSAndroid Build Coastguard Worker // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
418*c8dee2aaSAndroid Build Coastguard Worker sum = __lsx_vsrli_h(sum, 8);
419*c8dee2aaSAndroid Build Coastguard Worker
420*c8dee2aaSAndroid Build Coastguard Worker if (s.fAlphaScale < 256) {
421*c8dee2aaSAndroid Build Coastguard Worker // Scale by alpha, which is in [0,256].
422*c8dee2aaSAndroid Build Coastguard Worker sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
423*c8dee2aaSAndroid Build Coastguard Worker sum = __lsx_vsrli_h(sum, 8);
424*c8dee2aaSAndroid Build Coastguard Worker }
425*c8dee2aaSAndroid Build Coastguard Worker
426*c8dee2aaSAndroid Build Coastguard Worker // Pack back into 8-bit values and store.
427*c8dee2aaSAndroid Build Coastguard Worker *colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
428*c8dee2aaSAndroid Build Coastguard Worker __lsx_vsat_hu(sum, 8)), 0);
429*c8dee2aaSAndroid Build Coastguard Worker }
430*c8dee2aaSAndroid Build Coastguard Worker }
431*c8dee2aaSAndroid Build Coastguard Worker
432*c8dee2aaSAndroid Build Coastguard Worker #else
433*c8dee2aaSAndroid Build Coastguard Worker
434*c8dee2aaSAndroid Build Coastguard Worker // The NEON code only actually differs from the portable code in the
435*c8dee2aaSAndroid Build Coastguard Worker // filtering step after we've loaded all four pixels we want to bilerp.
436*c8dee2aaSAndroid Build Coastguard Worker
437*c8dee2aaSAndroid Build Coastguard Worker #if defined(SK_ARM_HAS_NEON)
filter_and_scale_by_alpha(unsigned x,unsigned y,SkPMColor a00,SkPMColor a01,SkPMColor a10,SkPMColor a11,SkPMColor * dst,uint16_t scale)438*c8dee2aaSAndroid Build Coastguard Worker static void filter_and_scale_by_alpha(unsigned x, unsigned y,
439*c8dee2aaSAndroid Build Coastguard Worker SkPMColor a00, SkPMColor a01,
440*c8dee2aaSAndroid Build Coastguard Worker SkPMColor a10, SkPMColor a11,
441*c8dee2aaSAndroid Build Coastguard Worker SkPMColor *dst,
442*c8dee2aaSAndroid Build Coastguard Worker uint16_t scale) {
443*c8dee2aaSAndroid Build Coastguard Worker uint8x8_t vy, vconst16_8, v16_y, vres;
444*c8dee2aaSAndroid Build Coastguard Worker uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
445*c8dee2aaSAndroid Build Coastguard Worker uint32x2_t va0, va1;
446*c8dee2aaSAndroid Build Coastguard Worker uint16x8_t tmp1, tmp2;
447*c8dee2aaSAndroid Build Coastguard Worker
448*c8dee2aaSAndroid Build Coastguard Worker vy = vdup_n_u8(y); // duplicate y into vy
449*c8dee2aaSAndroid Build Coastguard Worker vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8
450*c8dee2aaSAndroid Build Coastguard Worker v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y
451*c8dee2aaSAndroid Build Coastguard Worker
452*c8dee2aaSAndroid Build Coastguard Worker va0 = vdup_n_u32(a00); // duplicate a00
453*c8dee2aaSAndroid Build Coastguard Worker va1 = vdup_n_u32(a10); // duplicate a10
454*c8dee2aaSAndroid Build Coastguard Worker va0 = vset_lane_u32(a01, va0, 1); // set top to a01
455*c8dee2aaSAndroid Build Coastguard Worker va1 = vset_lane_u32(a11, va1, 1); // set top to a11
456*c8dee2aaSAndroid Build Coastguard Worker
457*c8dee2aaSAndroid Build Coastguard Worker tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
458*c8dee2aaSAndroid Build Coastguard Worker tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y
459*c8dee2aaSAndroid Build Coastguard Worker
460*c8dee2aaSAndroid Build Coastguard Worker vx = vdup_n_u16(x); // duplicate x into vx
461*c8dee2aaSAndroid Build Coastguard Worker vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16
462*c8dee2aaSAndroid Build Coastguard Worker v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
463*c8dee2aaSAndroid Build Coastguard Worker
464*c8dee2aaSAndroid Build Coastguard Worker tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x
465*c8dee2aaSAndroid Build Coastguard Worker tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x
466*c8dee2aaSAndroid Build Coastguard Worker tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
467*c8dee2aaSAndroid Build Coastguard Worker tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
468*c8dee2aaSAndroid Build Coastguard Worker
469*c8dee2aaSAndroid Build Coastguard Worker if (scale < 256) {
470*c8dee2aaSAndroid Build Coastguard Worker vscale = vdup_n_u16(scale); // duplicate scale
471*c8dee2aaSAndroid Build Coastguard Worker tmp = vshr_n_u16(tmp, 8); // shift down result by 8
472*c8dee2aaSAndroid Build Coastguard Worker tmp = vmul_u16(tmp, vscale); // multiply result by scale
473*c8dee2aaSAndroid Build Coastguard Worker }
474*c8dee2aaSAndroid Build Coastguard Worker
475*c8dee2aaSAndroid Build Coastguard Worker vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
476*c8dee2aaSAndroid Build Coastguard Worker vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result
477*c8dee2aaSAndroid Build Coastguard Worker }
478*c8dee2aaSAndroid Build Coastguard Worker #else
filter_and_scale_by_alpha(unsigned x,unsigned y,SkPMColor a00,SkPMColor a01,SkPMColor a10,SkPMColor a11,SkPMColor * dstColor,unsigned alphaScale)479*c8dee2aaSAndroid Build Coastguard Worker static void filter_and_scale_by_alpha(unsigned x, unsigned y,
480*c8dee2aaSAndroid Build Coastguard Worker SkPMColor a00, SkPMColor a01,
481*c8dee2aaSAndroid Build Coastguard Worker SkPMColor a10, SkPMColor a11,
482*c8dee2aaSAndroid Build Coastguard Worker SkPMColor* dstColor,
483*c8dee2aaSAndroid Build Coastguard Worker unsigned alphaScale) {
484*c8dee2aaSAndroid Build Coastguard Worker SkASSERT((unsigned)x <= 0xF);
485*c8dee2aaSAndroid Build Coastguard Worker SkASSERT((unsigned)y <= 0xF);
486*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(alphaScale <= 256);
487*c8dee2aaSAndroid Build Coastguard Worker
488*c8dee2aaSAndroid Build Coastguard Worker int xy = x * y;
489*c8dee2aaSAndroid Build Coastguard Worker const uint32_t mask = 0xFF00FF;
490*c8dee2aaSAndroid Build Coastguard Worker
491*c8dee2aaSAndroid Build Coastguard Worker int scale = 256 - 16*y - 16*x + xy;
492*c8dee2aaSAndroid Build Coastguard Worker uint32_t lo = (a00 & mask) * scale;
493*c8dee2aaSAndroid Build Coastguard Worker uint32_t hi = ((a00 >> 8) & mask) * scale;
494*c8dee2aaSAndroid Build Coastguard Worker
495*c8dee2aaSAndroid Build Coastguard Worker scale = 16*x - xy;
496*c8dee2aaSAndroid Build Coastguard Worker lo += (a01 & mask) * scale;
497*c8dee2aaSAndroid Build Coastguard Worker hi += ((a01 >> 8) & mask) * scale;
498*c8dee2aaSAndroid Build Coastguard Worker
499*c8dee2aaSAndroid Build Coastguard Worker scale = 16*y - xy;
500*c8dee2aaSAndroid Build Coastguard Worker lo += (a10 & mask) * scale;
501*c8dee2aaSAndroid Build Coastguard Worker hi += ((a10 >> 8) & mask) * scale;
502*c8dee2aaSAndroid Build Coastguard Worker
503*c8dee2aaSAndroid Build Coastguard Worker lo += (a11 & mask) * xy;
504*c8dee2aaSAndroid Build Coastguard Worker hi += ((a11 >> 8) & mask) * xy;
505*c8dee2aaSAndroid Build Coastguard Worker
506*c8dee2aaSAndroid Build Coastguard Worker if (alphaScale < 256) {
507*c8dee2aaSAndroid Build Coastguard Worker lo = ((lo >> 8) & mask) * alphaScale;
508*c8dee2aaSAndroid Build Coastguard Worker hi = ((hi >> 8) & mask) * alphaScale;
509*c8dee2aaSAndroid Build Coastguard Worker }
510*c8dee2aaSAndroid Build Coastguard Worker
511*c8dee2aaSAndroid Build Coastguard Worker *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
512*c8dee2aaSAndroid Build Coastguard Worker }
513*c8dee2aaSAndroid Build Coastguard Worker #endif
514*c8dee2aaSAndroid Build Coastguard Worker
515*c8dee2aaSAndroid Build Coastguard Worker
516*c8dee2aaSAndroid Build Coastguard Worker /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,SkPMColor * colors)517*c8dee2aaSAndroid Build Coastguard Worker void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
518*c8dee2aaSAndroid Build Coastguard Worker const uint32_t* xy, int count, SkPMColor* colors) {
519*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(count > 0 && colors != nullptr);
520*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fBilerp);
521*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
522*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fAlphaScale <= 256);
523*c8dee2aaSAndroid Build Coastguard Worker
524*c8dee2aaSAndroid Build Coastguard Worker int y0, y1, wy;
525*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
526*c8dee2aaSAndroid Build Coastguard Worker
527*c8dee2aaSAndroid Build Coastguard Worker auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
528*c8dee2aaSAndroid Build Coastguard Worker row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
529*c8dee2aaSAndroid Build Coastguard Worker
530*c8dee2aaSAndroid Build Coastguard Worker while (count --> 0) {
531*c8dee2aaSAndroid Build Coastguard Worker int x0, x1, wx;
532*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
533*c8dee2aaSAndroid Build Coastguard Worker
534*c8dee2aaSAndroid Build Coastguard Worker filter_and_scale_by_alpha(wx, wy,
535*c8dee2aaSAndroid Build Coastguard Worker row0[x0], row0[x1],
536*c8dee2aaSAndroid Build Coastguard Worker row1[x0], row1[x1],
537*c8dee2aaSAndroid Build Coastguard Worker colors++,
538*c8dee2aaSAndroid Build Coastguard Worker s.fAlphaScale);
539*c8dee2aaSAndroid Build Coastguard Worker }
540*c8dee2aaSAndroid Build Coastguard Worker }
541*c8dee2aaSAndroid Build Coastguard Worker
542*c8dee2aaSAndroid Build Coastguard Worker #endif
543*c8dee2aaSAndroid Build Coastguard Worker
544*c8dee2aaSAndroid Build Coastguard Worker #if defined(SK_ARM_HAS_NEON)
545*c8dee2aaSAndroid Build Coastguard Worker /*not static*/ inline
S32_alpha_D32_filter_DXDY(const SkBitmapProcState & s,const uint32_t * xy,int count,SkPMColor * colors)546*c8dee2aaSAndroid Build Coastguard Worker void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
547*c8dee2aaSAndroid Build Coastguard Worker const uint32_t* xy, int count, SkPMColor* colors) {
548*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(count > 0 && colors != nullptr);
549*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fBilerp);
550*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
551*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(s.fAlphaScale <= 256);
552*c8dee2aaSAndroid Build Coastguard Worker
553*c8dee2aaSAndroid Build Coastguard Worker auto src = (const char*)s.fPixmap.addr();
554*c8dee2aaSAndroid Build Coastguard Worker size_t rb = s.fPixmap.rowBytes();
555*c8dee2aaSAndroid Build Coastguard Worker
556*c8dee2aaSAndroid Build Coastguard Worker while (count --> 0) {
557*c8dee2aaSAndroid Build Coastguard Worker int y0, y1, wy,
558*c8dee2aaSAndroid Build Coastguard Worker x0, x1, wx;
559*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
560*c8dee2aaSAndroid Build Coastguard Worker decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
561*c8dee2aaSAndroid Build Coastguard Worker
562*c8dee2aaSAndroid Build Coastguard Worker auto row0 = (const uint32_t*)(src + y0*rb),
563*c8dee2aaSAndroid Build Coastguard Worker row1 = (const uint32_t*)(src + y1*rb);
564*c8dee2aaSAndroid Build Coastguard Worker
565*c8dee2aaSAndroid Build Coastguard Worker filter_and_scale_by_alpha(wx, wy,
566*c8dee2aaSAndroid Build Coastguard Worker row0[x0], row0[x1],
567*c8dee2aaSAndroid Build Coastguard Worker row1[x0], row1[x1],
568*c8dee2aaSAndroid Build Coastguard Worker colors++,
569*c8dee2aaSAndroid Build Coastguard Worker s.fAlphaScale);
570*c8dee2aaSAndroid Build Coastguard Worker }
571*c8dee2aaSAndroid Build Coastguard Worker }
572*c8dee2aaSAndroid Build Coastguard Worker #else
// Without NEON there is no specialized DXDY proc: the name is still defined, with the
// same signature as the NEON function, but as a null function pointer.
// NOTE(review): presumably users of this name check for null and fall back to a
// generic path -- confirm at the call sites.
// It's not yet clear whether it's worthwhile specializing for other architectures.
constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
                                                   const uint32_t*, int, SkPMColor*) = nullptr;
576*c8dee2aaSAndroid Build Coastguard Worker #endif
577*c8dee2aaSAndroid Build Coastguard Worker
578*c8dee2aaSAndroid Build Coastguard Worker } // namespace SK_OPTS_NS
579*c8dee2aaSAndroid Build Coastguard Worker
580*c8dee2aaSAndroid Build Coastguard Worker namespace sktests {
581*c8dee2aaSAndroid Build Coastguard Worker template <typename U32, typename Out>
decode_packed_coordinates_and_weight(U32 packed,Out * v0,Out * v1,Out * w)582*c8dee2aaSAndroid Build Coastguard Worker void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
583*c8dee2aaSAndroid Build Coastguard Worker SK_OPTS_NS::decode_packed_coordinates_and_weight<U32, Out>(packed, v0, v1, w);
584*c8dee2aaSAndroid Build Coastguard Worker }
585*c8dee2aaSAndroid Build Coastguard Worker }
586*c8dee2aaSAndroid Build Coastguard Worker
587*c8dee2aaSAndroid Build Coastguard Worker #endif
588