xref: /aosp_15_r20/external/skia/src/opts/SkSwizzler_opts.inc (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1*c8dee2aaSAndroid Build Coastguard Worker/*
2*c8dee2aaSAndroid Build Coastguard Worker * Copyright 2016 Google Inc.
3*c8dee2aaSAndroid Build Coastguard Worker *
4*c8dee2aaSAndroid Build Coastguard Worker * Use of this source code is governed by a BSD-style license that can be
5*c8dee2aaSAndroid Build Coastguard Worker * found in the LICENSE file.
6*c8dee2aaSAndroid Build Coastguard Worker */
7*c8dee2aaSAndroid Build Coastguard Worker
8*c8dee2aaSAndroid Build Coastguard Worker#include "include/private/SkColorData.h"
9*c8dee2aaSAndroid Build Coastguard Worker#include "src/base/SkUtils.h"
10*c8dee2aaSAndroid Build Coastguard Worker#include "src/base/SkVx.h"
11*c8dee2aaSAndroid Build Coastguard Worker#include "src/core/SkSwizzlePriv.h"
12*c8dee2aaSAndroid Build Coastguard Worker
13*c8dee2aaSAndroid Build Coastguard Worker#include <algorithm>
14*c8dee2aaSAndroid Build Coastguard Worker#include <cmath>
15*c8dee2aaSAndroid Build Coastguard Worker#include <utility>
16*c8dee2aaSAndroid Build Coastguard Worker
17*c8dee2aaSAndroid Build Coastguard Worker#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
18*c8dee2aaSAndroid Build Coastguard Worker    #include <immintrin.h>
19*c8dee2aaSAndroid Build Coastguard Worker#elif defined(SK_ARM_HAS_NEON)
20*c8dee2aaSAndroid Build Coastguard Worker    #include <arm_neon.h>
21*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
22*c8dee2aaSAndroid Build Coastguard Worker    #include <lasxintrin.h>
23*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
24*c8dee2aaSAndroid Build Coastguard Worker    #include <lsxintrin.h>
25*c8dee2aaSAndroid Build Coastguard Worker#endif
26*c8dee2aaSAndroid Build Coastguard Worker
27*c8dee2aaSAndroid Build Coastguard Worker// This file is included in multiple translation units with different #defines set enabling
28*c8dee2aaSAndroid Build Coastguard Worker// different instruction use for different CPU architectures.
29*c8dee2aaSAndroid Build Coastguard Worker//
// A pair of files controls what #defines are defined: SkOpts_SetTarget.h sets the flags, and
// SkOpts_RestoreTarget.h restores them. SkOpts_SetTarget is controlled by setting the
// SK_OPTS_TARGET define before including it.
33*c8dee2aaSAndroid Build Coastguard Worker//
34*c8dee2aaSAndroid Build Coastguard Worker// SkOpts_SetTarget also sets the #define SK_OPTS_NS to the unique namespace for this code.
35*c8dee2aaSAndroid Build Coastguard Worker
// SI marks file-local inline helpers. Clang and GCC additionally get the always_inline
// attribute; every other compiler falls back to plain `static inline`.
#if !defined(__clang__) && !defined(__GNUC__)
#define SI static inline
#else
#define SI __attribute__((always_inline)) static inline
#endif
41*c8dee2aaSAndroid Build Coastguard Worker
42*c8dee2aaSAndroid Build Coastguard Workernamespace SK_OPTS_NS {
43*c8dee2aaSAndroid Build Coastguard Worker
// kFastUnpremul is true only when the build defines SK_USE_FAST_UNPREMUL_324099025.
#if !defined(SK_USE_FAST_UNPREMUL_324099025)
constexpr bool kFastUnpremul = false;
#else
constexpr bool kFastUnpremul = true;
#endif
49*c8dee2aaSAndroid Build Coastguard Worker
50*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha_times_255_portable(float a) {
51*c8dee2aaSAndroid Build Coastguard Worker    return a != 0 ? 255.0f / a : 0.0f;
52*c8dee2aaSAndroid Build Coastguard Worker}
53*c8dee2aaSAndroid Build Coastguard Worker
54*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha_portable(float a) {
55*c8dee2aaSAndroid Build Coastguard Worker    return a != 0 ? 1.0f / a : 0.0f;
56*c8dee2aaSAndroid Build Coastguard Worker}
57*c8dee2aaSAndroid Build Coastguard Worker
58*c8dee2aaSAndroid Build Coastguard Worker#if defined(SK_ARM_HAS_NEON)
59*c8dee2aaSAndroid Build Coastguard Worker// -- NEON -- Harden against timing attacks
60*c8dee2aaSAndroid Build Coastguard Worker// For neon, the portable versions create branchless code.
61*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha_times_255(float a) {
62*c8dee2aaSAndroid Build Coastguard Worker    return reciprocal_alpha_times_255_portable(a);
63*c8dee2aaSAndroid Build Coastguard Worker}
64*c8dee2aaSAndroid Build Coastguard Worker
65*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha(float a) {
66*c8dee2aaSAndroid Build Coastguard Worker    return reciprocal_alpha_portable(a);
67*c8dee2aaSAndroid Build Coastguard Worker}
68*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 && (defined(__clang__) || !defined(_MSC_VER))
69*c8dee2aaSAndroid Build Coastguard Worker// -- SSE -- Harden against timing attacks -- MSVC is not supported.
70*c8dee2aaSAndroid Build Coastguard Workerusing F4 = __m128;
71*c8dee2aaSAndroid Build Coastguard Worker
72*c8dee2aaSAndroid Build Coastguard WorkerSK_NO_SANITIZE("float-divide-by-zero")
73*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha_times_255(float a) {
74*c8dee2aaSAndroid Build Coastguard Worker    SkASSERT(0 <= a && a <= 255);
75*c8dee2aaSAndroid Build Coastguard Worker    F4 vA{a, a, a, a};
76*c8dee2aaSAndroid Build Coastguard Worker    auto q = F4{255.0f} / vA;
77*c8dee2aaSAndroid Build Coastguard Worker    return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
78*c8dee2aaSAndroid Build Coastguard Worker}
79*c8dee2aaSAndroid Build Coastguard Worker
80*c8dee2aaSAndroid Build Coastguard WorkerSK_NO_SANITIZE("float-divide-by-zero")
81*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha(float a) {
82*c8dee2aaSAndroid Build Coastguard Worker    SkASSERT(0 <= a && a <= 1);
83*c8dee2aaSAndroid Build Coastguard Worker    F4 vA{a, a, a, a};
84*c8dee2aaSAndroid Build Coastguard Worker    auto q = F4{1.0f} / vA;
85*c8dee2aaSAndroid Build Coastguard Worker    return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
86*c8dee2aaSAndroid Build Coastguard Worker}
87*c8dee2aaSAndroid Build Coastguard Worker#else
88*c8dee2aaSAndroid Build Coastguard Worker// -- Portable -- *Not* hardened against timing attacks
89*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha_times_255(float a) {
90*c8dee2aaSAndroid Build Coastguard Worker    return reciprocal_alpha_times_255_portable(a);
91*c8dee2aaSAndroid Build Coastguard Worker}
92*c8dee2aaSAndroid Build Coastguard Worker
93*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha(float a) {
94*c8dee2aaSAndroid Build Coastguard Worker    return reciprocal_alpha_portable(a);
95*c8dee2aaSAndroid Build Coastguard Worker}
96*c8dee2aaSAndroid Build Coastguard Worker#endif
97*c8dee2aaSAndroid Build Coastguard Worker
// Premultiplies `count` pixels from src into dst, keeping the channel order.
// Alpha lives in bits 24..31 and is copied through; each of the three lower bytes c
// becomes (c * alpha + 127) / 255.
static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t pixel = src[i];
        const uint32_t alpha = pixel >> 24;

        // Rounded multiply of one 8-bit channel by alpha; the result always fits in 8 bits.
        const uint32_t hi  = (((pixel >> 16) & 0xFF) * alpha + 127) / 255;
        const uint32_t mid = (((pixel >>  8) & 0xFF) * alpha + 127) / 255;
        const uint32_t lo  = (((pixel >>  0) & 0xFF) * alpha + 127) / 255;

        dst[i] = (alpha << 24) | (hi << 16) | (mid << 8) | lo;
    }
}
113*c8dee2aaSAndroid Build Coastguard Worker
114*c8dee2aaSAndroid Build Coastguard Worker// RP uses the following rounding routines in store_8888. There are three different
115*c8dee2aaSAndroid Build Coastguard Worker// styles of rounding:
116*c8dee2aaSAndroid Build Coastguard Worker//   1) +0.5 and floor - used by scalar and ARMv7
117*c8dee2aaSAndroid Build Coastguard Worker//   2) round to even for sure - ARMv8
118*c8dee2aaSAndroid Build Coastguard Worker//   3) round to even maybe - intel. The rounding on intel depends on MXCSR which
119*c8dee2aaSAndroid Build Coastguard Worker//                            defaults to round to even.
120*c8dee2aaSAndroid Build Coastguard Worker//
// Note that vrndns_f32 is the single-float version of vcvtnq_u32_f32.
122*c8dee2aaSAndroid Build Coastguard Worker
123*c8dee2aaSAndroid Build Coastguard WorkerSI uint32_t pixel_round_as_RP(float n) {
124*c8dee2aaSAndroid Build Coastguard Worker#if defined(SK_ARM_HAS_NEON) && defined(SK_CPU_ARM64)
125*c8dee2aaSAndroid Build Coastguard Worker    return vrndns_f32(n);
126*c8dee2aaSAndroid Build Coastguard Worker#elif defined(SK_ARM_HAS_NEON) && !defined(SK_CPU_ARM64)
127*c8dee2aaSAndroid Build Coastguard Worker    float32x4_t vN{n + 0.5f};
128*c8dee2aaSAndroid Build Coastguard Worker    return vcvtq_u32_f32(vN)[0];
129*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 && (defined(__clang__) || !defined(_MSC_VER))
130*c8dee2aaSAndroid Build Coastguard Worker    return _mm_cvtps_epi32(__m128{n})[0];
131*c8dee2aaSAndroid Build Coastguard Worker#else
132*c8dee2aaSAndroid Build Coastguard Worker    return (uint32_t)(n + 0.5f);
133*c8dee2aaSAndroid Build Coastguard Worker#endif
134*c8dee2aaSAndroid Build Coastguard Worker}
135*c8dee2aaSAndroid Build Coastguard Worker
136*c8dee2aaSAndroid Build Coastguard Worker// Doing the math for an original color b resulting in a premul color x,
137*c8dee2aaSAndroid Build Coastguard Worker//   x = ⌊(b * a + 127) / 255⌋,
138*c8dee2aaSAndroid Build Coastguard Worker//   x ≤ (b * a + 127) / 255 < x + 1,
139*c8dee2aaSAndroid Build Coastguard Worker//   255 * x ≤ b * a + 127 < 255 * (x + 1),
140*c8dee2aaSAndroid Build Coastguard Worker//   255 * x - 127 ≤ b * a < 255 * (x + 1) - 127,
141*c8dee2aaSAndroid Build Coastguard Worker//   255 * x - 127 ≤ b * a < 255 * x + 128,
142*c8dee2aaSAndroid Build Coastguard Worker//   (255 * x - 127) / a ≤ b < (255 * x + 128) / a.
143*c8dee2aaSAndroid Build Coastguard Worker// So, given a premul value x < a, the original color b can be in the above range.
144*c8dee2aaSAndroid Build Coastguard Worker// We can pick the middle of that range as
145*c8dee2aaSAndroid Build Coastguard Worker//   b = 255 * x / a
146*c8dee2aaSAndroid Build Coastguard Worker//   b = x * (255 / a)
147*c8dee2aaSAndroid Build Coastguard WorkerSI uint32_t unpremul_quick(float reciprocalA, float c) {
148*c8dee2aaSAndroid Build Coastguard Worker    return (uint32_t)std::min(255.0f, (c * reciprocalA + 0.5f));
149*c8dee2aaSAndroid Build Coastguard Worker}
150*c8dee2aaSAndroid Build Coastguard Worker
151*c8dee2aaSAndroid Build Coastguard Worker// Similar to unpremul but simulates Raster Pipeline by normalizing the pixel on the interval
152*c8dee2aaSAndroid Build Coastguard Worker// [0, 1] and uses round-to-even in most cases instead of round-up.
153*c8dee2aaSAndroid Build Coastguard WorkerSI uint32_t unpremul_simulating_RP(float reciprocalA, float c) {
154*c8dee2aaSAndroid Build Coastguard Worker    const float normalizedC = c * (1.0f / 255.0f);
155*c8dee2aaSAndroid Build Coastguard Worker    const float answer = std::min(255.0f, normalizedC * reciprocalA * 255.0f);
156*c8dee2aaSAndroid Build Coastguard Worker    return pixel_round_as_RP(answer);
157*c8dee2aaSAndroid Build Coastguard Worker}
158*c8dee2aaSAndroid Build Coastguard Worker
159*c8dee2aaSAndroid Build Coastguard WorkerSI uint32_t rgbA_to_CCCA(float c00, float c08, float c16, float a) {
160*c8dee2aaSAndroid Build Coastguard Worker    if constexpr (kFastUnpremul) {
161*c8dee2aaSAndroid Build Coastguard Worker        const float reciprocalA = reciprocal_alpha_times_255(a);
162*c8dee2aaSAndroid Build Coastguard Worker        auto unpremul = [reciprocalA](float c) -> uint32_t {
163*c8dee2aaSAndroid Build Coastguard Worker            return unpremul_quick(reciprocalA, c);
164*c8dee2aaSAndroid Build Coastguard Worker        };
165*c8dee2aaSAndroid Build Coastguard Worker        return (uint32_t) a << 24
166*c8dee2aaSAndroid Build Coastguard Worker               | unpremul(c16) << 16
167*c8dee2aaSAndroid Build Coastguard Worker               | unpremul(c08) <<  8
168*c8dee2aaSAndroid Build Coastguard Worker               | unpremul(c00) <<  0;
169*c8dee2aaSAndroid Build Coastguard Worker    } else {
170*c8dee2aaSAndroid Build Coastguard Worker        const float normalizedA = a * (1.0f / 255.0f);
171*c8dee2aaSAndroid Build Coastguard Worker        const float reciprocalA = reciprocal_alpha(normalizedA);
172*c8dee2aaSAndroid Build Coastguard Worker        auto unpremul = [reciprocalA](float c) -> uint32_t {
173*c8dee2aaSAndroid Build Coastguard Worker            return unpremul_simulating_RP(reciprocalA, c);
174*c8dee2aaSAndroid Build Coastguard Worker        };
175*c8dee2aaSAndroid Build Coastguard Worker        return (uint32_t) a << 24
176*c8dee2aaSAndroid Build Coastguard Worker               | unpremul(c16) << 16
177*c8dee2aaSAndroid Build Coastguard Worker               | unpremul(c08) <<  8
178*c8dee2aaSAndroid Build Coastguard Worker               | unpremul(c00) <<  0;
179*c8dee2aaSAndroid Build Coastguard Worker    }
180*c8dee2aaSAndroid Build Coastguard Worker}
181*c8dee2aaSAndroid Build Coastguard Worker
182*c8dee2aaSAndroid Build Coastguard Workerstatic void rgbA_to_RGBA_portable(uint32_t* dst, const uint32_t* src, int count) {
183*c8dee2aaSAndroid Build Coastguard Worker    for (int i = 0; i < count; i++) {
184*c8dee2aaSAndroid Build Coastguard Worker        const uint32_t p = src[i];
185*c8dee2aaSAndroid Build Coastguard Worker
186*c8dee2aaSAndroid Build Coastguard Worker        const float a = (p >> 24) & 0xFF,
187*c8dee2aaSAndroid Build Coastguard Worker                    b = (p >> 16) & 0xFF,
188*c8dee2aaSAndroid Build Coastguard Worker                    g = (p >>  8) & 0xFF,
189*c8dee2aaSAndroid Build Coastguard Worker                    r = (p >>  0) & 0xFF;
190*c8dee2aaSAndroid Build Coastguard Worker
191*c8dee2aaSAndroid Build Coastguard Worker        dst[i] = rgbA_to_CCCA(r, g, b, a);
192*c8dee2aaSAndroid Build Coastguard Worker    }
193*c8dee2aaSAndroid Build Coastguard Worker}
194*c8dee2aaSAndroid Build Coastguard Worker
195*c8dee2aaSAndroid Build Coastguard Workerstatic void rgbA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
196*c8dee2aaSAndroid Build Coastguard Worker    for (int i = 0; i < count; i++) {
197*c8dee2aaSAndroid Build Coastguard Worker        const uint32_t p = src[i];
198*c8dee2aaSAndroid Build Coastguard Worker
199*c8dee2aaSAndroid Build Coastguard Worker        const uint32_t a = (p >> 24) & 0xFF,
200*c8dee2aaSAndroid Build Coastguard Worker                       b = (p >> 16) & 0xFF,
201*c8dee2aaSAndroid Build Coastguard Worker                       g = (p >>  8) & 0xFF,
202*c8dee2aaSAndroid Build Coastguard Worker                       r = (p >>  0) & 0xFF;
203*c8dee2aaSAndroid Build Coastguard Worker
204*c8dee2aaSAndroid Build Coastguard Worker        dst[i] = rgbA_to_CCCA(b, g, r, a);
205*c8dee2aaSAndroid Build Coastguard Worker    }
206*c8dee2aaSAndroid Build Coastguard Worker}
207*c8dee2aaSAndroid Build Coastguard Worker
// Premultiplies `count` pixels from src into dst and exchanges the byte at bits 0..7 with
// the byte at bits 16..23. Alpha (bits 24..31) is copied through; every color byte c
// becomes (c * alpha + 127) / 255.
static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t pixel = src[i];
        const uint32_t alpha = pixel >> 24;

        // Rounded multiply of one 8-bit channel by alpha; the result always fits in 8 bits.
        const uint32_t hi  = (((pixel >> 16) & 0xFF) * alpha + 127) / 255;
        const uint32_t mid = (((pixel >>  8) & 0xFF) * alpha + 127) / 255;
        const uint32_t lo  = (((pixel >>  0) & 0xFF) * alpha + 127) / 255;

        // Former low byte goes to bits 16..23, former bits 16..23 go to the low byte.
        dst[i] = (alpha << 24) | (lo << 16) | (mid << 8) | hi;
    }
}
223*c8dee2aaSAndroid Build Coastguard Worker
// Copies `count` pixels from src to dst, exchanging the byte at bits 0..7 with the byte at
// bits 16..23. Bits 8..15 and 24..31 are left where they are.
static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t pixel = src[i];
        const uint32_t stay   = pixel & 0xFF00FF00u;   // bits 8..15 and 24..31
        const uint32_t toHigh = (pixel & 0xFFu) << 16; // low byte moves up
        const uint32_t toLow  = (pixel >> 16) & 0xFFu; // bits 16..23 move down
        dst[i] = stay | toHigh | toLow;
    }
}
236*c8dee2aaSAndroid Build Coastguard Worker
// Expands `count` two-byte (gray, alpha) pairs from src into 32-bit pixels in dst.
// The gray byte is replicated into the three low bytes; alpha goes to bits 24..31.
static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t gray  = src[2 * i + 0];
        const uint32_t alpha = src[2 * i + 1];
        // Multiplying by 0x010101 copies the byte into bits 0..7, 8..15 and 16..23.
        dst[i] = (alpha << 24) | (gray * 0x010101u);
    }
}
248*c8dee2aaSAndroid Build Coastguard Worker
// Expands `count` two-byte (gray, alpha) pairs from src into premultiplied 32-bit pixels.
// Gray is first scaled to (gray * alpha + 127) / 255, then replicated into the three low
// bytes; alpha goes to bits 24..31.
static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t gray  = src[2 * i + 0];
        const uint32_t alpha = src[2 * i + 1];
        const uint32_t premul = (gray * alpha + 127) / 255;  // always fits in 8 bits
        // Multiplying by 0x010101 copies the byte into bits 0..7, 8..15 and 16..23.
        dst[i] = (alpha << 24) | (premul * 0x010101u);
    }
}
261*c8dee2aaSAndroid Build Coastguard Worker
// Converts `count` inverted-CMYK pixels from src into opaque pixels in dst, keeping the byte
// order: each of the three low bytes is scaled by the top byte (k) as (v * k + 127) / 255
// and the output's top byte is forced to 0xFF.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t pixel = src[i];
        const uint32_t k = pixel >> 24;

        // See comments in SkSwizzler.cpp for details on the conversion formula.
        auto modulate = [k](uint32_t ink) -> uint32_t { return (ink * k + 127) / 255; };

        const uint32_t fromY = modulate((pixel >> 16) & 0xFF);
        const uint32_t fromM = modulate((pixel >>  8) & 0xFF);
        const uint32_t fromC = modulate((pixel >>  0) & 0xFF);

        dst[i] = 0xFF000000u | (fromY << 16) | (fromM << 8) | fromC;
    }
}
278*c8dee2aaSAndroid Build Coastguard Worker
// Converts `count` inverted-CMYK pixels from src into opaque pixels in dst, exchanging the
// byte at bits 0..7 with the byte at bits 16..23: each of the three low bytes is scaled by
// the top byte (k) as (v * k + 127) / 255 and the output's top byte is forced to 0xFF.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t pixel = src[i];
        const uint32_t k = pixel >> 24;

        auto modulate = [k](uint32_t ink) -> uint32_t { return (ink * k + 127) / 255; };

        const uint32_t fromY = modulate((pixel >> 16) & 0xFF);
        const uint32_t fromM = modulate((pixel >>  8) & 0xFF);
        const uint32_t fromC = modulate((pixel >>  0) & 0xFF);

        // The scaled low byte moves up to bits 16..23; the scaled bits 16..23 move down.
        dst[i] = 0xFF000000u | (fromC << 16) | (fromM << 8) | fromY;
    }
}
294*c8dee2aaSAndroid Build Coastguard Worker
295*c8dee2aaSAndroid Build Coastguard Worker#if defined(SK_ARM_HAS_NEON)
296*c8dee2aaSAndroid Build Coastguard Worker// -- NEON -----------------------------------------------------------------------------------------
297*c8dee2aaSAndroid Build Coastguard Worker// Rounded divide by 255, (x + 127) / 255
298*c8dee2aaSAndroid Build Coastguard WorkerSI uint8x8_t div255_round(uint16x8_t x) {
299*c8dee2aaSAndroid Build Coastguard Worker    // result = (x + 127) / 255
300*c8dee2aaSAndroid Build Coastguard Worker    // result = (x + 127) / 256 + error1
301*c8dee2aaSAndroid Build Coastguard Worker    //
302*c8dee2aaSAndroid Build Coastguard Worker    // error1 = (x + 127) / (255 * 256)
303*c8dee2aaSAndroid Build Coastguard Worker    // error1 = (x + 127) / (256 * 256) + error2
304*c8dee2aaSAndroid Build Coastguard Worker    //
305*c8dee2aaSAndroid Build Coastguard Worker    // error2 = (x + 127) / (255 * 256 * 256)
306*c8dee2aaSAndroid Build Coastguard Worker    //
307*c8dee2aaSAndroid Build Coastguard Worker    // The maximum value of error2 is too small to matter.  Thus:
308*c8dee2aaSAndroid Build Coastguard Worker    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
309*c8dee2aaSAndroid Build Coastguard Worker    // result = ((x + 127) / 256 + x + 127) / 256
310*c8dee2aaSAndroid Build Coastguard Worker    // result = ((x + 127) >> 8 + x + 127) >> 8
311*c8dee2aaSAndroid Build Coastguard Worker    //
312*c8dee2aaSAndroid Build Coastguard Worker    // Use >>> to represent "rounded right shift" which, conveniently,
313*c8dee2aaSAndroid Build Coastguard Worker    // NEON supports in one instruction.
314*c8dee2aaSAndroid Build Coastguard Worker    // result = ((x >>> 8) + x) >>> 8
315*c8dee2aaSAndroid Build Coastguard Worker    //
316*c8dee2aaSAndroid Build Coastguard Worker    // Note that the second right shift is actually performed as an
317*c8dee2aaSAndroid Build Coastguard Worker    // "add, round, and narrow back to 8-bits" instruction.
318*c8dee2aaSAndroid Build Coastguard Worker    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
319*c8dee2aaSAndroid Build Coastguard Worker}
320*c8dee2aaSAndroid Build Coastguard Worker
321*c8dee2aaSAndroid Build Coastguard Worker// Scale a byte by another, (x * y + 127) / 255
322*c8dee2aaSAndroid Build Coastguard WorkerSI uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
323*c8dee2aaSAndroid Build Coastguard Worker    return div255_round(vmull_u8(x, y));
324*c8dee2aaSAndroid Build Coastguard Worker}
325*c8dee2aaSAndroid Build Coastguard Worker
326*c8dee2aaSAndroid Build Coastguard Workerstatic void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
327*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
328*c8dee2aaSAndroid Build Coastguard Worker        // Load 8 pixels.
329*c8dee2aaSAndroid Build Coastguard Worker        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
330*c8dee2aaSAndroid Build Coastguard Worker
331*c8dee2aaSAndroid Build Coastguard Worker        uint8x8_t a = rgba.val[3],
332*c8dee2aaSAndroid Build Coastguard Worker                  b = rgba.val[2],
333*c8dee2aaSAndroid Build Coastguard Worker                  g = rgba.val[1],
334*c8dee2aaSAndroid Build Coastguard Worker                  r = rgba.val[0];
335*c8dee2aaSAndroid Build Coastguard Worker
336*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply.
337*c8dee2aaSAndroid Build Coastguard Worker        b = scale(b, a);
338*c8dee2aaSAndroid Build Coastguard Worker        g = scale(g, a);
339*c8dee2aaSAndroid Build Coastguard Worker        r = scale(r, a);
340*c8dee2aaSAndroid Build Coastguard Worker
341*c8dee2aaSAndroid Build Coastguard Worker        // Store 8 premultiplied pixels.
342*c8dee2aaSAndroid Build Coastguard Worker        if (kSwapRB) {
343*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[2] = r;
344*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[1] = g;
345*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[0] = b;
346*c8dee2aaSAndroid Build Coastguard Worker        } else {
347*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[2] = b;
348*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[1] = g;
349*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[0] = r;
350*c8dee2aaSAndroid Build Coastguard Worker        }
351*c8dee2aaSAndroid Build Coastguard Worker        vst4_u8((uint8_t*) dst, rgba);
352*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
353*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
354*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
355*c8dee2aaSAndroid Build Coastguard Worker    }
356*c8dee2aaSAndroid Build Coastguard Worker
357*c8dee2aaSAndroid Build Coastguard Worker    // Call portable code to finish up the tail of [0,8) pixels.
358*c8dee2aaSAndroid Build Coastguard Worker    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
359*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
360*c8dee2aaSAndroid Build Coastguard Worker}
361*c8dee2aaSAndroid Build Coastguard Worker
362*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
363*c8dee2aaSAndroid Build Coastguard Worker    premul_should_swapRB(false, dst, src, count);
364*c8dee2aaSAndroid Build Coastguard Worker}
365*c8dee2aaSAndroid Build Coastguard Worker
366*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
367*c8dee2aaSAndroid Build Coastguard Worker    premul_should_swapRB(true, dst, src, count);
368*c8dee2aaSAndroid Build Coastguard Worker}
369*c8dee2aaSAndroid Build Coastguard Worker
370*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
371*c8dee2aaSAndroid Build Coastguard Worker    using std::swap;
372*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 16) {
373*c8dee2aaSAndroid Build Coastguard Worker        // Load 16 pixels.
374*c8dee2aaSAndroid Build Coastguard Worker        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
375*c8dee2aaSAndroid Build Coastguard Worker
376*c8dee2aaSAndroid Build Coastguard Worker        // Swap r and b.
377*c8dee2aaSAndroid Build Coastguard Worker        swap(rgba.val[0], rgba.val[2]);
378*c8dee2aaSAndroid Build Coastguard Worker
379*c8dee2aaSAndroid Build Coastguard Worker        // Store 16 pixels.
380*c8dee2aaSAndroid Build Coastguard Worker        vst4q_u8((uint8_t*) dst, rgba);
381*c8dee2aaSAndroid Build Coastguard Worker        src += 16;
382*c8dee2aaSAndroid Build Coastguard Worker        dst += 16;
383*c8dee2aaSAndroid Build Coastguard Worker        count -= 16;
384*c8dee2aaSAndroid Build Coastguard Worker    }
385*c8dee2aaSAndroid Build Coastguard Worker
386*c8dee2aaSAndroid Build Coastguard Worker    if (count >= 8) {
387*c8dee2aaSAndroid Build Coastguard Worker        // Load 8 pixels.
388*c8dee2aaSAndroid Build Coastguard Worker        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
389*c8dee2aaSAndroid Build Coastguard Worker
390*c8dee2aaSAndroid Build Coastguard Worker        // Swap r and b.
391*c8dee2aaSAndroid Build Coastguard Worker        swap(rgba.val[0], rgba.val[2]);
392*c8dee2aaSAndroid Build Coastguard Worker
393*c8dee2aaSAndroid Build Coastguard Worker        // Store 8 pixels.
394*c8dee2aaSAndroid Build Coastguard Worker        vst4_u8((uint8_t*) dst, rgba);
395*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
396*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
397*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
398*c8dee2aaSAndroid Build Coastguard Worker    }
399*c8dee2aaSAndroid Build Coastguard Worker
400*c8dee2aaSAndroid Build Coastguard Worker    RGBA_to_BGRA_portable(dst, src, count);
401*c8dee2aaSAndroid Build Coastguard Worker}
402*c8dee2aaSAndroid Build Coastguard Worker
403*c8dee2aaSAndroid Build Coastguard Workerstatic void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) {
404*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 16) {
405*c8dee2aaSAndroid Build Coastguard Worker        // Load 16 pixels.
406*c8dee2aaSAndroid Build Coastguard Worker        uint8x16x2_t ga = vld2q_u8(src);
407*c8dee2aaSAndroid Build Coastguard Worker
408*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply if requested.
409*c8dee2aaSAndroid Build Coastguard Worker        if (kPremul) {
410*c8dee2aaSAndroid Build Coastguard Worker            ga.val[0] = vcombine_u8(
411*c8dee2aaSAndroid Build Coastguard Worker                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
412*c8dee2aaSAndroid Build Coastguard Worker                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
413*c8dee2aaSAndroid Build Coastguard Worker        }
414*c8dee2aaSAndroid Build Coastguard Worker
415*c8dee2aaSAndroid Build Coastguard Worker        // Set each of the color channels.
416*c8dee2aaSAndroid Build Coastguard Worker        uint8x16x4_t rgba;
417*c8dee2aaSAndroid Build Coastguard Worker        rgba.val[0] = ga.val[0];
418*c8dee2aaSAndroid Build Coastguard Worker        rgba.val[1] = ga.val[0];
419*c8dee2aaSAndroid Build Coastguard Worker        rgba.val[2] = ga.val[0];
420*c8dee2aaSAndroid Build Coastguard Worker        rgba.val[3] = ga.val[1];
421*c8dee2aaSAndroid Build Coastguard Worker
422*c8dee2aaSAndroid Build Coastguard Worker        // Store 16 pixels.
423*c8dee2aaSAndroid Build Coastguard Worker        vst4q_u8((uint8_t*) dst, rgba);
424*c8dee2aaSAndroid Build Coastguard Worker        src += 16*2;
425*c8dee2aaSAndroid Build Coastguard Worker        dst += 16;
426*c8dee2aaSAndroid Build Coastguard Worker        count -= 16;
427*c8dee2aaSAndroid Build Coastguard Worker    }
428*c8dee2aaSAndroid Build Coastguard Worker
429*c8dee2aaSAndroid Build Coastguard Worker    if (count >= 8) {
430*c8dee2aaSAndroid Build Coastguard Worker        // Load 8 pixels.
431*c8dee2aaSAndroid Build Coastguard Worker        uint8x8x2_t ga = vld2_u8(src);
432*c8dee2aaSAndroid Build Coastguard Worker
433*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply if requested.
434*c8dee2aaSAndroid Build Coastguard Worker        if (kPremul) {
435*c8dee2aaSAndroid Build Coastguard Worker            ga.val[0] = scale(ga.val[0], ga.val[1]);
436*c8dee2aaSAndroid Build Coastguard Worker        }
437*c8dee2aaSAndroid Build Coastguard Worker
438*c8dee2aaSAndroid Build Coastguard Worker        // Set each of the color channels.
439*c8dee2aaSAndroid Build Coastguard Worker        uint8x8x4_t rgba;
440*c8dee2aaSAndroid Build Coastguard Worker        rgba.val[0] = ga.val[0];
441*c8dee2aaSAndroid Build Coastguard Worker        rgba.val[1] = ga.val[0];
442*c8dee2aaSAndroid Build Coastguard Worker        rgba.val[2] = ga.val[0];
443*c8dee2aaSAndroid Build Coastguard Worker        rgba.val[3] = ga.val[1];
444*c8dee2aaSAndroid Build Coastguard Worker
445*c8dee2aaSAndroid Build Coastguard Worker        // Store 8 pixels.
446*c8dee2aaSAndroid Build Coastguard Worker        vst4_u8((uint8_t*) dst, rgba);
447*c8dee2aaSAndroid Build Coastguard Worker        src += 8*2;
448*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
449*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
450*c8dee2aaSAndroid Build Coastguard Worker    }
451*c8dee2aaSAndroid Build Coastguard Worker
452*c8dee2aaSAndroid Build Coastguard Worker    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
453*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
454*c8dee2aaSAndroid Build Coastguard Worker}
455*c8dee2aaSAndroid Build Coastguard Worker
456*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
457*c8dee2aaSAndroid Build Coastguard Worker    expand_grayA(false, dst, src, count);
458*c8dee2aaSAndroid Build Coastguard Worker}
459*c8dee2aaSAndroid Build Coastguard Worker
460*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
461*c8dee2aaSAndroid Build Coastguard Worker    expand_grayA(true, dst, src, count);
462*c8dee2aaSAndroid Build Coastguard Worker}
463*c8dee2aaSAndroid Build Coastguard Worker
// Selects the output channel order used by inverted_cmyk_to().
enum Format {
    kRGB1,
    kBGR1,
};
465*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
466*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
467*c8dee2aaSAndroid Build Coastguard Worker        // Load 8 cmyk pixels.
468*c8dee2aaSAndroid Build Coastguard Worker        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
469*c8dee2aaSAndroid Build Coastguard Worker
470*c8dee2aaSAndroid Build Coastguard Worker        uint8x8_t k = pixels.val[3],
471*c8dee2aaSAndroid Build Coastguard Worker                  y = pixels.val[2],
472*c8dee2aaSAndroid Build Coastguard Worker                  m = pixels.val[1],
473*c8dee2aaSAndroid Build Coastguard Worker                  c = pixels.val[0];
474*c8dee2aaSAndroid Build Coastguard Worker
475*c8dee2aaSAndroid Build Coastguard Worker        // Scale to r, g, b.
476*c8dee2aaSAndroid Build Coastguard Worker        uint8x8_t b = scale(y, k);
477*c8dee2aaSAndroid Build Coastguard Worker        uint8x8_t g = scale(m, k);
478*c8dee2aaSAndroid Build Coastguard Worker        uint8x8_t r = scale(c, k);
479*c8dee2aaSAndroid Build Coastguard Worker
480*c8dee2aaSAndroid Build Coastguard Worker        // Store 8 rgba pixels.
481*c8dee2aaSAndroid Build Coastguard Worker        if (kBGR1 == format) {
482*c8dee2aaSAndroid Build Coastguard Worker            pixels.val[3] = vdup_n_u8(0xFF);
483*c8dee2aaSAndroid Build Coastguard Worker            pixels.val[2] = r;
484*c8dee2aaSAndroid Build Coastguard Worker            pixels.val[1] = g;
485*c8dee2aaSAndroid Build Coastguard Worker            pixels.val[0] = b;
486*c8dee2aaSAndroid Build Coastguard Worker        } else {
487*c8dee2aaSAndroid Build Coastguard Worker            pixels.val[3] = vdup_n_u8(0xFF);
488*c8dee2aaSAndroid Build Coastguard Worker            pixels.val[2] = b;
489*c8dee2aaSAndroid Build Coastguard Worker            pixels.val[1] = g;
490*c8dee2aaSAndroid Build Coastguard Worker            pixels.val[0] = r;
491*c8dee2aaSAndroid Build Coastguard Worker        }
492*c8dee2aaSAndroid Build Coastguard Worker        vst4_u8((uint8_t*) dst, pixels);
493*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
494*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
495*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
496*c8dee2aaSAndroid Build Coastguard Worker    }
497*c8dee2aaSAndroid Build Coastguard Worker
498*c8dee2aaSAndroid Build Coastguard Worker    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
499*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
500*c8dee2aaSAndroid Build Coastguard Worker}
501*c8dee2aaSAndroid Build Coastguard Worker
502*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
503*c8dee2aaSAndroid Build Coastguard Worker    inverted_cmyk_to(kRGB1, dst, src, count);
504*c8dee2aaSAndroid Build Coastguard Worker}
505*c8dee2aaSAndroid Build Coastguard Worker
506*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
507*c8dee2aaSAndroid Build Coastguard Worker    inverted_cmyk_to(kBGR1, dst, src, count);
508*c8dee2aaSAndroid Build Coastguard Worker}
509*c8dee2aaSAndroid Build Coastguard Worker
510*c8dee2aaSAndroid Build Coastguard Workertemplate <bool swapRB>
511*c8dee2aaSAndroid Build Coastguard Workerstatic void common_rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
512*c8dee2aaSAndroid Build Coastguard Worker
513*c8dee2aaSAndroid Build Coastguard Worker    // Only use the SIMD code if simulating RP, otherwise the quick code auto-vectorizes will
514*c8dee2aaSAndroid Build Coastguard Worker    // enough on ARM to not need a SIMD implementation.
515*c8dee2aaSAndroid Build Coastguard Worker    if constexpr (!kFastUnpremul) {
516*c8dee2aaSAndroid Build Coastguard Worker        while (count >= 8) {
517*c8dee2aaSAndroid Build Coastguard Worker            const uint8x8x4_t in = vld4_u8((const uint8_t*)src);
518*c8dee2aaSAndroid Build Coastguard Worker
519*c8dee2aaSAndroid Build Coastguard Worker            auto round = [](float32x4_t v) -> uint32x4_t {
520*c8dee2aaSAndroid Build Coastguard Worker                #if defined(SK_CPU_ARM64)
521*c8dee2aaSAndroid Build Coastguard Worker                    return vcvtnq_u32_f32(v);
522*c8dee2aaSAndroid Build Coastguard Worker                #else
523*c8dee2aaSAndroid Build Coastguard Worker                    return vcvtq_u32_f32(v + 0.5f);
524*c8dee2aaSAndroid Build Coastguard Worker                #endif
525*c8dee2aaSAndroid Build Coastguard Worker            };
526*c8dee2aaSAndroid Build Coastguard Worker
527*c8dee2aaSAndroid Build Coastguard Worker            static constexpr float kN = 1.0f / 255.0f;
528*c8dee2aaSAndroid Build Coastguard Worker            auto toNormalized = [](uint16x4_t v) -> float32x4_t {
529*c8dee2aaSAndroid Build Coastguard Worker                return vcvtq_f32_u32(vmovl_u16(v)) * kN;
530*c8dee2aaSAndroid Build Coastguard Worker            };
531*c8dee2aaSAndroid Build Coastguard Worker
532*c8dee2aaSAndroid Build Coastguard Worker            auto unpremulHalf =
533*c8dee2aaSAndroid Build Coastguard Worker                    [toNormalized, round](float32x4_t invA, uint16x4_t v) -> uint16x4_t {
534*c8dee2aaSAndroid Build Coastguard Worker                const float32x4_t normalizedV = toNormalized(v);
535*c8dee2aaSAndroid Build Coastguard Worker                const float32x4_t divided = invA * normalizedV;
536*c8dee2aaSAndroid Build Coastguard Worker                const float32x4_t denormalized = divided * 255.0f;
537*c8dee2aaSAndroid Build Coastguard Worker                const uint32x4_t rounded = round(denormalized);
538*c8dee2aaSAndroid Build Coastguard Worker                return vqmovn_u32(rounded);
539*c8dee2aaSAndroid Build Coastguard Worker            };
540*c8dee2aaSAndroid Build Coastguard Worker
541*c8dee2aaSAndroid Build Coastguard Worker            auto reciprocal = [](float32x4_t a) -> float32x4_t {
542*c8dee2aaSAndroid Build Coastguard Worker                uint32x4_t mask = sk_bit_cast<uint32x4_t>(a != float32x4_t{0, 0, 0, 0});
543*c8dee2aaSAndroid Build Coastguard Worker                auto recip = 1.0f / a;
544*c8dee2aaSAndroid Build Coastguard Worker                return sk_bit_cast<float32x4_t>(mask & sk_bit_cast<uint32x4_t>(recip));
545*c8dee2aaSAndroid Build Coastguard Worker            };
546*c8dee2aaSAndroid Build Coastguard Worker
547*c8dee2aaSAndroid Build Coastguard Worker            const uint8x8_t a = in.val[3];
548*c8dee2aaSAndroid Build Coastguard Worker            const uint16x8_t intA = vmovl_u8(a);
549*c8dee2aaSAndroid Build Coastguard Worker            const float32x4_t invALow = reciprocal(toNormalized(vget_low_u16(intA)));
550*c8dee2aaSAndroid Build Coastguard Worker            const float32x4_t invAHigh = reciprocal(toNormalized(vget_high_u16(intA)));
551*c8dee2aaSAndroid Build Coastguard Worker
552*c8dee2aaSAndroid Build Coastguard Worker            auto unpremul = [unpremulHalf, invALow, invAHigh](uint8x8_t v) -> uint8x8_t {
553*c8dee2aaSAndroid Build Coastguard Worker                const uint16x8_t to16 = vmovl_u8(v);
554*c8dee2aaSAndroid Build Coastguard Worker
555*c8dee2aaSAndroid Build Coastguard Worker                const uint16x4_t low = unpremulHalf(invALow, vget_low_u16(to16));
556*c8dee2aaSAndroid Build Coastguard Worker                const uint16x4_t high = unpremulHalf(invAHigh, vget_high_u16(to16));
557*c8dee2aaSAndroid Build Coastguard Worker
558*c8dee2aaSAndroid Build Coastguard Worker                const uint16x8_t combined = vcombine_u16(low, high);
559*c8dee2aaSAndroid Build Coastguard Worker                return vqmovn_u16(combined);
560*c8dee2aaSAndroid Build Coastguard Worker            };
561*c8dee2aaSAndroid Build Coastguard Worker
562*c8dee2aaSAndroid Build Coastguard Worker            const uint8x8_t b = unpremul(in.val[2]);
563*c8dee2aaSAndroid Build Coastguard Worker            const uint8x8_t g = unpremul(in.val[1]);
564*c8dee2aaSAndroid Build Coastguard Worker            const uint8x8_t r = unpremul(in.val[0]);
565*c8dee2aaSAndroid Build Coastguard Worker
566*c8dee2aaSAndroid Build Coastguard Worker            if constexpr (swapRB) {
567*c8dee2aaSAndroid Build Coastguard Worker                const uint8x8x4_t out{b, g, r, a};
568*c8dee2aaSAndroid Build Coastguard Worker                vst4_u8((uint8_t*)dst, out);
569*c8dee2aaSAndroid Build Coastguard Worker            } else {
570*c8dee2aaSAndroid Build Coastguard Worker                const uint8x8x4_t out{r, g, b, a};
571*c8dee2aaSAndroid Build Coastguard Worker                vst4_u8((uint8_t*)dst, out);
572*c8dee2aaSAndroid Build Coastguard Worker            }
573*c8dee2aaSAndroid Build Coastguard Worker
574*c8dee2aaSAndroid Build Coastguard Worker            src += 8;
575*c8dee2aaSAndroid Build Coastguard Worker            dst += 8;
576*c8dee2aaSAndroid Build Coastguard Worker            count -= 8;
577*c8dee2aaSAndroid Build Coastguard Worker        }
578*c8dee2aaSAndroid Build Coastguard Worker    }
579*c8dee2aaSAndroid Build Coastguard Worker
580*c8dee2aaSAndroid Build Coastguard Worker    // Handle the tail. Count will be < 8.
581*c8dee2aaSAndroid Build Coastguard Worker    if constexpr (swapRB) {
582*c8dee2aaSAndroid Build Coastguard Worker        rgbA_to_BGRA_portable(dst, src, count);
583*c8dee2aaSAndroid Build Coastguard Worker    } else {
584*c8dee2aaSAndroid Build Coastguard Worker        rgbA_to_RGBA_portable(dst, src, count);
585*c8dee2aaSAndroid Build Coastguard Worker    }
586*c8dee2aaSAndroid Build Coastguard Worker}
587*c8dee2aaSAndroid Build Coastguard Worker
588*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
589*c8dee2aaSAndroid Build Coastguard Worker    common_rgbA_to_RGBA</*swapRB=*/false>(dst, src, count);
590*c8dee2aaSAndroid Build Coastguard Worker}
591*c8dee2aaSAndroid Build Coastguard Worker
592*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
593*c8dee2aaSAndroid Build Coastguard Worker    common_rgbA_to_RGBA</*swapRB=*/true>(dst, src, count);
594*c8dee2aaSAndroid Build Coastguard Worker}
595*c8dee2aaSAndroid Build Coastguard Worker
596*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
597*c8dee2aaSAndroid Build Coastguard Worker// -- AVX2 -----------------------------------------------------------------------------------------
598*c8dee2aaSAndroid Build Coastguard Worker
599*c8dee2aaSAndroid Build Coastguard Worker// Scale a byte by another.
600*c8dee2aaSAndroid Build Coastguard Worker// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
601*c8dee2aaSAndroid Build Coastguard Workerstatic __m256i scale(__m256i x, __m256i y) {
602*c8dee2aaSAndroid Build Coastguard Worker    const __m256i _128 = _mm256_set1_epi16(128);
603*c8dee2aaSAndroid Build Coastguard Worker    const __m256i _257 = _mm256_set1_epi16(257);
604*c8dee2aaSAndroid Build Coastguard Worker
605*c8dee2aaSAndroid Build Coastguard Worker    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
606*c8dee2aaSAndroid Build Coastguard Worker    return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257);
607*c8dee2aaSAndroid Build Coastguard Worker}
608*c8dee2aaSAndroid Build Coastguard Worker
609*c8dee2aaSAndroid Build Coastguard Workerstatic void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
610*c8dee2aaSAndroid Build Coastguard Worker
611*c8dee2aaSAndroid Build Coastguard Worker    auto premul8 = [=](__m256i* lo, __m256i* hi) {
612*c8dee2aaSAndroid Build Coastguard Worker        const __m256i zeros = _mm256_setzero_si256();
613*c8dee2aaSAndroid Build Coastguard Worker        __m256i planar;
614*c8dee2aaSAndroid Build Coastguard Worker        if (kSwapRB) {
615*c8dee2aaSAndroid Build Coastguard Worker            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
616*c8dee2aaSAndroid Build Coastguard Worker                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
617*c8dee2aaSAndroid Build Coastguard Worker        } else {
618*c8dee2aaSAndroid Build Coastguard Worker            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
619*c8dee2aaSAndroid Build Coastguard Worker                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
620*c8dee2aaSAndroid Build Coastguard Worker        }
621*c8dee2aaSAndroid Build Coastguard Worker
622*c8dee2aaSAndroid Build Coastguard Worker        // Swizzle the pixels to 8-bit planar.
623*c8dee2aaSAndroid Build Coastguard Worker        *lo = _mm256_shuffle_epi8(*lo, planar);             // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa
624*c8dee2aaSAndroid Build Coastguard Worker        *hi = _mm256_shuffle_epi8(*hi, planar);             // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA
625*c8dee2aaSAndroid Build Coastguard Worker        __m256i rg = _mm256_unpacklo_epi32(*lo, *hi),       // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG
626*c8dee2aaSAndroid Build Coastguard Worker                ba = _mm256_unpackhi_epi32(*lo, *hi);       // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA
627*c8dee2aaSAndroid Build Coastguard Worker
628*c8dee2aaSAndroid Build Coastguard Worker        // Unpack to 16-bit planar.
629*c8dee2aaSAndroid Build Coastguard Worker        __m256i r = _mm256_unpacklo_epi8(rg, zeros),        // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_
630*c8dee2aaSAndroid Build Coastguard Worker                g = _mm256_unpackhi_epi8(rg, zeros),        // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_
631*c8dee2aaSAndroid Build Coastguard Worker                b = _mm256_unpacklo_epi8(ba, zeros),        // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_
632*c8dee2aaSAndroid Build Coastguard Worker                a = _mm256_unpackhi_epi8(ba, zeros);        // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_
633*c8dee2aaSAndroid Build Coastguard Worker
634*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply!
635*c8dee2aaSAndroid Build Coastguard Worker        r = scale(r, a);
636*c8dee2aaSAndroid Build Coastguard Worker        g = scale(g, a);
637*c8dee2aaSAndroid Build Coastguard Worker        b = scale(b, a);
638*c8dee2aaSAndroid Build Coastguard Worker
639*c8dee2aaSAndroid Build Coastguard Worker        // Repack into interlaced pixels.
640*c8dee2aaSAndroid Build Coastguard Worker        rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8));   // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
641*c8dee2aaSAndroid Build Coastguard Worker        ba = _mm256_or_si256(b, _mm256_slli_epi16(a, 8));   // babababa BABABABA babababa BABABABA
642*c8dee2aaSAndroid Build Coastguard Worker        *lo = _mm256_unpacklo_epi16(rg, ba);                // rgbargba rgbargba rgbargba rgbargba
643*c8dee2aaSAndroid Build Coastguard Worker        *hi = _mm256_unpackhi_epi16(rg, ba);                // RGBARGBA RGBARGBA RGBARGBA RGBARGBA
644*c8dee2aaSAndroid Build Coastguard Worker    };
645*c8dee2aaSAndroid Build Coastguard Worker
646*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 16) {
647*c8dee2aaSAndroid Build Coastguard Worker        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
648*c8dee2aaSAndroid Build Coastguard Worker                hi = _mm256_loadu_si256((const __m256i*) (src + 8));
649*c8dee2aaSAndroid Build Coastguard Worker
650*c8dee2aaSAndroid Build Coastguard Worker        premul8(&lo, &hi);
651*c8dee2aaSAndroid Build Coastguard Worker
652*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
653*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) (dst + 8), hi);
654*c8dee2aaSAndroid Build Coastguard Worker
655*c8dee2aaSAndroid Build Coastguard Worker        src += 16;
656*c8dee2aaSAndroid Build Coastguard Worker        dst += 16;
657*c8dee2aaSAndroid Build Coastguard Worker        count -= 16;
658*c8dee2aaSAndroid Build Coastguard Worker    }
659*c8dee2aaSAndroid Build Coastguard Worker
660*c8dee2aaSAndroid Build Coastguard Worker    if (count >= 8) {
661*c8dee2aaSAndroid Build Coastguard Worker        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
662*c8dee2aaSAndroid Build Coastguard Worker                hi = _mm256_setzero_si256();
663*c8dee2aaSAndroid Build Coastguard Worker
664*c8dee2aaSAndroid Build Coastguard Worker        premul8(&lo, &hi);
665*c8dee2aaSAndroid Build Coastguard Worker
666*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) dst, lo);
667*c8dee2aaSAndroid Build Coastguard Worker
668*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
669*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
670*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
671*c8dee2aaSAndroid Build Coastguard Worker    }
672*c8dee2aaSAndroid Build Coastguard Worker
673*c8dee2aaSAndroid Build Coastguard Worker    // Call portable code to finish up the tail of [0,8) pixels.
674*c8dee2aaSAndroid Build Coastguard Worker    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
675*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
676*c8dee2aaSAndroid Build Coastguard Worker}
677*c8dee2aaSAndroid Build Coastguard Worker
678*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
679*c8dee2aaSAndroid Build Coastguard Worker    premul_should_swapRB(false, dst, src, count);
680*c8dee2aaSAndroid Build Coastguard Worker}
681*c8dee2aaSAndroid Build Coastguard Worker
682*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
683*c8dee2aaSAndroid Build Coastguard Worker    premul_should_swapRB(true, dst, src, count);
684*c8dee2aaSAndroid Build Coastguard Worker}
685*c8dee2aaSAndroid Build Coastguard Worker
686*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
687*c8dee2aaSAndroid Build Coastguard Worker    const __m256i swapRB = _mm256_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
688*c8dee2aaSAndroid Build Coastguard Worker                                            2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
689*c8dee2aaSAndroid Build Coastguard Worker
690*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
691*c8dee2aaSAndroid Build Coastguard Worker        __m256i rgba = _mm256_loadu_si256((const __m256i*) src);
692*c8dee2aaSAndroid Build Coastguard Worker        __m256i bgra = _mm256_shuffle_epi8(rgba, swapRB);
693*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) dst, bgra);
694*c8dee2aaSAndroid Build Coastguard Worker
695*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
696*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
697*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
698*c8dee2aaSAndroid Build Coastguard Worker    }
699*c8dee2aaSAndroid Build Coastguard Worker
700*c8dee2aaSAndroid Build Coastguard Worker    RGBA_to_BGRA_portable(dst, src, count);
701*c8dee2aaSAndroid Build Coastguard Worker}
702*c8dee2aaSAndroid Build Coastguard Worker
703*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
704*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 16) {
705*c8dee2aaSAndroid Build Coastguard Worker        __m256i ga = _mm256_loadu_si256((const __m256i*) src);
706*c8dee2aaSAndroid Build Coastguard Worker
707*c8dee2aaSAndroid Build Coastguard Worker        __m256i gg = _mm256_or_si256(_mm256_and_si256(ga, _mm256_set1_epi16(0x00FF)),
708*c8dee2aaSAndroid Build Coastguard Worker                                     _mm256_slli_epi16(ga, 8));
709*c8dee2aaSAndroid Build Coastguard Worker
710*c8dee2aaSAndroid Build Coastguard Worker        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
711*c8dee2aaSAndroid Build Coastguard Worker        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);
712*c8dee2aaSAndroid Build Coastguard Worker
713*c8dee2aaSAndroid Build Coastguard Worker        // Shuffle for pixel reorder
714*c8dee2aaSAndroid Build Coastguard Worker        // Note. 'p' stands for 'ggga'
715*c8dee2aaSAndroid Build Coastguard Worker        // Before shuffle:
716*c8dee2aaSAndroid Build Coastguard Worker        // ggga_lo = p0 p1 p2 p3 | p8  p9  p10 p11
717*c8dee2aaSAndroid Build Coastguard Worker        // ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15
718*c8dee2aaSAndroid Build Coastguard Worker        //
719*c8dee2aaSAndroid Build Coastguard Worker        // After shuffle:
720*c8dee2aaSAndroid Build Coastguard Worker        // ggga_lo_shuffle = p0 p1 p2  p3  | p4  p5  p6  p7
721*c8dee2aaSAndroid Build Coastguard Worker        // ggga_hi_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15
722*c8dee2aaSAndroid Build Coastguard Worker        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
723*c8dee2aaSAndroid Build Coastguard Worker                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);
724*c8dee2aaSAndroid Build Coastguard Worker
725*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) (dst +  0), ggga_lo_shuffle);
726*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) (dst +  8), ggga_hi_shuffle);
727*c8dee2aaSAndroid Build Coastguard Worker
728*c8dee2aaSAndroid Build Coastguard Worker        src += 16*2;
729*c8dee2aaSAndroid Build Coastguard Worker        dst += 16;
730*c8dee2aaSAndroid Build Coastguard Worker        count -= 16;
731*c8dee2aaSAndroid Build Coastguard Worker    }
732*c8dee2aaSAndroid Build Coastguard Worker
733*c8dee2aaSAndroid Build Coastguard Worker    grayA_to_RGBA_portable(dst, src, count);
734*c8dee2aaSAndroid Build Coastguard Worker}
735*c8dee2aaSAndroid Build Coastguard Worker
736*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
737*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 16) {
738*c8dee2aaSAndroid Build Coastguard Worker        __m256i grayA = _mm256_loadu_si256((const __m256i*) src);
739*c8dee2aaSAndroid Build Coastguard Worker
740*c8dee2aaSAndroid Build Coastguard Worker        __m256i g0 = _mm256_and_si256(grayA, _mm256_set1_epi16(0x00FF));
741*c8dee2aaSAndroid Build Coastguard Worker        __m256i a0 = _mm256_srli_epi16(grayA, 8);
742*c8dee2aaSAndroid Build Coastguard Worker
743*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply
744*c8dee2aaSAndroid Build Coastguard Worker        g0 = scale(g0, a0);
745*c8dee2aaSAndroid Build Coastguard Worker
746*c8dee2aaSAndroid Build Coastguard Worker        __m256i gg = _mm256_or_si256(g0, _mm256_slli_epi16(g0, 8));
747*c8dee2aaSAndroid Build Coastguard Worker        __m256i ga = _mm256_or_si256(g0, _mm256_slli_epi16(a0, 8));
748*c8dee2aaSAndroid Build Coastguard Worker
749*c8dee2aaSAndroid Build Coastguard Worker        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
750*c8dee2aaSAndroid Build Coastguard Worker        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);
751*c8dee2aaSAndroid Build Coastguard Worker
752*c8dee2aaSAndroid Build Coastguard Worker        // Shuffle for pixel reorder, similar as grayA_to_RGBA
753*c8dee2aaSAndroid Build Coastguard Worker        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
754*c8dee2aaSAndroid Build Coastguard Worker                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);
755*c8dee2aaSAndroid Build Coastguard Worker
756*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) (dst +  0), ggga_lo_shuffle);
757*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) (dst +  8), ggga_hi_shuffle);
758*c8dee2aaSAndroid Build Coastguard Worker
759*c8dee2aaSAndroid Build Coastguard Worker        src += 16*2;
760*c8dee2aaSAndroid Build Coastguard Worker        dst += 16;
761*c8dee2aaSAndroid Build Coastguard Worker        count -= 16;
762*c8dee2aaSAndroid Build Coastguard Worker    }
763*c8dee2aaSAndroid Build Coastguard Worker
764*c8dee2aaSAndroid Build Coastguard Worker    grayA_to_rgbA_portable(dst, src, count);
765*c8dee2aaSAndroid Build Coastguard Worker}
766*c8dee2aaSAndroid Build Coastguard Worker
// Selects the output channel order used by inverted_cmyk_to().
enum Format {
    kRGB1,
    kBGR1,
};
768*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
769*c8dee2aaSAndroid Build Coastguard Worker    auto convert8 = [=](__m256i* lo, __m256i* hi) {
770*c8dee2aaSAndroid Build Coastguard Worker        const __m256i zeros = _mm256_setzero_si256();
771*c8dee2aaSAndroid Build Coastguard Worker        __m256i planar;
772*c8dee2aaSAndroid Build Coastguard Worker        if (kBGR1 == format) {
773*c8dee2aaSAndroid Build Coastguard Worker            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
774*c8dee2aaSAndroid Build Coastguard Worker                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
775*c8dee2aaSAndroid Build Coastguard Worker        } else {
776*c8dee2aaSAndroid Build Coastguard Worker            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
777*c8dee2aaSAndroid Build Coastguard Worker                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
778*c8dee2aaSAndroid Build Coastguard Worker        }
779*c8dee2aaSAndroid Build Coastguard Worker
780*c8dee2aaSAndroid Build Coastguard Worker        // Swizzle the pixels to 8-bit planar.
781*c8dee2aaSAndroid Build Coastguard Worker        *lo = _mm256_shuffle_epi8(*lo, planar);            // ccccmmmm yyyykkkk ccccmmmm yyyykkkk
782*c8dee2aaSAndroid Build Coastguard Worker        *hi = _mm256_shuffle_epi8(*hi, planar);            // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK
783*c8dee2aaSAndroid Build Coastguard Worker        __m256i cm = _mm256_unpacklo_epi32(*lo, *hi),      // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM
784*c8dee2aaSAndroid Build Coastguard Worker                yk = _mm256_unpackhi_epi32(*lo, *hi);      // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK
785*c8dee2aaSAndroid Build Coastguard Worker
786*c8dee2aaSAndroid Build Coastguard Worker        // Unpack to 16-bit planar.
787*c8dee2aaSAndroid Build Coastguard Worker        __m256i c = _mm256_unpacklo_epi8(cm, zeros),       // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_
788*c8dee2aaSAndroid Build Coastguard Worker                m = _mm256_unpackhi_epi8(cm, zeros),       // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_
789*c8dee2aaSAndroid Build Coastguard Worker                y = _mm256_unpacklo_epi8(yk, zeros),       // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_
790*c8dee2aaSAndroid Build Coastguard Worker                k = _mm256_unpackhi_epi8(yk, zeros);       // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_
791*c8dee2aaSAndroid Build Coastguard Worker
792*c8dee2aaSAndroid Build Coastguard Worker        // Scale to r, g, b.
793*c8dee2aaSAndroid Build Coastguard Worker        __m256i r = scale(c, k),
794*c8dee2aaSAndroid Build Coastguard Worker                g = scale(m, k),
795*c8dee2aaSAndroid Build Coastguard Worker                b = scale(y, k);
796*c8dee2aaSAndroid Build Coastguard Worker
797*c8dee2aaSAndroid Build Coastguard Worker        // Repack into interlaced pixels:
798*c8dee2aaSAndroid Build Coastguard Worker        //     rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
799*c8dee2aaSAndroid Build Coastguard Worker        //     ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1
800*c8dee2aaSAndroid Build Coastguard Worker        __m256i rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)),
801*c8dee2aaSAndroid Build Coastguard Worker                ba = _mm256_or_si256(b, _mm256_set1_epi16((uint16_t) 0xFF00));
802*c8dee2aaSAndroid Build Coastguard Worker        *lo = _mm256_unpacklo_epi16(rg, ba);               // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1
803*c8dee2aaSAndroid Build Coastguard Worker        *hi = _mm256_unpackhi_epi16(rg, ba);               // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1
804*c8dee2aaSAndroid Build Coastguard Worker    };
805*c8dee2aaSAndroid Build Coastguard Worker
806*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 16) {
807*c8dee2aaSAndroid Build Coastguard Worker        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
808*c8dee2aaSAndroid Build Coastguard Worker                hi = _mm256_loadu_si256((const __m256i*) (src + 8));
809*c8dee2aaSAndroid Build Coastguard Worker
810*c8dee2aaSAndroid Build Coastguard Worker        convert8(&lo, &hi);
811*c8dee2aaSAndroid Build Coastguard Worker
812*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
813*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) (dst + 8), hi);
814*c8dee2aaSAndroid Build Coastguard Worker
815*c8dee2aaSAndroid Build Coastguard Worker        src += 16;
816*c8dee2aaSAndroid Build Coastguard Worker        dst += 16;
817*c8dee2aaSAndroid Build Coastguard Worker        count -= 16;
818*c8dee2aaSAndroid Build Coastguard Worker    }
819*c8dee2aaSAndroid Build Coastguard Worker
820*c8dee2aaSAndroid Build Coastguard Worker    if (count >= 8) {
821*c8dee2aaSAndroid Build Coastguard Worker        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
822*c8dee2aaSAndroid Build Coastguard Worker                hi = _mm256_setzero_si256();
823*c8dee2aaSAndroid Build Coastguard Worker
824*c8dee2aaSAndroid Build Coastguard Worker        convert8(&lo, &hi);
825*c8dee2aaSAndroid Build Coastguard Worker
826*c8dee2aaSAndroid Build Coastguard Worker        _mm256_storeu_si256((__m256i*) dst, lo);
827*c8dee2aaSAndroid Build Coastguard Worker
828*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
829*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
830*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
831*c8dee2aaSAndroid Build Coastguard Worker    }
832*c8dee2aaSAndroid Build Coastguard Worker
833*c8dee2aaSAndroid Build Coastguard Worker    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
834*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
835*c8dee2aaSAndroid Build Coastguard Worker}
836*c8dee2aaSAndroid Build Coastguard Worker
837*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
838*c8dee2aaSAndroid Build Coastguard Worker    inverted_cmyk_to(kRGB1, dst, src, count);
839*c8dee2aaSAndroid Build Coastguard Worker}
840*c8dee2aaSAndroid Build Coastguard Worker
841*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
842*c8dee2aaSAndroid Build Coastguard Worker    inverted_cmyk_to(kBGR1, dst, src, count);
843*c8dee2aaSAndroid Build Coastguard Worker}
844*c8dee2aaSAndroid Build Coastguard Worker
845*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
846*c8dee2aaSAndroid Build Coastguard Worker    rgbA_to_RGBA_portable(dst, src, count);
847*c8dee2aaSAndroid Build Coastguard Worker}
848*c8dee2aaSAndroid Build Coastguard Worker
849*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
850*c8dee2aaSAndroid Build Coastguard Worker    rgbA_to_BGRA_portable(dst, src, count);
851*c8dee2aaSAndroid Build Coastguard Worker}
852*c8dee2aaSAndroid Build Coastguard Worker
853*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
854*c8dee2aaSAndroid Build Coastguard Worker// -- SSSE3 ----------------------------------------------------------------------------------------
855*c8dee2aaSAndroid Build Coastguard Worker
// Multiplies one byte value by another and divides by 255 with rounding,
// independently in each of the eight 16-bit lanes.
// Every lane of x and y holds a value that fits in 8 bits.
static __m128i scale(__m128i x, __m128i y) {
    // Lane-wise product; at most 255*255, so it still fits in 16 bits.
    const __m128i product = _mm_mullo_epi16(x, y);

    // (p+127)/255 == ((p+128)*257)>>16 for 0 <= p <= 255*255.
    // _mm_mulhi_epu16 yields the upper 16 bits of the unsigned 32-bit product,
    // which is exactly the ">>16" part.
    const __m128i biased = _mm_add_epi16(product, _mm_set1_epi16(128));
    return _mm_mulhi_epu16(biased, _mm_set1_epi16(257));
}
865*c8dee2aaSAndroid Build Coastguard Worker
866*c8dee2aaSAndroid Build Coastguard Workerstatic void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
867*c8dee2aaSAndroid Build Coastguard Worker
868*c8dee2aaSAndroid Build Coastguard Worker    auto premul8 = [=](__m128i* lo, __m128i* hi) {
869*c8dee2aaSAndroid Build Coastguard Worker        const __m128i zeros = _mm_setzero_si128();
870*c8dee2aaSAndroid Build Coastguard Worker        __m128i planar;
871*c8dee2aaSAndroid Build Coastguard Worker        if (kSwapRB) {
872*c8dee2aaSAndroid Build Coastguard Worker            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
873*c8dee2aaSAndroid Build Coastguard Worker        } else {
874*c8dee2aaSAndroid Build Coastguard Worker            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
875*c8dee2aaSAndroid Build Coastguard Worker        }
876*c8dee2aaSAndroid Build Coastguard Worker
877*c8dee2aaSAndroid Build Coastguard Worker        // Swizzle the pixels to 8-bit planar.
878*c8dee2aaSAndroid Build Coastguard Worker        *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
879*c8dee2aaSAndroid Build Coastguard Worker        *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
880*c8dee2aaSAndroid Build Coastguard Worker        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
881*c8dee2aaSAndroid Build Coastguard Worker                ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA
882*c8dee2aaSAndroid Build Coastguard Worker
883*c8dee2aaSAndroid Build Coastguard Worker        // Unpack to 16-bit planar.
884*c8dee2aaSAndroid Build Coastguard Worker        __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
885*c8dee2aaSAndroid Build Coastguard Worker                g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
886*c8dee2aaSAndroid Build Coastguard Worker                b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
887*c8dee2aaSAndroid Build Coastguard Worker                a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_
888*c8dee2aaSAndroid Build Coastguard Worker
889*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply!
890*c8dee2aaSAndroid Build Coastguard Worker        r = scale(r, a);
891*c8dee2aaSAndroid Build Coastguard Worker        g = scale(g, a);
892*c8dee2aaSAndroid Build Coastguard Worker        b = scale(b, a);
893*c8dee2aaSAndroid Build Coastguard Worker
894*c8dee2aaSAndroid Build Coastguard Worker        // Repack into interlaced pixels.
895*c8dee2aaSAndroid Build Coastguard Worker        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
896*c8dee2aaSAndroid Build Coastguard Worker        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
897*c8dee2aaSAndroid Build Coastguard Worker        *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
898*c8dee2aaSAndroid Build Coastguard Worker        *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
899*c8dee2aaSAndroid Build Coastguard Worker    };
900*c8dee2aaSAndroid Build Coastguard Worker
901*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
902*c8dee2aaSAndroid Build Coastguard Worker        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
903*c8dee2aaSAndroid Build Coastguard Worker                hi = _mm_loadu_si128((const __m128i*) (src + 4));
904*c8dee2aaSAndroid Build Coastguard Worker
905*c8dee2aaSAndroid Build Coastguard Worker        premul8(&lo, &hi);
906*c8dee2aaSAndroid Build Coastguard Worker
907*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) (dst + 0), lo);
908*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) (dst + 4), hi);
909*c8dee2aaSAndroid Build Coastguard Worker
910*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
911*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
912*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
913*c8dee2aaSAndroid Build Coastguard Worker    }
914*c8dee2aaSAndroid Build Coastguard Worker
915*c8dee2aaSAndroid Build Coastguard Worker    if (count >= 4) {
916*c8dee2aaSAndroid Build Coastguard Worker        __m128i lo = _mm_loadu_si128((const __m128i*) src),
917*c8dee2aaSAndroid Build Coastguard Worker                hi = _mm_setzero_si128();
918*c8dee2aaSAndroid Build Coastguard Worker
919*c8dee2aaSAndroid Build Coastguard Worker        premul8(&lo, &hi);
920*c8dee2aaSAndroid Build Coastguard Worker
921*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) dst, lo);
922*c8dee2aaSAndroid Build Coastguard Worker
923*c8dee2aaSAndroid Build Coastguard Worker        src += 4;
924*c8dee2aaSAndroid Build Coastguard Worker        dst += 4;
925*c8dee2aaSAndroid Build Coastguard Worker        count -= 4;
926*c8dee2aaSAndroid Build Coastguard Worker    }
927*c8dee2aaSAndroid Build Coastguard Worker
928*c8dee2aaSAndroid Build Coastguard Worker    // Call portable code to finish up the tail of [0,4) pixels.
929*c8dee2aaSAndroid Build Coastguard Worker    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
930*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
931*c8dee2aaSAndroid Build Coastguard Worker}
932*c8dee2aaSAndroid Build Coastguard Worker
933*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
934*c8dee2aaSAndroid Build Coastguard Worker    premul_should_swapRB(false, dst, src, count);
935*c8dee2aaSAndroid Build Coastguard Worker}
936*c8dee2aaSAndroid Build Coastguard Worker
937*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
938*c8dee2aaSAndroid Build Coastguard Worker    premul_should_swapRB(true, dst, src, count);
939*c8dee2aaSAndroid Build Coastguard Worker}
940*c8dee2aaSAndroid Build Coastguard Worker
941*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
942*c8dee2aaSAndroid Build Coastguard Worker    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
943*c8dee2aaSAndroid Build Coastguard Worker
944*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 4) {
945*c8dee2aaSAndroid Build Coastguard Worker        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
946*c8dee2aaSAndroid Build Coastguard Worker        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
947*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) dst, bgra);
948*c8dee2aaSAndroid Build Coastguard Worker
949*c8dee2aaSAndroid Build Coastguard Worker        src += 4;
950*c8dee2aaSAndroid Build Coastguard Worker        dst += 4;
951*c8dee2aaSAndroid Build Coastguard Worker        count -= 4;
952*c8dee2aaSAndroid Build Coastguard Worker    }
953*c8dee2aaSAndroid Build Coastguard Worker
954*c8dee2aaSAndroid Build Coastguard Worker    RGBA_to_BGRA_portable(dst, src, count);
955*c8dee2aaSAndroid Build Coastguard Worker}
956*c8dee2aaSAndroid Build Coastguard Worker
957*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
958*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
959*c8dee2aaSAndroid Build Coastguard Worker        __m128i ga = _mm_loadu_si128((const __m128i*) src);
960*c8dee2aaSAndroid Build Coastguard Worker
961*c8dee2aaSAndroid Build Coastguard Worker        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
962*c8dee2aaSAndroid Build Coastguard Worker                                  _mm_slli_epi16(ga, 8));
963*c8dee2aaSAndroid Build Coastguard Worker
964*c8dee2aaSAndroid Build Coastguard Worker        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
965*c8dee2aaSAndroid Build Coastguard Worker        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
966*c8dee2aaSAndroid Build Coastguard Worker
967*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
968*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
969*c8dee2aaSAndroid Build Coastguard Worker
970*c8dee2aaSAndroid Build Coastguard Worker        src += 8*2;
971*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
972*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
973*c8dee2aaSAndroid Build Coastguard Worker    }
974*c8dee2aaSAndroid Build Coastguard Worker
975*c8dee2aaSAndroid Build Coastguard Worker    grayA_to_RGBA_portable(dst, src, count);
976*c8dee2aaSAndroid Build Coastguard Worker}
977*c8dee2aaSAndroid Build Coastguard Worker
978*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
979*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
980*c8dee2aaSAndroid Build Coastguard Worker        __m128i grayA = _mm_loadu_si128((const __m128i*) src);
981*c8dee2aaSAndroid Build Coastguard Worker
982*c8dee2aaSAndroid Build Coastguard Worker        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
983*c8dee2aaSAndroid Build Coastguard Worker        __m128i a0 = _mm_srli_epi16(grayA, 8);
984*c8dee2aaSAndroid Build Coastguard Worker
985*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply
986*c8dee2aaSAndroid Build Coastguard Worker        g0 = scale(g0, a0);
987*c8dee2aaSAndroid Build Coastguard Worker
988*c8dee2aaSAndroid Build Coastguard Worker        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
989*c8dee2aaSAndroid Build Coastguard Worker        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
990*c8dee2aaSAndroid Build Coastguard Worker
991*c8dee2aaSAndroid Build Coastguard Worker
992*c8dee2aaSAndroid Build Coastguard Worker        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
993*c8dee2aaSAndroid Build Coastguard Worker        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
994*c8dee2aaSAndroid Build Coastguard Worker
995*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
996*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
997*c8dee2aaSAndroid Build Coastguard Worker
998*c8dee2aaSAndroid Build Coastguard Worker        src += 8*2;
999*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
1000*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
1001*c8dee2aaSAndroid Build Coastguard Worker    }
1002*c8dee2aaSAndroid Build Coastguard Worker
1003*c8dee2aaSAndroid Build Coastguard Worker    grayA_to_rgbA_portable(dst, src, count);
1004*c8dee2aaSAndroid Build Coastguard Worker}
1005*c8dee2aaSAndroid Build Coastguard Worker
// Output byte orders understood by inverted_cmyk_to().
enum Format {
    kRGB1,  // keep the source byte order
    kBGR1,  // exchange bytes 0 and 2 of every pixel
};
1007*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
1008*c8dee2aaSAndroid Build Coastguard Worker    auto convert8 = [=](__m128i* lo, __m128i* hi) {
1009*c8dee2aaSAndroid Build Coastguard Worker        const __m128i zeros = _mm_setzero_si128();
1010*c8dee2aaSAndroid Build Coastguard Worker        __m128i planar;
1011*c8dee2aaSAndroid Build Coastguard Worker        if (kBGR1 == format) {
1012*c8dee2aaSAndroid Build Coastguard Worker            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
1013*c8dee2aaSAndroid Build Coastguard Worker        } else {
1014*c8dee2aaSAndroid Build Coastguard Worker            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
1015*c8dee2aaSAndroid Build Coastguard Worker        }
1016*c8dee2aaSAndroid Build Coastguard Worker
1017*c8dee2aaSAndroid Build Coastguard Worker        // Swizzle the pixels to 8-bit planar.
1018*c8dee2aaSAndroid Build Coastguard Worker        *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
1019*c8dee2aaSAndroid Build Coastguard Worker        *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
1020*c8dee2aaSAndroid Build Coastguard Worker        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
1021*c8dee2aaSAndroid Build Coastguard Worker                yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK
1022*c8dee2aaSAndroid Build Coastguard Worker
1023*c8dee2aaSAndroid Build Coastguard Worker        // Unpack to 16-bit planar.
1024*c8dee2aaSAndroid Build Coastguard Worker        __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
1025*c8dee2aaSAndroid Build Coastguard Worker                m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
1026*c8dee2aaSAndroid Build Coastguard Worker                y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
1027*c8dee2aaSAndroid Build Coastguard Worker                k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_
1028*c8dee2aaSAndroid Build Coastguard Worker
1029*c8dee2aaSAndroid Build Coastguard Worker        // Scale to r, g, b.
1030*c8dee2aaSAndroid Build Coastguard Worker        __m128i r = scale(c, k),
1031*c8dee2aaSAndroid Build Coastguard Worker                g = scale(m, k),
1032*c8dee2aaSAndroid Build Coastguard Worker                b = scale(y, k);
1033*c8dee2aaSAndroid Build Coastguard Worker
1034*c8dee2aaSAndroid Build Coastguard Worker        // Repack into interlaced pixels.
1035*c8dee2aaSAndroid Build Coastguard Worker        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
1036*c8dee2aaSAndroid Build Coastguard Worker                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
1037*c8dee2aaSAndroid Build Coastguard Worker        *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgbargba rgbargba
1038*c8dee2aaSAndroid Build Coastguard Worker        *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
1039*c8dee2aaSAndroid Build Coastguard Worker    };
1040*c8dee2aaSAndroid Build Coastguard Worker
1041*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
1042*c8dee2aaSAndroid Build Coastguard Worker        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
1043*c8dee2aaSAndroid Build Coastguard Worker                hi = _mm_loadu_si128((const __m128i*) (src + 4));
1044*c8dee2aaSAndroid Build Coastguard Worker
1045*c8dee2aaSAndroid Build Coastguard Worker        convert8(&lo, &hi);
1046*c8dee2aaSAndroid Build Coastguard Worker
1047*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) (dst + 0), lo);
1048*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) (dst + 4), hi);
1049*c8dee2aaSAndroid Build Coastguard Worker
1050*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
1051*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
1052*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
1053*c8dee2aaSAndroid Build Coastguard Worker    }
1054*c8dee2aaSAndroid Build Coastguard Worker
1055*c8dee2aaSAndroid Build Coastguard Worker    if (count >= 4) {
1056*c8dee2aaSAndroid Build Coastguard Worker        __m128i lo = _mm_loadu_si128((const __m128i*) src),
1057*c8dee2aaSAndroid Build Coastguard Worker                hi = _mm_setzero_si128();
1058*c8dee2aaSAndroid Build Coastguard Worker
1059*c8dee2aaSAndroid Build Coastguard Worker        convert8(&lo, &hi);
1060*c8dee2aaSAndroid Build Coastguard Worker
1061*c8dee2aaSAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) dst, lo);
1062*c8dee2aaSAndroid Build Coastguard Worker
1063*c8dee2aaSAndroid Build Coastguard Worker        src += 4;
1064*c8dee2aaSAndroid Build Coastguard Worker        dst += 4;
1065*c8dee2aaSAndroid Build Coastguard Worker        count -= 4;
1066*c8dee2aaSAndroid Build Coastguard Worker    }
1067*c8dee2aaSAndroid Build Coastguard Worker
1068*c8dee2aaSAndroid Build Coastguard Worker    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
1069*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
1070*c8dee2aaSAndroid Build Coastguard Worker}
1071*c8dee2aaSAndroid Build Coastguard Worker
1072*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1073*c8dee2aaSAndroid Build Coastguard Worker    inverted_cmyk_to(kRGB1, dst, src, count);
1074*c8dee2aaSAndroid Build Coastguard Worker}
1075*c8dee2aaSAndroid Build Coastguard Worker
1076*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1077*c8dee2aaSAndroid Build Coastguard Worker    inverted_cmyk_to(kBGR1, dst, src, count);
1078*c8dee2aaSAndroid Build Coastguard Worker}
1079*c8dee2aaSAndroid Build Coastguard Worker
1080*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
1081*c8dee2aaSAndroid Build Coastguard Worker    rgbA_to_RGBA_portable(dst, src, count);
1082*c8dee2aaSAndroid Build Coastguard Worker}
1083*c8dee2aaSAndroid Build Coastguard Worker
1084*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1085*c8dee2aaSAndroid Build Coastguard Worker    rgbA_to_BGRA_portable(dst, src, count);
1086*c8dee2aaSAndroid Build Coastguard Worker}
1087*c8dee2aaSAndroid Build Coastguard Worker
1088*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
1089*c8dee2aaSAndroid Build Coastguard Worker// -- LASX ----------------------------------------------------------------------------------------
1090*c8dee2aaSAndroid Build Coastguard Worker
1091*c8dee2aaSAndroid Build Coastguard Worker// Scale a byte by another.
1092*c8dee2aaSAndroid Build Coastguard Worker// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
1093*c8dee2aaSAndroid Build Coastguard Worker// (x+127)/255 == ((x+128)*257)>>16
1094*c8dee2aaSAndroid Build Coastguard WorkerSI __m256i scale(__m256i x, __m256i y) {
1095*c8dee2aaSAndroid Build Coastguard Worker    const __m256i _128 = __lasx_xvreplgr2vr_h(128);
1096*c8dee2aaSAndroid Build Coastguard Worker    const __m256i _257 = __lasx_xvreplgr2vr_h(257);
1097*c8dee2aaSAndroid Build Coastguard Worker
1098*c8dee2aaSAndroid Build Coastguard Worker    // (x+127)/255 == ((x+128)*257)>>16
1099*c8dee2aaSAndroid Build Coastguard Worker    return __lasx_xvmuh_hu(__lasx_xvadd_h(__lasx_xvmul_h(x, y), _128), _257);
1100*c8dee2aaSAndroid Build Coastguard Worker}
1101*c8dee2aaSAndroid Build Coastguard Worker
1102*c8dee2aaSAndroid Build Coastguard Workerstatic void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
1103*c8dee2aaSAndroid Build Coastguard Worker    auto premul8 = [=](__m256i* lo, __m256i* hi) {
1104*c8dee2aaSAndroid Build Coastguard Worker        const __m256i zeros = __lasx_xvldi(0);
1105*c8dee2aaSAndroid Build Coastguard Worker        __m256i planar = __lasx_xvldi(0);
1106*c8dee2aaSAndroid Build Coastguard Worker        if (kSwapRB) {
1107*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,0);
1108*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,1);
1109*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,2);
1110*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,3);
1111*c8dee2aaSAndroid Build Coastguard Worker        } else {
1112*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,0);
1113*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,1);
1114*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,2);
1115*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,3);
1116*c8dee2aaSAndroid Build Coastguard Worker        }
1117*c8dee2aaSAndroid Build Coastguard Worker
1118*c8dee2aaSAndroid Build Coastguard Worker        // Swizzle the pixels to 8-bit planar.
1119*c8dee2aaSAndroid Build Coastguard Worker        *lo = __lasx_xvshuf_b(zeros, *lo, planar);      // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa
1120*c8dee2aaSAndroid Build Coastguard Worker        *hi = __lasx_xvshuf_b(zeros, *hi, planar);      // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA
1121*c8dee2aaSAndroid Build Coastguard Worker        __m256i rg = __lasx_xvilvl_w(*hi, *lo),         // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG
1122*c8dee2aaSAndroid Build Coastguard Worker                ba = __lasx_xvilvh_w(*hi, *lo);         // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA
1123*c8dee2aaSAndroid Build Coastguard Worker
1124*c8dee2aaSAndroid Build Coastguard Worker        // Unpack to 16-bit planar.
1125*c8dee2aaSAndroid Build Coastguard Worker        __m256i r = __lasx_xvilvl_b(zeros, rg),         // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_
1126*c8dee2aaSAndroid Build Coastguard Worker                g = __lasx_xvilvh_b(zeros, rg),         // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_
1127*c8dee2aaSAndroid Build Coastguard Worker                b = __lasx_xvilvl_b(zeros, ba),         // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_
1128*c8dee2aaSAndroid Build Coastguard Worker                a = __lasx_xvilvh_b(zeros, ba);         // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_
1129*c8dee2aaSAndroid Build Coastguard Worker
1130*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply!
1131*c8dee2aaSAndroid Build Coastguard Worker        r = scale(r, a);
1132*c8dee2aaSAndroid Build Coastguard Worker        g = scale(g, a);
1133*c8dee2aaSAndroid Build Coastguard Worker        b = scale(b, a);
1134*c8dee2aaSAndroid Build Coastguard Worker
1135*c8dee2aaSAndroid Build Coastguard Worker        // Repack into interlaced pixels.
1136*c8dee2aaSAndroid Build Coastguard Worker        rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8));   // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
1137*c8dee2aaSAndroid Build Coastguard Worker        ba = __lasx_xvor_v(b, __lasx_xvslli_h(a, 8));   // babababa BABABABA babababa BABABABA
1138*c8dee2aaSAndroid Build Coastguard Worker        *lo = __lasx_xvilvl_h(ba, rg);                  // rgbargba rgbargba rgbargba rgbargba
1139*c8dee2aaSAndroid Build Coastguard Worker        *hi = __lasx_xvilvh_h(ba, rg);                  // RGBARGBA RGBARGBA RGBARGBA RGBARGBA
1140*c8dee2aaSAndroid Build Coastguard Worker    };
1141*c8dee2aaSAndroid Build Coastguard Worker
1142*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 16) {
1143*c8dee2aaSAndroid Build Coastguard Worker        __m256i lo = __lasx_xvld(src, 0),
1144*c8dee2aaSAndroid Build Coastguard Worker                hi = __lasx_xvld(src, 32);
1145*c8dee2aaSAndroid Build Coastguard Worker
1146*c8dee2aaSAndroid Build Coastguard Worker        premul8(&lo, &hi);
1147*c8dee2aaSAndroid Build Coastguard Worker
1148*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(lo, dst, 0);
1149*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(hi, dst, 32);
1150*c8dee2aaSAndroid Build Coastguard Worker
1151*c8dee2aaSAndroid Build Coastguard Worker        src += 16;
1152*c8dee2aaSAndroid Build Coastguard Worker        dst += 16;
1153*c8dee2aaSAndroid Build Coastguard Worker        count -= 16;
1154*c8dee2aaSAndroid Build Coastguard Worker    }
1155*c8dee2aaSAndroid Build Coastguard Worker
1156*c8dee2aaSAndroid Build Coastguard Worker    if (count >= 8) {
1157*c8dee2aaSAndroid Build Coastguard Worker        __m256i lo = __lasx_xvld(src, 0),
1158*c8dee2aaSAndroid Build Coastguard Worker                hi = __lasx_xvldi(0);
1159*c8dee2aaSAndroid Build Coastguard Worker
1160*c8dee2aaSAndroid Build Coastguard Worker        premul8(&lo, &hi);
1161*c8dee2aaSAndroid Build Coastguard Worker
1162*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(lo, dst, 0);
1163*c8dee2aaSAndroid Build Coastguard Worker
1164*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
1165*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
1166*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
1167*c8dee2aaSAndroid Build Coastguard Worker    }
1168*c8dee2aaSAndroid Build Coastguard Worker
1169*c8dee2aaSAndroid Build Coastguard Worker    // Call portable code to finish up the tail of [0,4) pixels.
1170*c8dee2aaSAndroid Build Coastguard Worker    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
1171*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
1172*c8dee2aaSAndroid Build Coastguard Worker}
1173*c8dee2aaSAndroid Build Coastguard Worker
1174*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
1175*c8dee2aaSAndroid Build Coastguard Worker    premul_should_swapRB(false, dst, src, count);
1176*c8dee2aaSAndroid Build Coastguard Worker}
1177*c8dee2aaSAndroid Build Coastguard Worker
1178*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
1179*c8dee2aaSAndroid Build Coastguard Worker    premul_should_swapRB(true, dst, src, count);
1180*c8dee2aaSAndroid Build Coastguard Worker}
1181*c8dee2aaSAndroid Build Coastguard Worker
1182*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1183*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
1184*c8dee2aaSAndroid Build Coastguard Worker        __m256i rgba = __lasx_xvld(src, 0);
1185*c8dee2aaSAndroid Build Coastguard Worker        __m256i bgra = __lasx_xvshuf4i_b(rgba, 0xC6);
1186*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(bgra, dst, 0);
1187*c8dee2aaSAndroid Build Coastguard Worker
1188*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
1189*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
1190*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
1191*c8dee2aaSAndroid Build Coastguard Worker    }
1192*c8dee2aaSAndroid Build Coastguard Worker
1193*c8dee2aaSAndroid Build Coastguard Worker    RGBA_to_BGRA_portable(dst, src, count);
1194*c8dee2aaSAndroid Build Coastguard Worker}
1195*c8dee2aaSAndroid Build Coastguard Worker
1196*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
1197*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 16) {
1198*c8dee2aaSAndroid Build Coastguard Worker        __m256i ga = __lasx_xvld(src, 0);
1199*c8dee2aaSAndroid Build Coastguard Worker
1200*c8dee2aaSAndroid Build Coastguard Worker        __m256i gg = __lasx_xvor_v(__lasx_xvand_v(ga, __lasx_xvreplgr2vr_h(0x00FF)),
1201*c8dee2aaSAndroid Build Coastguard Worker                                   __lasx_xvslli_h(ga, 8));
1202*c8dee2aaSAndroid Build Coastguard Worker
1203*c8dee2aaSAndroid Build Coastguard Worker        __m256i ggga_lo = __lasx_xvilvl_h(ga, gg);
1204*c8dee2aaSAndroid Build Coastguard Worker        __m256i ggga_hi = __lasx_xvilvh_h(ga, gg);
1205*c8dee2aaSAndroid Build Coastguard Worker
1206*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02), dst, 0);
1207*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13), dst, 32);
1208*c8dee2aaSAndroid Build Coastguard Worker
1209*c8dee2aaSAndroid Build Coastguard Worker        src += 16*2;
1210*c8dee2aaSAndroid Build Coastguard Worker        dst += 16;
1211*c8dee2aaSAndroid Build Coastguard Worker        count -= 16;
1212*c8dee2aaSAndroid Build Coastguard Worker    }
1213*c8dee2aaSAndroid Build Coastguard Worker
1214*c8dee2aaSAndroid Build Coastguard Worker    grayA_to_RGBA_portable(dst, src, count);
1215*c8dee2aaSAndroid Build Coastguard Worker}
1216*c8dee2aaSAndroid Build Coastguard Worker
1217*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
1218*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 16) {
1219*c8dee2aaSAndroid Build Coastguard Worker        __m256i grayA = __lasx_xvld(src, 0);
1220*c8dee2aaSAndroid Build Coastguard Worker
1221*c8dee2aaSAndroid Build Coastguard Worker        __m256i val = __lasx_xvreplgr2vr_h(0x00FF);
1222*c8dee2aaSAndroid Build Coastguard Worker
1223*c8dee2aaSAndroid Build Coastguard Worker        __m256i g0 = __lasx_xvand_v(grayA, val);
1224*c8dee2aaSAndroid Build Coastguard Worker        __m256i a0 = __lasx_xvsrli_h(grayA, 8);
1225*c8dee2aaSAndroid Build Coastguard Worker
1226*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply
1227*c8dee2aaSAndroid Build Coastguard Worker        g0 = scale(g0, a0);
1228*c8dee2aaSAndroid Build Coastguard Worker
1229*c8dee2aaSAndroid Build Coastguard Worker        __m256i gg = __lasx_xvor_v(g0, __lasx_xvslli_h(g0, 8));
1230*c8dee2aaSAndroid Build Coastguard Worker        __m256i ga = __lasx_xvor_v(g0, __lasx_xvslli_h(a0, 8));
1231*c8dee2aaSAndroid Build Coastguard Worker
1232*c8dee2aaSAndroid Build Coastguard Worker        __m256i ggga_lo = __lasx_xvilvl_h(ga, gg);
1233*c8dee2aaSAndroid Build Coastguard Worker        __m256i ggga_hi = __lasx_xvilvh_h(ga, gg);
1234*c8dee2aaSAndroid Build Coastguard Worker
1235*c8dee2aaSAndroid Build Coastguard Worker        val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02);
1236*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(val, dst, 0);
1237*c8dee2aaSAndroid Build Coastguard Worker
1238*c8dee2aaSAndroid Build Coastguard Worker        val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13);
1239*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(val, dst, 32);
1240*c8dee2aaSAndroid Build Coastguard Worker
1241*c8dee2aaSAndroid Build Coastguard Worker        src += 16*2;
1242*c8dee2aaSAndroid Build Coastguard Worker        dst += 16;
1243*c8dee2aaSAndroid Build Coastguard Worker        count -= 16;
1244*c8dee2aaSAndroid Build Coastguard Worker    }
1245*c8dee2aaSAndroid Build Coastguard Worker
1246*c8dee2aaSAndroid Build Coastguard Worker    grayA_to_rgbA_portable(dst, src, count);
1247*c8dee2aaSAndroid Build Coastguard Worker}
1248*c8dee2aaSAndroid Build Coastguard Worker
// Output channel order selector for inverted_cmyk_to().
enum Format {
    kRGB1,  // selects the inverted_CMYK_to_RGB1 behaviour
    kBGR1   // selects the inverted_CMYK_to_BGR1 behaviour
};
1250*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
1251*c8dee2aaSAndroid Build Coastguard Worker    auto convert8 = [=](__m256i *lo, __m256i* hi) {
1252*c8dee2aaSAndroid Build Coastguard Worker        const __m256i zeros = __lasx_xvldi(0);
1253*c8dee2aaSAndroid Build Coastguard Worker        __m256i planar = __lasx_xvldi(0);
1254*c8dee2aaSAndroid Build Coastguard Worker        if (kBGR1 == format) {
1255*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,0);
1256*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,1);
1257*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,2);
1258*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,3);
1259*c8dee2aaSAndroid Build Coastguard Worker        } else {
1260*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,0);
1261*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,1);
1262*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,2);
1263*c8dee2aaSAndroid Build Coastguard Worker            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,3);
1264*c8dee2aaSAndroid Build Coastguard Worker        }
1265*c8dee2aaSAndroid Build Coastguard Worker
1266*c8dee2aaSAndroid Build Coastguard Worker        // Swizzle the pixels to 8-bit planar.
1267*c8dee2aaSAndroid Build Coastguard Worker        *lo = __lasx_xvshuf_b(zeros, *lo, planar);   // ccccmmmm yyyykkkk ccccmmmm yyyykkkk
1268*c8dee2aaSAndroid Build Coastguard Worker        *hi = __lasx_xvshuf_b(zeros, *hi, planar);   // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK
1269*c8dee2aaSAndroid Build Coastguard Worker        __m256i cm = __lasx_xvilvl_w(*hi, *lo),      // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM
1270*c8dee2aaSAndroid Build Coastguard Worker                yk = __lasx_xvilvh_w(*hi, *lo);      // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK
1271*c8dee2aaSAndroid Build Coastguard Worker
1272*c8dee2aaSAndroid Build Coastguard Worker        // Unpack to 16-bit planar.
1273*c8dee2aaSAndroid Build Coastguard Worker        __m256i c = __lasx_xvilvl_b(zeros, cm),      // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_
1274*c8dee2aaSAndroid Build Coastguard Worker                m = __lasx_xvilvh_b(zeros, cm),      // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_
1275*c8dee2aaSAndroid Build Coastguard Worker                y = __lasx_xvilvl_b(zeros, yk),      // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_
1276*c8dee2aaSAndroid Build Coastguard Worker                k = __lasx_xvilvh_b(zeros, yk);      // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_
1277*c8dee2aaSAndroid Build Coastguard Worker
1278*c8dee2aaSAndroid Build Coastguard Worker        // Scale to r, g, b.
1279*c8dee2aaSAndroid Build Coastguard Worker        __m256i r = scale(c, k),
1280*c8dee2aaSAndroid Build Coastguard Worker                g = scale(m, k),
1281*c8dee2aaSAndroid Build Coastguard Worker                b = scale(y, k);
1282*c8dee2aaSAndroid Build Coastguard Worker
1283*c8dee2aaSAndroid Build Coastguard Worker        // Repack into interlaced pixels:
1284*c8dee2aaSAndroid Build Coastguard Worker        //     rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
1285*c8dee2aaSAndroid Build Coastguard Worker        //     ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1
1286*c8dee2aaSAndroid Build Coastguard Worker        __m256i rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8)),
1287*c8dee2aaSAndroid Build Coastguard Worker                ba = __lasx_xvor_v(b, __lasx_xvreplgr2vr_h(0xff00));
1288*c8dee2aaSAndroid Build Coastguard Worker        *lo = __lasx_xvilvl_h(ba, rg);               // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1
1289*c8dee2aaSAndroid Build Coastguard Worker        *hi = __lasx_xvilvh_h(ba, rg);               // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1
1290*c8dee2aaSAndroid Build Coastguard Worker    };
1291*c8dee2aaSAndroid Build Coastguard Worker
1292*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 16) {
1293*c8dee2aaSAndroid Build Coastguard Worker        __m256i lo = __lasx_xvld(src, 0),
1294*c8dee2aaSAndroid Build Coastguard Worker                hi = __lasx_xvld(src, 32);
1295*c8dee2aaSAndroid Build Coastguard Worker
1296*c8dee2aaSAndroid Build Coastguard Worker        convert8(&lo, &hi);
1297*c8dee2aaSAndroid Build Coastguard Worker
1298*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(lo, dst, 0);
1299*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(hi, dst, 32);
1300*c8dee2aaSAndroid Build Coastguard Worker
1301*c8dee2aaSAndroid Build Coastguard Worker        src += 16;
1302*c8dee2aaSAndroid Build Coastguard Worker        dst += 16;
1303*c8dee2aaSAndroid Build Coastguard Worker        count -= 16;
1304*c8dee2aaSAndroid Build Coastguard Worker    }
1305*c8dee2aaSAndroid Build Coastguard Worker
1306*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
1307*c8dee2aaSAndroid Build Coastguard Worker        __m256i lo = __lasx_xvld(src, 0),
1308*c8dee2aaSAndroid Build Coastguard Worker                hi = __lasx_xvldi(0);
1309*c8dee2aaSAndroid Build Coastguard Worker
1310*c8dee2aaSAndroid Build Coastguard Worker        convert8(&lo, &hi);
1311*c8dee2aaSAndroid Build Coastguard Worker
1312*c8dee2aaSAndroid Build Coastguard Worker        __lasx_xvst(lo, dst, 0);
1313*c8dee2aaSAndroid Build Coastguard Worker
1314*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
1315*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
1316*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
1317*c8dee2aaSAndroid Build Coastguard Worker    }
1318*c8dee2aaSAndroid Build Coastguard Worker
1319*c8dee2aaSAndroid Build Coastguard Worker    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
1320*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
1321*c8dee2aaSAndroid Build Coastguard Worker}
1322*c8dee2aaSAndroid Build Coastguard Worker
1323*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1324*c8dee2aaSAndroid Build Coastguard Worker    inverted_cmyk_to(kRGB1, dst, src, count);
1325*c8dee2aaSAndroid Build Coastguard Worker}
1326*c8dee2aaSAndroid Build Coastguard Worker
1327*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1328*c8dee2aaSAndroid Build Coastguard Worker    inverted_cmyk_to(kBGR1, dst, src, count);
1329*c8dee2aaSAndroid Build Coastguard Worker}
1330*c8dee2aaSAndroid Build Coastguard Worker
1331*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
1332*c8dee2aaSAndroid Build Coastguard Worker    rgbA_to_RGBA_portable(dst, src, count);
1333*c8dee2aaSAndroid Build Coastguard Worker}
1334*c8dee2aaSAndroid Build Coastguard Worker
1335*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1336*c8dee2aaSAndroid Build Coastguard Worker    rgbA_to_BGRA_portable(dst, src, count);
1337*c8dee2aaSAndroid Build Coastguard Worker}
1338*c8dee2aaSAndroid Build Coastguard Worker
1339*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
1340*c8dee2aaSAndroid Build Coastguard Worker// -- LSX -----------------------------------------------------------------------------------------
1341*c8dee2aaSAndroid Build Coastguard Worker
1342*c8dee2aaSAndroid Build Coastguard Worker// Scale a byte by another.
1343*c8dee2aaSAndroid Build Coastguard Worker// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
1344*c8dee2aaSAndroid Build Coastguard WorkerSI __m128i scale(__m128i x, __m128i y) {
1345*c8dee2aaSAndroid Build Coastguard Worker    const __m128i _128 = __lsx_vreplgr2vr_h(128);
1346*c8dee2aaSAndroid Build Coastguard Worker    const __m128i _257 = __lsx_vreplgr2vr_h(257);
1347*c8dee2aaSAndroid Build Coastguard Worker
1348*c8dee2aaSAndroid Build Coastguard Worker    // (x+127)/255 == ((x+128)*257)>>16
1349*c8dee2aaSAndroid Build Coastguard Worker    return __lsx_vmuh_hu(__lsx_vadd_h(__lsx_vmul_h(x, y), _128), _257);
1350*c8dee2aaSAndroid Build Coastguard Worker}
1351*c8dee2aaSAndroid Build Coastguard Worker
1352*c8dee2aaSAndroid Build Coastguard Workerstatic void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
1353*c8dee2aaSAndroid Build Coastguard Worker
1354*c8dee2aaSAndroid Build Coastguard Worker    auto premul8 = [=](__m128i *lo, __m128i *hi){
1355*c8dee2aaSAndroid Build Coastguard Worker        const __m128i zeros = __lsx_vldi(0);
1356*c8dee2aaSAndroid Build Coastguard Worker        __m128i planar = __lsx_vldi(0);
1357*c8dee2aaSAndroid Build Coastguard Worker        if (kSwapRB) {
1358*c8dee2aaSAndroid Build Coastguard Worker            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0);
1359*c8dee2aaSAndroid Build Coastguard Worker            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1);
1360*c8dee2aaSAndroid Build Coastguard Worker        } else {
1361*c8dee2aaSAndroid Build Coastguard Worker            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0);
1362*c8dee2aaSAndroid Build Coastguard Worker            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
1363*c8dee2aaSAndroid Build Coastguard Worker        }
1364*c8dee2aaSAndroid Build Coastguard Worker
1365*c8dee2aaSAndroid Build Coastguard Worker        // Swizzle the pixels to 8-bit planar.
1366*c8dee2aaSAndroid Build Coastguard Worker        *lo = __lsx_vshuf_b(zeros, *lo, planar);             // rrrrgggg bbbbaaaa
1367*c8dee2aaSAndroid Build Coastguard Worker        *hi = __lsx_vshuf_b(zeros, *hi, planar);             // RRRRGGGG BBBBAAAA
1368*c8dee2aaSAndroid Build Coastguard Worker        __m128i rg = __lsx_vilvl_w(*hi, *lo),                // rrrrRRRR ggggGGGG
1369*c8dee2aaSAndroid Build Coastguard Worker                ba = __lsx_vilvh_w(*hi, *lo);                // bbbbBBBB aaaaAAAA
1370*c8dee2aaSAndroid Build Coastguard Worker
1371*c8dee2aaSAndroid Build Coastguard Worker        // Unpack to 16-bit planar.
1372*c8dee2aaSAndroid Build Coastguard Worker        __m128i r = __lsx_vilvl_b(zeros, rg),                 // r_r_r_r_ R_R_R_R_
1373*c8dee2aaSAndroid Build Coastguard Worker                g = __lsx_vilvh_b(zeros, rg),                 // g_g_g_g_ G_G_G_G_
1374*c8dee2aaSAndroid Build Coastguard Worker                b = __lsx_vilvl_b(zeros, ba),                 // b_b_b_b_ B_B_B_B_
1375*c8dee2aaSAndroid Build Coastguard Worker                a = __lsx_vilvh_b(zeros, ba);                 // a_a_a_a_ A_A_A_A_
1376*c8dee2aaSAndroid Build Coastguard Worker
1377*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply!
1378*c8dee2aaSAndroid Build Coastguard Worker        r = scale(r, a);
1379*c8dee2aaSAndroid Build Coastguard Worker        g = scale(g, a);
1380*c8dee2aaSAndroid Build Coastguard Worker        b = scale(b, a);
1381*c8dee2aaSAndroid Build Coastguard Worker
1382*c8dee2aaSAndroid Build Coastguard Worker        // Repack into interlaced pixels.
1383*c8dee2aaSAndroid Build Coastguard Worker        rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8));             // rgrgrgrg RGRGRGRG
1384*c8dee2aaSAndroid Build Coastguard Worker        ba = __lsx_vor_v(b, __lsx_vslli_h(a, 8));             // babababa BABABABA
1385*c8dee2aaSAndroid Build Coastguard Worker        *lo = __lsx_vilvl_h(ba, rg);                          // rgbargba rgbargba
1386*c8dee2aaSAndroid Build Coastguard Worker        *hi = __lsx_vilvh_h(ba, rg);                          // RGBARGBA RGBARGBA
1387*c8dee2aaSAndroid Build Coastguard Worker    };
1388*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
1389*c8dee2aaSAndroid Build Coastguard Worker        __m128i lo = __lsx_vld(src ,0),
1390*c8dee2aaSAndroid Build Coastguard Worker                hi = __lsx_vld(src ,16);
1391*c8dee2aaSAndroid Build Coastguard Worker
1392*c8dee2aaSAndroid Build Coastguard Worker        premul8(&lo, &hi);
1393*c8dee2aaSAndroid Build Coastguard Worker
1394*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(lo, dst, 0);
1395*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(hi, dst, 16);
1396*c8dee2aaSAndroid Build Coastguard Worker
1397*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
1398*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
1399*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
1400*c8dee2aaSAndroid Build Coastguard Worker    }
1401*c8dee2aaSAndroid Build Coastguard Worker
1402*c8dee2aaSAndroid Build Coastguard Worker    if (count >= 4) {
1403*c8dee2aaSAndroid Build Coastguard Worker        __m128i lo = __lsx_vld(src, 0),
1404*c8dee2aaSAndroid Build Coastguard Worker                hi = __lsx_vldi(0);
1405*c8dee2aaSAndroid Build Coastguard Worker
1406*c8dee2aaSAndroid Build Coastguard Worker        premul8(&lo, &hi);
1407*c8dee2aaSAndroid Build Coastguard Worker
1408*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(lo, dst, 0);
1409*c8dee2aaSAndroid Build Coastguard Worker
1410*c8dee2aaSAndroid Build Coastguard Worker        src += 4;
1411*c8dee2aaSAndroid Build Coastguard Worker        dst += 4;
1412*c8dee2aaSAndroid Build Coastguard Worker        count -= 4;
1413*c8dee2aaSAndroid Build Coastguard Worker    }
1414*c8dee2aaSAndroid Build Coastguard Worker
1415*c8dee2aaSAndroid Build Coastguard Worker    // Call portable code to finish up the tail of [0,4) pixels.
1416*c8dee2aaSAndroid Build Coastguard Worker    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
1417*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
1418*c8dee2aaSAndroid Build Coastguard Worker}
1419*c8dee2aaSAndroid Build Coastguard Worker
1420*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
1421*c8dee2aaSAndroid Build Coastguard Worker    premul_should_swapRB(false, dst, src, count);
1422*c8dee2aaSAndroid Build Coastguard Worker}
1423*c8dee2aaSAndroid Build Coastguard Worker
1424*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
1425*c8dee2aaSAndroid Build Coastguard Worker    premul_should_swapRB(true, dst, src, count);
1426*c8dee2aaSAndroid Build Coastguard Worker}
1427*c8dee2aaSAndroid Build Coastguard Worker
1428*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1429*c8dee2aaSAndroid Build Coastguard Worker    __m128i swapRB = __lsx_vldi(0);
1430*c8dee2aaSAndroid Build Coastguard Worker    swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0704050603000102, 0);
1431*c8dee2aaSAndroid Build Coastguard Worker    swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0f0c0d0e0b08090a, 1);
1432*c8dee2aaSAndroid Build Coastguard Worker
1433*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 4) {
1434*c8dee2aaSAndroid Build Coastguard Worker        __m128i rgba = __lsx_vld(src, 0);
1435*c8dee2aaSAndroid Build Coastguard Worker        __m128i bgra = __lsx_vshuf4i_b(rgba, 0xC6);
1436*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(bgra, dst, 0);
1437*c8dee2aaSAndroid Build Coastguard Worker
1438*c8dee2aaSAndroid Build Coastguard Worker        src += 4;
1439*c8dee2aaSAndroid Build Coastguard Worker        dst += 4;
1440*c8dee2aaSAndroid Build Coastguard Worker        count -= 4;
1441*c8dee2aaSAndroid Build Coastguard Worker    }
1442*c8dee2aaSAndroid Build Coastguard Worker
1443*c8dee2aaSAndroid Build Coastguard Worker    RGBA_to_BGRA_portable(dst, src, count);
1444*c8dee2aaSAndroid Build Coastguard Worker}
1445*c8dee2aaSAndroid Build Coastguard Worker
1446*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
1447*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
1448*c8dee2aaSAndroid Build Coastguard Worker        __m128i ga = __lsx_vld(src, 0);
1449*c8dee2aaSAndroid Build Coastguard Worker
1450*c8dee2aaSAndroid Build Coastguard Worker        __m128i gg = __lsx_vor_v(__lsx_vand_v(ga, __lsx_vreplgr2vr_h(0x00FF)),
1451*c8dee2aaSAndroid Build Coastguard Worker                                 __lsx_vslli_h(ga, 8));
1452*c8dee2aaSAndroid Build Coastguard Worker
1453*c8dee2aaSAndroid Build Coastguard Worker        __m128i ggga_lo = __lsx_vilvl_h(ga, gg);
1454*c8dee2aaSAndroid Build Coastguard Worker        __m128i ggga_hi = __lsx_vilvh_h(ga, gg);
1455*c8dee2aaSAndroid Build Coastguard Worker
1456*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(ggga_lo, dst, 0);
1457*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(ggga_hi, dst, 16);
1458*c8dee2aaSAndroid Build Coastguard Worker
1459*c8dee2aaSAndroid Build Coastguard Worker        src += 8*2;
1460*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
1461*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
1462*c8dee2aaSAndroid Build Coastguard Worker    }
1463*c8dee2aaSAndroid Build Coastguard Worker
1464*c8dee2aaSAndroid Build Coastguard Worker    grayA_to_RGBA_portable(dst, src, count);
1465*c8dee2aaSAndroid Build Coastguard Worker}
1466*c8dee2aaSAndroid Build Coastguard Worker
1467*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
1468*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
1469*c8dee2aaSAndroid Build Coastguard Worker        __m128i grayA = __lsx_vld(src, 0);
1470*c8dee2aaSAndroid Build Coastguard Worker
1471*c8dee2aaSAndroid Build Coastguard Worker        __m128i g0 = __lsx_vand_v(grayA, __lsx_vreplgr2vr_h(0x00FF));
1472*c8dee2aaSAndroid Build Coastguard Worker        __m128i a0 = __lsx_vsrli_h(grayA, 8);
1473*c8dee2aaSAndroid Build Coastguard Worker
1474*c8dee2aaSAndroid Build Coastguard Worker        // Premultiply
1475*c8dee2aaSAndroid Build Coastguard Worker        g0 = scale(g0, a0);
1476*c8dee2aaSAndroid Build Coastguard Worker
1477*c8dee2aaSAndroid Build Coastguard Worker        __m128i gg = __lsx_vor_v(g0, __lsx_vslli_h(g0, 8));
1478*c8dee2aaSAndroid Build Coastguard Worker        __m128i ga = __lsx_vor_v(g0, __lsx_vslli_h(a0, 8));
1479*c8dee2aaSAndroid Build Coastguard Worker
1480*c8dee2aaSAndroid Build Coastguard Worker        __m128i ggga_lo = __lsx_vilvl_h(ga, gg);
1481*c8dee2aaSAndroid Build Coastguard Worker        __m128i ggga_hi = __lsx_vilvh_h(ga, gg);
1482*c8dee2aaSAndroid Build Coastguard Worker
1483*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(ggga_lo, dst, 0);
1484*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(ggga_hi, dst, 16);
1485*c8dee2aaSAndroid Build Coastguard Worker
1486*c8dee2aaSAndroid Build Coastguard Worker        src += 8*2;
1487*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
1488*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
1489*c8dee2aaSAndroid Build Coastguard Worker    }
1490*c8dee2aaSAndroid Build Coastguard Worker
1491*c8dee2aaSAndroid Build Coastguard Worker    grayA_to_rgbA_portable(dst, src, count);
1492*c8dee2aaSAndroid Build Coastguard Worker}
1493*c8dee2aaSAndroid Build Coastguard Worker
// Output channel order selector for inverted_cmyk_to().
enum Format {
    kRGB1,  // selects the inverted_CMYK_to_RGB1 behaviour
    kBGR1   // selects the inverted_CMYK_to_BGR1 behaviour
};
1495*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
1496*c8dee2aaSAndroid Build Coastguard Worker    auto convert8 = [=](__m128i *lo, __m128i* hi) {
1497*c8dee2aaSAndroid Build Coastguard Worker        const __m128i zeros = __lsx_vldi(0);
1498*c8dee2aaSAndroid Build Coastguard Worker        __m128i planar = __lsx_vldi(0);
1499*c8dee2aaSAndroid Build Coastguard Worker        if (kBGR1 == format) {
1500*c8dee2aaSAndroid Build Coastguard Worker            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0);
1501*c8dee2aaSAndroid Build Coastguard Worker            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1);
1502*c8dee2aaSAndroid Build Coastguard Worker        } else {
1503*c8dee2aaSAndroid Build Coastguard Worker            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0);
1504*c8dee2aaSAndroid Build Coastguard Worker            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
1505*c8dee2aaSAndroid Build Coastguard Worker        }
1506*c8dee2aaSAndroid Build Coastguard Worker
1507*c8dee2aaSAndroid Build Coastguard Worker        // Swizzle the pixels to 8-bit planar.
1508*c8dee2aaSAndroid Build Coastguard Worker        *lo = __lsx_vshuf_b(zeros, *lo, planar);              // ccccmmmm yyyykkkk
1509*c8dee2aaSAndroid Build Coastguard Worker        *hi = __lsx_vshuf_b(zeros, *hi, planar);              // CCCCMMMM YYYYKKKK
1510*c8dee2aaSAndroid Build Coastguard Worker        __m128i cm = __lsx_vilvl_w(*hi, *lo),                 // ccccCCCC mmmmMMMM
1511*c8dee2aaSAndroid Build Coastguard Worker                yk = __lsx_vilvh_w(*hi, *lo);                 // yyyyYYYY kkkkKKKK
1512*c8dee2aaSAndroid Build Coastguard Worker
1513*c8dee2aaSAndroid Build Coastguard Worker        // Unpack to 16-bit planar.
1514*c8dee2aaSAndroid Build Coastguard Worker        __m128i c = __lsx_vilvl_b(zeros, cm),                 // c_c_c_c_ C_C_C_C_
1515*c8dee2aaSAndroid Build Coastguard Worker                m = __lsx_vilvh_b(zeros, cm),                 // m_m_m_m_ M_M_M_M_
1516*c8dee2aaSAndroid Build Coastguard Worker                y = __lsx_vilvl_b(zeros, yk),                 // y_y_y_y_ Y_Y_Y_Y_
1517*c8dee2aaSAndroid Build Coastguard Worker                k = __lsx_vilvh_b(zeros, yk);                 // k_k_k_k_ K_K_K_K_
1518*c8dee2aaSAndroid Build Coastguard Worker
1519*c8dee2aaSAndroid Build Coastguard Worker        // Scale to r, g, b.
1520*c8dee2aaSAndroid Build Coastguard Worker        __m128i r = scale(c, k),
1521*c8dee2aaSAndroid Build Coastguard Worker                g = scale(m, k),
1522*c8dee2aaSAndroid Build Coastguard Worker                b = scale(y, k);
1523*c8dee2aaSAndroid Build Coastguard Worker
1524*c8dee2aaSAndroid Build Coastguard Worker        // Repack into interlaced pixels.
1525*c8dee2aaSAndroid Build Coastguard Worker        // rgrgrgrg RGRGRGRG
1526*c8dee2aaSAndroid Build Coastguard Worker        // b1b1b1b1 B1B1B1B1
1527*c8dee2aaSAndroid Build Coastguard Worker        __m128i rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8)),
1528*c8dee2aaSAndroid Build Coastguard Worker                ba = __lsx_vor_v(b, __lsx_vreplgr2vr_h(0xff00));
1529*c8dee2aaSAndroid Build Coastguard Worker        *lo = __lsx_vilvl_h(ba, rg);                          // rgbargba rgbargba
1530*c8dee2aaSAndroid Build Coastguard Worker        *hi = __lsx_vilvl_h(ba, rg);                          // RGB1RGB1 RGB1RGB1
1531*c8dee2aaSAndroid Build Coastguard Worker    };
1532*c8dee2aaSAndroid Build Coastguard Worker
1533*c8dee2aaSAndroid Build Coastguard Worker    while (count >= 8) {
1534*c8dee2aaSAndroid Build Coastguard Worker        __m128i lo = __lsx_vld(src, 0),
1535*c8dee2aaSAndroid Build Coastguard Worker                hi = __lsx_vld(src, 16);
1536*c8dee2aaSAndroid Build Coastguard Worker
1537*c8dee2aaSAndroid Build Coastguard Worker        convert8(&lo, &hi);
1538*c8dee2aaSAndroid Build Coastguard Worker
1539*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(lo, dst, 0);
1540*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(hi, dst, 16);
1541*c8dee2aaSAndroid Build Coastguard Worker
1542*c8dee2aaSAndroid Build Coastguard Worker        src += 8;
1543*c8dee2aaSAndroid Build Coastguard Worker        dst += 8;
1544*c8dee2aaSAndroid Build Coastguard Worker        count -= 8;
1545*c8dee2aaSAndroid Build Coastguard Worker    }
1546*c8dee2aaSAndroid Build Coastguard Worker
1547*c8dee2aaSAndroid Build Coastguard Worker    if (count >= 4) {
1548*c8dee2aaSAndroid Build Coastguard Worker        __m128i lo = __lsx_vld(src, 0),
1549*c8dee2aaSAndroid Build Coastguard Worker                hi = __lsx_vldi(0);
1550*c8dee2aaSAndroid Build Coastguard Worker
1551*c8dee2aaSAndroid Build Coastguard Worker        convert8(&lo, &hi);
1552*c8dee2aaSAndroid Build Coastguard Worker
1553*c8dee2aaSAndroid Build Coastguard Worker        __lsx_vst(lo, dst, 0);
1554*c8dee2aaSAndroid Build Coastguard Worker
1555*c8dee2aaSAndroid Build Coastguard Worker        src += 4;
1556*c8dee2aaSAndroid Build Coastguard Worker        dst += 4;
1557*c8dee2aaSAndroid Build Coastguard Worker        count -= 4;
1558*c8dee2aaSAndroid Build Coastguard Worker    }
1559*c8dee2aaSAndroid Build Coastguard Worker
1560*c8dee2aaSAndroid Build Coastguard Worker    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
1561*c8dee2aaSAndroid Build Coastguard Worker    proc(dst, src, count);
1562*c8dee2aaSAndroid Build Coastguard Worker}
1563*c8dee2aaSAndroid Build Coastguard Worker
1564*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1565*c8dee2aaSAndroid Build Coastguard Worker    inverted_cmyk_to(kRGB1, dst, src, count);
1566*c8dee2aaSAndroid Build Coastguard Worker}
1567*c8dee2aaSAndroid Build Coastguard Worker
1568*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1569*c8dee2aaSAndroid Build Coastguard Worker    inverted_cmyk_to(kBGR1, dst, src, count);
1570*c8dee2aaSAndroid Build Coastguard Worker}
1571*c8dee2aaSAndroid Build Coastguard Worker
1572*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
1573*c8dee2aaSAndroid Build Coastguard Worker    rgbA_to_RGBA_portable(dst, src, count);
1574*c8dee2aaSAndroid Build Coastguard Worker}
1575*c8dee2aaSAndroid Build Coastguard Worker
1576*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1577*c8dee2aaSAndroid Build Coastguard Worker    rgbA_to_BGRA_portable(dst, src, count);
1578*c8dee2aaSAndroid Build Coastguard Worker}
1579*c8dee2aaSAndroid Build Coastguard Worker
1580*c8dee2aaSAndroid Build Coastguard Worker#else
1581*c8dee2aaSAndroid Build Coastguard Worker// -- No Opts --------------------------------------------------------------------------------------
1582*c8dee2aaSAndroid Build Coastguard Worker
1583*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
1584*c8dee2aaSAndroid Build Coastguard Worker    rgbA_to_RGBA_portable(dst, src, count);
1585*c8dee2aaSAndroid Build Coastguard Worker}
1586*c8dee2aaSAndroid Build Coastguard Worker
1587*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1588*c8dee2aaSAndroid Build Coastguard Worker    rgbA_to_BGRA_portable(dst, src, count);
1589*c8dee2aaSAndroid Build Coastguard Worker}
1590*c8dee2aaSAndroid Build Coastguard Worker
1591*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
1592*c8dee2aaSAndroid Build Coastguard Worker    RGBA_to_rgbA_portable(dst, src, count);
1593*c8dee2aaSAndroid Build Coastguard Worker}
1594*c8dee2aaSAndroid Build Coastguard Worker
1595*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
1596*c8dee2aaSAndroid Build Coastguard Worker    RGBA_to_bgrA_portable(dst, src, count);
1597*c8dee2aaSAndroid Build Coastguard Worker}
1598*c8dee2aaSAndroid Build Coastguard Worker
1599*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1600*c8dee2aaSAndroid Build Coastguard Worker    RGBA_to_BGRA_portable(dst, src, count);
1601*c8dee2aaSAndroid Build Coastguard Worker}
1602*c8dee2aaSAndroid Build Coastguard Worker
1603*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
1604*c8dee2aaSAndroid Build Coastguard Worker    grayA_to_RGBA_portable(dst, src, count);
1605*c8dee2aaSAndroid Build Coastguard Worker}
1606*c8dee2aaSAndroid Build Coastguard Worker
1607*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
1608*c8dee2aaSAndroid Build Coastguard Worker    grayA_to_rgbA_portable(dst, src, count);
1609*c8dee2aaSAndroid Build Coastguard Worker}
1610*c8dee2aaSAndroid Build Coastguard Worker
1611*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1612*c8dee2aaSAndroid Build Coastguard Worker    inverted_CMYK_to_RGB1_portable(dst, src, count);
1613*c8dee2aaSAndroid Build Coastguard Worker}
1614*c8dee2aaSAndroid Build Coastguard Worker
1615*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1616*c8dee2aaSAndroid Build Coastguard Worker    inverted_CMYK_to_BGR1_portable(dst, src, count);
1617*c8dee2aaSAndroid Build Coastguard Worker}
1618*c8dee2aaSAndroid Build Coastguard Worker#endif
1619*c8dee2aaSAndroid Build Coastguard Worker
1620*c8dee2aaSAndroid Build Coastguard Worker// Basically as above, but we found no benefit from AVX-512 for gray_to_RGB1.
// Scalar gray -> opaque 32-bit conversion.
// For each of the `count` bytes in `src`, writes one pixel to `dst` whose low three bytes all
// equal the gray value and whose top byte is 0xFF. A non-positive `count` writes nothing.
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int remaining = count; remaining > 0; --remaining) {
        const uint32_t gray = *src++;
        // gray fits in 8 bits, so multiplying by 0x010101 copies it into bits 0-7, 8-15 and
        // 16-23 without any carry between the copies.
        *dst++ = 0xFF000000u | (gray * 0x00010101u);
    }
}
1629*c8dee2aaSAndroid Build Coastguard Worker#if defined(SK_ARM_HAS_NEON)
1630*c8dee2aaSAndroid Build Coastguard Worker    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1631*c8dee2aaSAndroid Build Coastguard Worker        while (count >= 16) {
1632*c8dee2aaSAndroid Build Coastguard Worker            // Load 16 pixels.
1633*c8dee2aaSAndroid Build Coastguard Worker            uint8x16_t gray = vld1q_u8(src);
1634*c8dee2aaSAndroid Build Coastguard Worker
1635*c8dee2aaSAndroid Build Coastguard Worker            // Set each of the color channels.
1636*c8dee2aaSAndroid Build Coastguard Worker            uint8x16x4_t rgba;
1637*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[0] = gray;
1638*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[1] = gray;
1639*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[2] = gray;
1640*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[3] = vdupq_n_u8(0xFF);
1641*c8dee2aaSAndroid Build Coastguard Worker
1642*c8dee2aaSAndroid Build Coastguard Worker            // Store 16 pixels.
1643*c8dee2aaSAndroid Build Coastguard Worker            vst4q_u8((uint8_t*) dst, rgba);
1644*c8dee2aaSAndroid Build Coastguard Worker            src += 16;
1645*c8dee2aaSAndroid Build Coastguard Worker            dst += 16;
1646*c8dee2aaSAndroid Build Coastguard Worker            count -= 16;
1647*c8dee2aaSAndroid Build Coastguard Worker        }
1648*c8dee2aaSAndroid Build Coastguard Worker        if (count >= 8) {
1649*c8dee2aaSAndroid Build Coastguard Worker            // Load 8 pixels.
1650*c8dee2aaSAndroid Build Coastguard Worker            uint8x8_t gray = vld1_u8(src);
1651*c8dee2aaSAndroid Build Coastguard Worker
1652*c8dee2aaSAndroid Build Coastguard Worker            // Set each of the color channels.
1653*c8dee2aaSAndroid Build Coastguard Worker            uint8x8x4_t rgba;
1654*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[0] = gray;
1655*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[1] = gray;
1656*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[2] = gray;
1657*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[3] = vdup_n_u8(0xFF);
1658*c8dee2aaSAndroid Build Coastguard Worker
1659*c8dee2aaSAndroid Build Coastguard Worker            // Store 8 pixels.
1660*c8dee2aaSAndroid Build Coastguard Worker            vst4_u8((uint8_t*) dst, rgba);
1661*c8dee2aaSAndroid Build Coastguard Worker            src += 8;
1662*c8dee2aaSAndroid Build Coastguard Worker            dst += 8;
1663*c8dee2aaSAndroid Build Coastguard Worker            count -= 8;
1664*c8dee2aaSAndroid Build Coastguard Worker        }
1665*c8dee2aaSAndroid Build Coastguard Worker        gray_to_RGB1_portable(dst, src, count);
1666*c8dee2aaSAndroid Build Coastguard Worker    }
1667*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
1668*c8dee2aaSAndroid Build Coastguard Worker    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1669*c8dee2aaSAndroid Build Coastguard Worker        const __m256i alphas = _mm256_set1_epi8((uint8_t) 0xFF);
1670*c8dee2aaSAndroid Build Coastguard Worker        while (count >= 32) {
1671*c8dee2aaSAndroid Build Coastguard Worker            __m256i grays = _mm256_loadu_si256((const __m256i*) src);
1672*c8dee2aaSAndroid Build Coastguard Worker
1673*c8dee2aaSAndroid Build Coastguard Worker            __m256i gg_lo = _mm256_unpacklo_epi8(grays, grays);
1674*c8dee2aaSAndroid Build Coastguard Worker            __m256i gg_hi = _mm256_unpackhi_epi8(grays, grays);
1675*c8dee2aaSAndroid Build Coastguard Worker            __m256i ga_lo = _mm256_unpacklo_epi8(grays, alphas);
1676*c8dee2aaSAndroid Build Coastguard Worker            __m256i ga_hi = _mm256_unpackhi_epi8(grays, alphas);
1677*c8dee2aaSAndroid Build Coastguard Worker
1678*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga0 = _mm256_unpacklo_epi16(gg_lo, ga_lo);
1679*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga1 = _mm256_unpackhi_epi16(gg_lo, ga_lo);
1680*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga2 = _mm256_unpacklo_epi16(gg_hi, ga_hi);
1681*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga3 = _mm256_unpackhi_epi16(gg_hi, ga_hi);
1682*c8dee2aaSAndroid Build Coastguard Worker
1683*c8dee2aaSAndroid Build Coastguard Worker            // Shuffle for pixel reorder.
1684*c8dee2aaSAndroid Build Coastguard Worker            // Note. 'p' stands for 'ggga'
1685*c8dee2aaSAndroid Build Coastguard Worker            // Before shuffle:
1686*c8dee2aaSAndroid Build Coastguard Worker            //     ggga0 = p0  p1  p2  p3  | p16 p17 p18 p19
1687*c8dee2aaSAndroid Build Coastguard Worker            //     ggga1 = p4  p5  p6  p7  | p20 p21 p22 p23
1688*c8dee2aaSAndroid Build Coastguard Worker            //     ggga2 = p8  p9  p10 p11 | p24 p25 p26 p27
1689*c8dee2aaSAndroid Build Coastguard Worker            //     ggga3 = p12 p13 p14 p15 | p28 p29 p30 p31
1690*c8dee2aaSAndroid Build Coastguard Worker            //
1691*c8dee2aaSAndroid Build Coastguard Worker            // After shuffle:
1692*c8dee2aaSAndroid Build Coastguard Worker            //     ggga0_shuffle = p0  p1  p2  p3  | p4  p5  p6  p7
1693*c8dee2aaSAndroid Build Coastguard Worker            //     ggga1_shuffle = p8  p9  p10 p11 | p12 p13 p14 p15
1694*c8dee2aaSAndroid Build Coastguard Worker            //     ggga2_shuffle = p16 p17 p18 p19 | p20 p21 p22 p23
1695*c8dee2aaSAndroid Build Coastguard Worker            //     ggga3_shuffle = p24 p25 p26 p27 | p28 p29 p30 p31
1696*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga0_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x20),
1697*c8dee2aaSAndroid Build Coastguard Worker                    ggga1_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x20),
1698*c8dee2aaSAndroid Build Coastguard Worker                    ggga2_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x31),
1699*c8dee2aaSAndroid Build Coastguard Worker                    ggga3_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x31);
1700*c8dee2aaSAndroid Build Coastguard Worker
1701*c8dee2aaSAndroid Build Coastguard Worker            _mm256_storeu_si256((__m256i*) (dst +  0), ggga0_shuffle);
1702*c8dee2aaSAndroid Build Coastguard Worker            _mm256_storeu_si256((__m256i*) (dst +  8), ggga1_shuffle);
1703*c8dee2aaSAndroid Build Coastguard Worker            _mm256_storeu_si256((__m256i*) (dst + 16), ggga2_shuffle);
1704*c8dee2aaSAndroid Build Coastguard Worker            _mm256_storeu_si256((__m256i*) (dst + 24), ggga3_shuffle);
1705*c8dee2aaSAndroid Build Coastguard Worker
1706*c8dee2aaSAndroid Build Coastguard Worker            src += 32;
1707*c8dee2aaSAndroid Build Coastguard Worker            dst += 32;
1708*c8dee2aaSAndroid Build Coastguard Worker            count -= 32;
1709*c8dee2aaSAndroid Build Coastguard Worker        }
1710*c8dee2aaSAndroid Build Coastguard Worker        gray_to_RGB1_portable(dst, src, count);
1711*c8dee2aaSAndroid Build Coastguard Worker    }
1712*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3  // TODO: just check >= SSE2?
1713*c8dee2aaSAndroid Build Coastguard Worker    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1714*c8dee2aaSAndroid Build Coastguard Worker        const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
1715*c8dee2aaSAndroid Build Coastguard Worker        while (count >= 16) {
1716*c8dee2aaSAndroid Build Coastguard Worker            __m128i grays = _mm_loadu_si128((const __m128i*) src);
1717*c8dee2aaSAndroid Build Coastguard Worker
1718*c8dee2aaSAndroid Build Coastguard Worker            __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
1719*c8dee2aaSAndroid Build Coastguard Worker            __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
1720*c8dee2aaSAndroid Build Coastguard Worker            __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
1721*c8dee2aaSAndroid Build Coastguard Worker            __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
1722*c8dee2aaSAndroid Build Coastguard Worker
1723*c8dee2aaSAndroid Build Coastguard Worker            __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
1724*c8dee2aaSAndroid Build Coastguard Worker            __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
1725*c8dee2aaSAndroid Build Coastguard Worker            __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
1726*c8dee2aaSAndroid Build Coastguard Worker            __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
1727*c8dee2aaSAndroid Build Coastguard Worker
1728*c8dee2aaSAndroid Build Coastguard Worker            _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
1729*c8dee2aaSAndroid Build Coastguard Worker            _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
1730*c8dee2aaSAndroid Build Coastguard Worker            _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
1731*c8dee2aaSAndroid Build Coastguard Worker            _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
1732*c8dee2aaSAndroid Build Coastguard Worker
1733*c8dee2aaSAndroid Build Coastguard Worker            src += 16;
1734*c8dee2aaSAndroid Build Coastguard Worker            dst += 16;
1735*c8dee2aaSAndroid Build Coastguard Worker            count -= 16;
1736*c8dee2aaSAndroid Build Coastguard Worker        }
1737*c8dee2aaSAndroid Build Coastguard Worker        gray_to_RGB1_portable(dst, src, count);
1738*c8dee2aaSAndroid Build Coastguard Worker    }
1739*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
1740*c8dee2aaSAndroid Build Coastguard Worker    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1741*c8dee2aaSAndroid Build Coastguard Worker        const __m256i alphas = __lasx_xvreplgr2vr_b(0xFF);
1742*c8dee2aaSAndroid Build Coastguard Worker        while (count >= 32) {
1743*c8dee2aaSAndroid Build Coastguard Worker            __m256i grays = __lasx_xvld(src, 0);
1744*c8dee2aaSAndroid Build Coastguard Worker
1745*c8dee2aaSAndroid Build Coastguard Worker            __m256i gg_lo = __lasx_xvilvl_b(grays, grays);
1746*c8dee2aaSAndroid Build Coastguard Worker            __m256i gg_hi = __lasx_xvilvh_b(grays, grays);
1747*c8dee2aaSAndroid Build Coastguard Worker            __m256i ga_lo = __lasx_xvilvl_b(alphas, grays);
1748*c8dee2aaSAndroid Build Coastguard Worker            __m256i ga_hi = __lasx_xvilvh_b(alphas, grays);
1749*c8dee2aaSAndroid Build Coastguard Worker
1750*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga0 = __lasx_xvilvl_h(ga_lo, gg_lo);
1751*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga1 = __lasx_xvilvh_h(ga_lo, gg_lo);
1752*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga2 = __lasx_xvilvl_h(ga_hi, gg_hi);
1753*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga3 = __lasx_xvilvh_h(ga_hi, gg_hi);
1754*c8dee2aaSAndroid Build Coastguard Worker
1755*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga_0 = __lasx_xvpermi_q(ggga0, ggga1, 0x02);
1756*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga_1 = __lasx_xvpermi_q(ggga2, ggga3, 0x02);
1757*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga_2 = __lasx_xvpermi_q(ggga0, ggga1, 0x13);
1758*c8dee2aaSAndroid Build Coastguard Worker            __m256i ggga_3 = __lasx_xvpermi_q(ggga2, ggga3, 0x13);
1759*c8dee2aaSAndroid Build Coastguard Worker
1760*c8dee2aaSAndroid Build Coastguard Worker            __lasx_xvst(ggga_0, dst,  0);
1761*c8dee2aaSAndroid Build Coastguard Worker            __lasx_xvst(ggga_1, dst, 32);
1762*c8dee2aaSAndroid Build Coastguard Worker            __lasx_xvst(ggga_2, dst, 64);
1763*c8dee2aaSAndroid Build Coastguard Worker            __lasx_xvst(ggga_3, dst, 96);
1764*c8dee2aaSAndroid Build Coastguard Worker
1765*c8dee2aaSAndroid Build Coastguard Worker            src += 32;
1766*c8dee2aaSAndroid Build Coastguard Worker            dst += 32;
1767*c8dee2aaSAndroid Build Coastguard Worker            count -= 32;
1768*c8dee2aaSAndroid Build Coastguard Worker        }
1769*c8dee2aaSAndroid Build Coastguard Worker        gray_to_RGB1_portable(dst, src, count);
1770*c8dee2aaSAndroid Build Coastguard Worker    }
1771*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
1772*c8dee2aaSAndroid Build Coastguard Worker    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1773*c8dee2aaSAndroid Build Coastguard Worker        const __m128i alphas = __lsx_vreplgr2vr_b(0xFF);
1774*c8dee2aaSAndroid Build Coastguard Worker        while (count >= 16) {
1775*c8dee2aaSAndroid Build Coastguard Worker            __m128i grays = __lsx_vld(src, 0);
1776*c8dee2aaSAndroid Build Coastguard Worker
1777*c8dee2aaSAndroid Build Coastguard Worker            __m128i gg_lo = __lsx_vilvl_b(grays, grays);
1778*c8dee2aaSAndroid Build Coastguard Worker            __m128i gg_hi = __lsx_vilvh_b(grays, grays);
1779*c8dee2aaSAndroid Build Coastguard Worker            __m128i ga_lo = __lsx_vilvl_b(alphas, grays);
1780*c8dee2aaSAndroid Build Coastguard Worker            __m128i ga_hi = __lsx_vilvh_b(alphas, grays);
1781*c8dee2aaSAndroid Build Coastguard Worker
1782*c8dee2aaSAndroid Build Coastguard Worker            __m128i ggga0 = __lsx_vilvl_h(ga_lo, gg_lo);
1783*c8dee2aaSAndroid Build Coastguard Worker            __m128i ggga1 = __lsx_vilvh_h(ga_lo, gg_lo);
1784*c8dee2aaSAndroid Build Coastguard Worker            __m128i ggga2 = __lsx_vilvl_h(ga_hi, gg_hi);
1785*c8dee2aaSAndroid Build Coastguard Worker            __m128i ggga3 = __lsx_vilvh_h(ga_hi, gg_hi);
1786*c8dee2aaSAndroid Build Coastguard Worker
1787*c8dee2aaSAndroid Build Coastguard Worker            __lsx_vst(ggga0, dst,  0);
1788*c8dee2aaSAndroid Build Coastguard Worker            __lsx_vst(ggga1, dst, 16);
1789*c8dee2aaSAndroid Build Coastguard Worker            __lsx_vst(ggga2, dst, 32);
1790*c8dee2aaSAndroid Build Coastguard Worker            __lsx_vst(ggga3, dst, 48);
1791*c8dee2aaSAndroid Build Coastguard Worker
1792*c8dee2aaSAndroid Build Coastguard Worker            src += 16;
1793*c8dee2aaSAndroid Build Coastguard Worker            dst += 16;
1794*c8dee2aaSAndroid Build Coastguard Worker            count -= 16;
1795*c8dee2aaSAndroid Build Coastguard Worker        }
1796*c8dee2aaSAndroid Build Coastguard Worker        gray_to_RGB1_portable(dst, src, count);
1797*c8dee2aaSAndroid Build Coastguard Worker    }
1798*c8dee2aaSAndroid Build Coastguard Worker#else
1799*c8dee2aaSAndroid Build Coastguard Worker    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1800*c8dee2aaSAndroid Build Coastguard Worker        gray_to_RGB1_portable(dst, src, count);
1801*c8dee2aaSAndroid Build Coastguard Worker    }
1802*c8dee2aaSAndroid Build Coastguard Worker#endif
1803*c8dee2aaSAndroid Build Coastguard Worker
1804*c8dee2aaSAndroid Build Coastguard Worker// Again as above, this time not even finding benefit from AVX2 for RGB_to_{RGB,BGR}1.
// Scalar path: reads `count` packed 3-byte pixels from src and writes one
// 32-bit word per pixel to dst. Source byte 0 lands in bits 0..7, byte 1 in
// bits 8..15, byte 2 in bits 16..23, and bits 24..31 are set to 0xFF.
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    const uint8_t* in = src;
    for (int px = 0; px < count; ++px, in += 3) {
        const uint32_t red   = in[0];
        const uint32_t green = in[1];
        const uint32_t blue  = in[2];
        dst[px] = ((uint32_t)0xFF << 24) | (blue << 16) | (green << 8) | red;
    }
}
// Scalar path: reads `count` packed 3-byte pixels from src and writes one
// 32-bit word per pixel to dst with the first and third bytes exchanged.
// Source byte 2 lands in bits 0..7, byte 1 in bits 8..15, byte 0 in bits
// 16..23, and bits 24..31 are set to 0xFF.
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    const uint8_t* in = src;
    for (int px = 0; px < count; ++px, in += 3) {
        const uint32_t red   = in[0];
        const uint32_t green = in[1];
        const uint32_t blue  = in[2];
        dst[px] = ((uint32_t)0xFF << 24) | (red << 16) | (green << 8) | blue;
    }
}
1829*c8dee2aaSAndroid Build Coastguard Worker#if defined(SK_ARM_HAS_NEON)
1830*c8dee2aaSAndroid Build Coastguard Worker    static void insert_alpha_should_swaprb(bool kSwapRB,
1831*c8dee2aaSAndroid Build Coastguard Worker                                           uint32_t dst[], const uint8_t* src, int count) {
1832*c8dee2aaSAndroid Build Coastguard Worker        while (count >= 16) {
1833*c8dee2aaSAndroid Build Coastguard Worker            // Load 16 pixels.
1834*c8dee2aaSAndroid Build Coastguard Worker            uint8x16x3_t rgb = vld3q_u8(src);
1835*c8dee2aaSAndroid Build Coastguard Worker
1836*c8dee2aaSAndroid Build Coastguard Worker            // Insert an opaque alpha channel and swap if needed.
1837*c8dee2aaSAndroid Build Coastguard Worker            uint8x16x4_t rgba;
1838*c8dee2aaSAndroid Build Coastguard Worker            if (kSwapRB) {
1839*c8dee2aaSAndroid Build Coastguard Worker                rgba.val[0] = rgb.val[2];
1840*c8dee2aaSAndroid Build Coastguard Worker                rgba.val[2] = rgb.val[0];
1841*c8dee2aaSAndroid Build Coastguard Worker            } else {
1842*c8dee2aaSAndroid Build Coastguard Worker                rgba.val[0] = rgb.val[0];
1843*c8dee2aaSAndroid Build Coastguard Worker                rgba.val[2] = rgb.val[2];
1844*c8dee2aaSAndroid Build Coastguard Worker            }
1845*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[1] = rgb.val[1];
1846*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[3] = vdupq_n_u8(0xFF);
1847*c8dee2aaSAndroid Build Coastguard Worker
1848*c8dee2aaSAndroid Build Coastguard Worker            // Store 16 pixels.
1849*c8dee2aaSAndroid Build Coastguard Worker            vst4q_u8((uint8_t*) dst, rgba);
1850*c8dee2aaSAndroid Build Coastguard Worker            src += 16*3;
1851*c8dee2aaSAndroid Build Coastguard Worker            dst += 16;
1852*c8dee2aaSAndroid Build Coastguard Worker            count -= 16;
1853*c8dee2aaSAndroid Build Coastguard Worker        }
1854*c8dee2aaSAndroid Build Coastguard Worker
1855*c8dee2aaSAndroid Build Coastguard Worker        if (count >= 8) {
1856*c8dee2aaSAndroid Build Coastguard Worker            // Load 8 pixels.
1857*c8dee2aaSAndroid Build Coastguard Worker            uint8x8x3_t rgb = vld3_u8(src);
1858*c8dee2aaSAndroid Build Coastguard Worker
1859*c8dee2aaSAndroid Build Coastguard Worker            // Insert an opaque alpha channel and swap if needed.
1860*c8dee2aaSAndroid Build Coastguard Worker            uint8x8x4_t rgba;
1861*c8dee2aaSAndroid Build Coastguard Worker            if (kSwapRB) {
1862*c8dee2aaSAndroid Build Coastguard Worker                rgba.val[0] = rgb.val[2];
1863*c8dee2aaSAndroid Build Coastguard Worker                rgba.val[2] = rgb.val[0];
1864*c8dee2aaSAndroid Build Coastguard Worker            } else {
1865*c8dee2aaSAndroid Build Coastguard Worker                rgba.val[0] = rgb.val[0];
1866*c8dee2aaSAndroid Build Coastguard Worker                rgba.val[2] = rgb.val[2];
1867*c8dee2aaSAndroid Build Coastguard Worker            }
1868*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[1] = rgb.val[1];
1869*c8dee2aaSAndroid Build Coastguard Worker            rgba.val[3] = vdup_n_u8(0xFF);
1870*c8dee2aaSAndroid Build Coastguard Worker
1871*c8dee2aaSAndroid Build Coastguard Worker            // Store 8 pixels.
1872*c8dee2aaSAndroid Build Coastguard Worker            vst4_u8((uint8_t*) dst, rgba);
1873*c8dee2aaSAndroid Build Coastguard Worker            src += 8*3;
1874*c8dee2aaSAndroid Build Coastguard Worker            dst += 8;
1875*c8dee2aaSAndroid Build Coastguard Worker            count -= 8;
1876*c8dee2aaSAndroid Build Coastguard Worker        }
1877*c8dee2aaSAndroid Build Coastguard Worker
1878*c8dee2aaSAndroid Build Coastguard Worker        // Call portable code to finish up the tail of [0,8) pixels.
1879*c8dee2aaSAndroid Build Coastguard Worker        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
1880*c8dee2aaSAndroid Build Coastguard Worker        proc(dst, src, count);
1881*c8dee2aaSAndroid Build Coastguard Worker    }
1882*c8dee2aaSAndroid Build Coastguard Worker
1883*c8dee2aaSAndroid Build Coastguard Worker    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1884*c8dee2aaSAndroid Build Coastguard Worker        insert_alpha_should_swaprb(false, dst, src, count);
1885*c8dee2aaSAndroid Build Coastguard Worker    }
1886*c8dee2aaSAndroid Build Coastguard Worker    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
1887*c8dee2aaSAndroid Build Coastguard Worker        insert_alpha_should_swaprb(true, dst, src, count);
1888*c8dee2aaSAndroid Build Coastguard Worker    }
1889*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
1890*c8dee2aaSAndroid Build Coastguard Worker    static void insert_alpha_should_swaprb(bool kSwapRB,
1891*c8dee2aaSAndroid Build Coastguard Worker                                           uint32_t dst[], const uint8_t* src, int count) {
1892*c8dee2aaSAndroid Build Coastguard Worker        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
1893*c8dee2aaSAndroid Build Coastguard Worker        __m128i expand;
1894*c8dee2aaSAndroid Build Coastguard Worker        const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
1895*c8dee2aaSAndroid Build Coastguard Worker        if (kSwapRB) {
1896*c8dee2aaSAndroid Build Coastguard Worker            expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
1897*c8dee2aaSAndroid Build Coastguard Worker        } else {
1898*c8dee2aaSAndroid Build Coastguard Worker            expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
1899*c8dee2aaSAndroid Build Coastguard Worker        }
1900*c8dee2aaSAndroid Build Coastguard Worker
1901*c8dee2aaSAndroid Build Coastguard Worker        while (count >= 6) {
1902*c8dee2aaSAndroid Build Coastguard Worker            // Load a vector.  While this actually contains 5 pixels plus an
1903*c8dee2aaSAndroid Build Coastguard Worker            // extra component, we will discard all but the first four pixels on
1904*c8dee2aaSAndroid Build Coastguard Worker            // this iteration.
1905*c8dee2aaSAndroid Build Coastguard Worker            __m128i rgb = _mm_loadu_si128((const __m128i*) src);
1906*c8dee2aaSAndroid Build Coastguard Worker
1907*c8dee2aaSAndroid Build Coastguard Worker            // Expand the first four pixels to RGBX and then mask to RGB(FF).
1908*c8dee2aaSAndroid Build Coastguard Worker            __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
1909*c8dee2aaSAndroid Build Coastguard Worker
1910*c8dee2aaSAndroid Build Coastguard Worker            // Store 4 pixels.
1911*c8dee2aaSAndroid Build Coastguard Worker            _mm_storeu_si128((__m128i*) dst, rgba);
1912*c8dee2aaSAndroid Build Coastguard Worker
1913*c8dee2aaSAndroid Build Coastguard Worker            src += 4*3;
1914*c8dee2aaSAndroid Build Coastguard Worker            dst += 4;
1915*c8dee2aaSAndroid Build Coastguard Worker            count -= 4;
1916*c8dee2aaSAndroid Build Coastguard Worker        }
1917*c8dee2aaSAndroid Build Coastguard Worker
1918*c8dee2aaSAndroid Build Coastguard Worker        // Call portable code to finish up the tail of [0,4) pixels.
1919*c8dee2aaSAndroid Build Coastguard Worker        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
1920*c8dee2aaSAndroid Build Coastguard Worker        proc(dst, src, count);
1921*c8dee2aaSAndroid Build Coastguard Worker    }
1922*c8dee2aaSAndroid Build Coastguard Worker
1923*c8dee2aaSAndroid Build Coastguard Worker    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1924*c8dee2aaSAndroid Build Coastguard Worker        insert_alpha_should_swaprb(false, dst, src, count);
1925*c8dee2aaSAndroid Build Coastguard Worker    }
1926*c8dee2aaSAndroid Build Coastguard Worker    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
1927*c8dee2aaSAndroid Build Coastguard Worker        insert_alpha_should_swaprb(true, dst, src, count);
1928*c8dee2aaSAndroid Build Coastguard Worker    }
1929*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
1930*c8dee2aaSAndroid Build Coastguard Worker    static void insert_alpha_should_swaprb(bool kSwapRB,
1931*c8dee2aaSAndroid Build Coastguard Worker                                           uint32_t dst[], const uint8_t* src, int count) {
1932*c8dee2aaSAndroid Build Coastguard Worker        const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xFF000000);
1933*c8dee2aaSAndroid Build Coastguard Worker
1934*c8dee2aaSAndroid Build Coastguard Worker        __m256i expand = __lasx_xvldi(0);
1935*c8dee2aaSAndroid Build Coastguard Worker        if (kSwapRB) {
1936*c8dee2aaSAndroid Build Coastguard Worker            expand = __lasx_xvinsgr2vr_d(expand, 0x0503040502000102, 0);
1937*c8dee2aaSAndroid Build Coastguard Worker            expand = __lasx_xvinsgr2vr_d(expand, 0x0b090a0b08060708, 1);
1938*c8dee2aaSAndroid Build Coastguard Worker            expand = __lasx_xvinsgr2vr_d(expand, 0x110f10110e0c0d0e, 2);
1939*c8dee2aaSAndroid Build Coastguard Worker            expand = __lasx_xvinsgr2vr_d(expand, 0x1715161714121314, 3);
1940*c8dee2aaSAndroid Build Coastguard Worker        } else {
1941*c8dee2aaSAndroid Build Coastguard Worker            expand = __lasx_xvinsgr2vr_d(expand, 0x0505040302020100, 0);
1942*c8dee2aaSAndroid Build Coastguard Worker            expand = __lasx_xvinsgr2vr_d(expand, 0x0b0b0a0908080706, 1);
1943*c8dee2aaSAndroid Build Coastguard Worker            expand = __lasx_xvinsgr2vr_d(expand, 0x1111100f0e0e0d0c, 2);
1944*c8dee2aaSAndroid Build Coastguard Worker            expand = __lasx_xvinsgr2vr_d(expand, 0x1717161514141312, 3);
1945*c8dee2aaSAndroid Build Coastguard Worker        }
1946*c8dee2aaSAndroid Build Coastguard Worker
1947*c8dee2aaSAndroid Build Coastguard Worker        while (count >= 8) {
1948*c8dee2aaSAndroid Build Coastguard Worker            // Load a vector.  While this actually contains 5 pixels plus an
1949*c8dee2aaSAndroid Build Coastguard Worker            // extra component, we will discard all but the first four pixels on
1950*c8dee2aaSAndroid Build Coastguard Worker            // this iteration.
1951*c8dee2aaSAndroid Build Coastguard Worker            __m256i rgb = __lasx_xvld(src, 0);
1952*c8dee2aaSAndroid Build Coastguard Worker            __m256i rgb_l = __lasx_xvpermi_d(rgb, 0x44);
1953*c8dee2aaSAndroid Build Coastguard Worker            __m256i rgb_h = __lasx_xvpermi_d(rgb, 0xEE);
1954*c8dee2aaSAndroid Build Coastguard Worker
1955*c8dee2aaSAndroid Build Coastguard Worker            // Expand the first four pixels to RGBX and then mask to RGB(FF).
1956*c8dee2aaSAndroid Build Coastguard Worker            __m256i rgba = __lasx_xvor_v(__lasx_xvshuf_b(rgb_h, rgb_l, expand), alphaMask);
1957*c8dee2aaSAndroid Build Coastguard Worker
1958*c8dee2aaSAndroid Build Coastguard Worker            // Store 8 pixels.
1959*c8dee2aaSAndroid Build Coastguard Worker            __lasx_xvst(rgba, dst, 0);
1960*c8dee2aaSAndroid Build Coastguard Worker
1961*c8dee2aaSAndroid Build Coastguard Worker            src += 4*6;
1962*c8dee2aaSAndroid Build Coastguard Worker            dst += 8;
1963*c8dee2aaSAndroid Build Coastguard Worker            count -= 8;
1964*c8dee2aaSAndroid Build Coastguard Worker        }
1965*c8dee2aaSAndroid Build Coastguard Worker
1966*c8dee2aaSAndroid Build Coastguard Worker        // Call portable code to finish up the tail of [0,4) pixels.
1967*c8dee2aaSAndroid Build Coastguard Worker        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
1968*c8dee2aaSAndroid Build Coastguard Worker        proc(dst, src, count);
1969*c8dee2aaSAndroid Build Coastguard Worker    }
1970*c8dee2aaSAndroid Build Coastguard Worker    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1971*c8dee2aaSAndroid Build Coastguard Worker        insert_alpha_should_swaprb(false, dst, src, count);
1972*c8dee2aaSAndroid Build Coastguard Worker    }
1973*c8dee2aaSAndroid Build Coastguard Worker    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
1974*c8dee2aaSAndroid Build Coastguard Worker        insert_alpha_should_swaprb(true, dst, src, count);
1975*c8dee2aaSAndroid Build Coastguard Worker    }
1976*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
1977*c8dee2aaSAndroid Build Coastguard Worker    static void insert_alpha_should_swaprb(bool kSwapRB,
1978*c8dee2aaSAndroid Build Coastguard Worker                                           uint32_t dst[], const uint8_t* src, int count) {
1979*c8dee2aaSAndroid Build Coastguard Worker        const __m128i alphaMask = __lsx_vreplgr2vr_w(0xFF000000);
1980*c8dee2aaSAndroid Build Coastguard Worker
1981*c8dee2aaSAndroid Build Coastguard Worker        __m128i expand = __lsx_vldi(0);
1982*c8dee2aaSAndroid Build Coastguard Worker        if (kSwapRB) {
1983*c8dee2aaSAndroid Build Coastguard Worker            expand = __lsx_vinsgr2vr_d(expand, 0x0503040502000102, 0);
1984*c8dee2aaSAndroid Build Coastguard Worker            expand = __lsx_vinsgr2vr_d(expand, 0x0b090a0b08060708, 1);
1985*c8dee2aaSAndroid Build Coastguard Worker        } else {
1986*c8dee2aaSAndroid Build Coastguard Worker            expand = __lsx_vinsgr2vr_d(expand, 0x0505040302020100, 0);
1987*c8dee2aaSAndroid Build Coastguard Worker            expand = __lsx_vinsgr2vr_d(expand, 0x0b0b0a0908080706, 1);
1988*c8dee2aaSAndroid Build Coastguard Worker        }
1989*c8dee2aaSAndroid Build Coastguard Worker
1990*c8dee2aaSAndroid Build Coastguard Worker        while (count >= 6) {
1991*c8dee2aaSAndroid Build Coastguard Worker            // Load a vector.  While this actually contains 5 pixels plus an
1992*c8dee2aaSAndroid Build Coastguard Worker            // extra component, we will discard all but the first four pixels on
1993*c8dee2aaSAndroid Build Coastguard Worker            // this iteration.
1994*c8dee2aaSAndroid Build Coastguard Worker            __m128i rgb = __lsx_vld(src, 0);
1995*c8dee2aaSAndroid Build Coastguard Worker
1996*c8dee2aaSAndroid Build Coastguard Worker            // Expand the first four pixels to RGBX and then mask to RGB(FF).
1997*c8dee2aaSAndroid Build Coastguard Worker            __m128i rgba = __lsx_vor_v(__lsx_vshuf_b(rgb, rgb, expand), alphaMask);
1998*c8dee2aaSAndroid Build Coastguard Worker
1999*c8dee2aaSAndroid Build Coastguard Worker            // Store 4 pixels.
2000*c8dee2aaSAndroid Build Coastguard Worker            __lsx_vst(rgba, dst, 0);
2001*c8dee2aaSAndroid Build Coastguard Worker
2002*c8dee2aaSAndroid Build Coastguard Worker            src += 4*3;
2003*c8dee2aaSAndroid Build Coastguard Worker            dst += 4;
2004*c8dee2aaSAndroid Build Coastguard Worker            count -= 4;
2005*c8dee2aaSAndroid Build Coastguard Worker        }
2006*c8dee2aaSAndroid Build Coastguard Worker
2007*c8dee2aaSAndroid Build Coastguard Worker        // Call portable code to finish up the tail of [0,4) pixels.
2008*c8dee2aaSAndroid Build Coastguard Worker        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
2009*c8dee2aaSAndroid Build Coastguard Worker        proc(dst, src, count);
2010*c8dee2aaSAndroid Build Coastguard Worker    }
2011*c8dee2aaSAndroid Build Coastguard Worker    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
2012*c8dee2aaSAndroid Build Coastguard Worker        insert_alpha_should_swaprb(false, dst, src, count);
2013*c8dee2aaSAndroid Build Coastguard Worker    }
2014*c8dee2aaSAndroid Build Coastguard Worker    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
2015*c8dee2aaSAndroid Build Coastguard Worker        insert_alpha_should_swaprb(true, dst, src, count);
2016*c8dee2aaSAndroid Build Coastguard Worker    }
2017*c8dee2aaSAndroid Build Coastguard Worker#else
2018*c8dee2aaSAndroid Build Coastguard Worker    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
2019*c8dee2aaSAndroid Build Coastguard Worker        RGB_to_RGB1_portable(dst, src, count);
2020*c8dee2aaSAndroid Build Coastguard Worker    }
2021*c8dee2aaSAndroid Build Coastguard Worker    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
2022*c8dee2aaSAndroid Build Coastguard Worker        RGB_to_BGR1_portable(dst, src, count);
2023*c8dee2aaSAndroid Build Coastguard Worker    }
2024*c8dee2aaSAndroid Build Coastguard Worker#endif
2025*c8dee2aaSAndroid Build Coastguard Worker
2026*c8dee2aaSAndroid Build Coastguard Worker}  // namespace SK_OPTS_NS
2027*c8dee2aaSAndroid Build Coastguard Worker
2028*c8dee2aaSAndroid Build Coastguard Worker#undef SI
2029