/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "include/private/SkColorData.h"
#include "src/base/SkUtils.h"
#include "src/base/SkVx.h"
#include "src/core/SkSwizzlePriv.h"

#include <algorithm>
#include <cmath>
#include <utility>

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    #include <lasxintrin.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    #include <lsxintrin.h>
#endif

// This file is included in multiple translation units with different #defines set enabling
// different instruction use for different CPU architectures.
29*c8dee2aaSAndroid Build Coastguard Worker// 30*c8dee2aaSAndroid Build Coastguard Worker// A pair of files controls what #defines are defined: SkOpts_SetTarget.h set the flags, and 31*c8dee2aaSAndroid Build Coastguard Worker// SkOpts_RestoreTarget.h restores them. SkOpts_SetTarget is controlled by setting the 32*c8dee2aaSAndroid Build Coastguard Worker// SK_OPTS_TARGET define before included it. 33*c8dee2aaSAndroid Build Coastguard Worker// 34*c8dee2aaSAndroid Build Coastguard Worker// SkOpts_SetTarget also sets the #define SK_OPTS_NS to the unique namespace for this code. 35*c8dee2aaSAndroid Build Coastguard Worker 36*c8dee2aaSAndroid Build Coastguard Worker#if defined(__clang__) || defined(__GNUC__) 37*c8dee2aaSAndroid Build Coastguard Worker#define SI __attribute__((always_inline)) static inline 38*c8dee2aaSAndroid Build Coastguard Worker#else 39*c8dee2aaSAndroid Build Coastguard Worker#define SI static inline 40*c8dee2aaSAndroid Build Coastguard Worker#endif 41*c8dee2aaSAndroid Build Coastguard Worker 42*c8dee2aaSAndroid Build Coastguard Workernamespace SK_OPTS_NS { 43*c8dee2aaSAndroid Build Coastguard Worker 44*c8dee2aaSAndroid Build Coastguard Worker#if defined(SK_USE_FAST_UNPREMUL_324099025) 45*c8dee2aaSAndroid Build Coastguard Workerconstexpr bool kFastUnpremul = true; 46*c8dee2aaSAndroid Build Coastguard Worker#else 47*c8dee2aaSAndroid Build Coastguard Workerconstexpr bool kFastUnpremul = false; 48*c8dee2aaSAndroid Build Coastguard Worker#endif 49*c8dee2aaSAndroid Build Coastguard Worker 50*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha_times_255_portable(float a) { 51*c8dee2aaSAndroid Build Coastguard Worker return a != 0 ? 255.0f / a : 0.0f; 52*c8dee2aaSAndroid Build Coastguard Worker} 53*c8dee2aaSAndroid Build Coastguard Worker 54*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha_portable(float a) { 55*c8dee2aaSAndroid Build Coastguard Worker return a != 0 ? 
1.0f / a : 0.0f; 56*c8dee2aaSAndroid Build Coastguard Worker} 57*c8dee2aaSAndroid Build Coastguard Worker 58*c8dee2aaSAndroid Build Coastguard Worker#if defined(SK_ARM_HAS_NEON) 59*c8dee2aaSAndroid Build Coastguard Worker// -- NEON -- Harden against timing attacks 60*c8dee2aaSAndroid Build Coastguard Worker// For neon, the portable versions create branchless code. 61*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha_times_255(float a) { 62*c8dee2aaSAndroid Build Coastguard Worker return reciprocal_alpha_times_255_portable(a); 63*c8dee2aaSAndroid Build Coastguard Worker} 64*c8dee2aaSAndroid Build Coastguard Worker 65*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha(float a) { 66*c8dee2aaSAndroid Build Coastguard Worker return reciprocal_alpha_portable(a); 67*c8dee2aaSAndroid Build Coastguard Worker} 68*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 && (defined(__clang__) || !defined(_MSC_VER)) 69*c8dee2aaSAndroid Build Coastguard Worker// -- SSE -- Harden against timing attacks -- MSVC is not supported. 
70*c8dee2aaSAndroid Build Coastguard Workerusing F4 = __m128; 71*c8dee2aaSAndroid Build Coastguard Worker 72*c8dee2aaSAndroid Build Coastguard WorkerSK_NO_SANITIZE("float-divide-by-zero") 73*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha_times_255(float a) { 74*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(0 <= a && a <= 255); 75*c8dee2aaSAndroid Build Coastguard Worker F4 vA{a, a, a, a}; 76*c8dee2aaSAndroid Build Coastguard Worker auto q = F4{255.0f} / vA; 77*c8dee2aaSAndroid Build Coastguard Worker return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0]; 78*c8dee2aaSAndroid Build Coastguard Worker} 79*c8dee2aaSAndroid Build Coastguard Worker 80*c8dee2aaSAndroid Build Coastguard WorkerSK_NO_SANITIZE("float-divide-by-zero") 81*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha(float a) { 82*c8dee2aaSAndroid Build Coastguard Worker SkASSERT(0 <= a && a <= 1); 83*c8dee2aaSAndroid Build Coastguard Worker F4 vA{a, a, a, a}; 84*c8dee2aaSAndroid Build Coastguard Worker auto q = F4{1.0f} / vA; 85*c8dee2aaSAndroid Build Coastguard Worker return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0]; 86*c8dee2aaSAndroid Build Coastguard Worker} 87*c8dee2aaSAndroid Build Coastguard Worker#else 88*c8dee2aaSAndroid Build Coastguard Worker// -- Portable -- *Not* hardened against timing attacks 89*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha_times_255(float a) { 90*c8dee2aaSAndroid Build Coastguard Worker return reciprocal_alpha_times_255_portable(a); 91*c8dee2aaSAndroid Build Coastguard Worker} 92*c8dee2aaSAndroid Build Coastguard Worker 93*c8dee2aaSAndroid Build Coastguard WorkerSI float reciprocal_alpha(float a) { 94*c8dee2aaSAndroid Build Coastguard Worker return reciprocal_alpha_portable(a); 95*c8dee2aaSAndroid Build Coastguard Worker} 96*c8dee2aaSAndroid Build Coastguard Worker#endif 97*c8dee2aaSAndroid Build Coastguard Worker 98*c8dee2aaSAndroid Build Coastguard Workerstatic void 
RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) { 99*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < count; i++) { 100*c8dee2aaSAndroid Build Coastguard Worker uint8_t a = (src[i] >> 24) & 0xFF, 101*c8dee2aaSAndroid Build Coastguard Worker b = (src[i] >> 16) & 0xFF, 102*c8dee2aaSAndroid Build Coastguard Worker g = (src[i] >> 8) & 0xFF, 103*c8dee2aaSAndroid Build Coastguard Worker r = (src[i] >> 0) & 0xFF; 104*c8dee2aaSAndroid Build Coastguard Worker b = (b*a+127)/255; 105*c8dee2aaSAndroid Build Coastguard Worker g = (g*a+127)/255; 106*c8dee2aaSAndroid Build Coastguard Worker r = (r*a+127)/255; 107*c8dee2aaSAndroid Build Coastguard Worker dst[i] = (uint32_t)a << 24 108*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)b << 16 109*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 8 110*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)r << 0; 111*c8dee2aaSAndroid Build Coastguard Worker } 112*c8dee2aaSAndroid Build Coastguard Worker} 113*c8dee2aaSAndroid Build Coastguard Worker 114*c8dee2aaSAndroid Build Coastguard Worker// RP uses the following rounding routines in store_8888. There are three different 115*c8dee2aaSAndroid Build Coastguard Worker// styles of rounding: 116*c8dee2aaSAndroid Build Coastguard Worker// 1) +0.5 and floor - used by scalar and ARMv7 117*c8dee2aaSAndroid Build Coastguard Worker// 2) round to even for sure - ARMv8 118*c8dee2aaSAndroid Build Coastguard Worker// 3) round to even maybe - intel. The rounding on intel depends on MXCSR which 119*c8dee2aaSAndroid Build Coastguard Worker// defaults to round to even. 120*c8dee2aaSAndroid Build Coastguard Worker// 121*c8dee2aaSAndroid Build Coastguard Worker// Note: that vrndns_f32 is the single float version of vcvtnq_u32_f32. 
122*c8dee2aaSAndroid Build Coastguard Worker 123*c8dee2aaSAndroid Build Coastguard WorkerSI uint32_t pixel_round_as_RP(float n) { 124*c8dee2aaSAndroid Build Coastguard Worker#if defined(SK_ARM_HAS_NEON) && defined(SK_CPU_ARM64) 125*c8dee2aaSAndroid Build Coastguard Worker return vrndns_f32(n); 126*c8dee2aaSAndroid Build Coastguard Worker#elif defined(SK_ARM_HAS_NEON) && !defined(SK_CPU_ARM64) 127*c8dee2aaSAndroid Build Coastguard Worker float32x4_t vN{n + 0.5f}; 128*c8dee2aaSAndroid Build Coastguard Worker return vcvtq_u32_f32(vN)[0]; 129*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 && (defined(__clang__) || !defined(_MSC_VER)) 130*c8dee2aaSAndroid Build Coastguard Worker return _mm_cvtps_epi32(__m128{n})[0]; 131*c8dee2aaSAndroid Build Coastguard Worker#else 132*c8dee2aaSAndroid Build Coastguard Worker return (uint32_t)(n + 0.5f); 133*c8dee2aaSAndroid Build Coastguard Worker#endif 134*c8dee2aaSAndroid Build Coastguard Worker} 135*c8dee2aaSAndroid Build Coastguard Worker 136*c8dee2aaSAndroid Build Coastguard Worker// Doing the math for an original color b resulting in a premul color x, 137*c8dee2aaSAndroid Build Coastguard Worker// x = ⌊(b * a + 127) / 255⌋, 138*c8dee2aaSAndroid Build Coastguard Worker// x ≤ (b * a + 127) / 255 < x + 1, 139*c8dee2aaSAndroid Build Coastguard Worker// 255 * x ≤ b * a + 127 < 255 * (x + 1), 140*c8dee2aaSAndroid Build Coastguard Worker// 255 * x - 127 ≤ b * a < 255 * (x + 1) - 127, 141*c8dee2aaSAndroid Build Coastguard Worker// 255 * x - 127 ≤ b * a < 255 * x + 128, 142*c8dee2aaSAndroid Build Coastguard Worker// (255 * x - 127) / a ≤ b < (255 * x + 128) / a. 143*c8dee2aaSAndroid Build Coastguard Worker// So, given a premul value x < a, the original color b can be in the above range. 
144*c8dee2aaSAndroid Build Coastguard Worker// We can pick the middle of that range as 145*c8dee2aaSAndroid Build Coastguard Worker// b = 255 * x / a 146*c8dee2aaSAndroid Build Coastguard Worker// b = x * (255 / a) 147*c8dee2aaSAndroid Build Coastguard WorkerSI uint32_t unpremul_quick(float reciprocalA, float c) { 148*c8dee2aaSAndroid Build Coastguard Worker return (uint32_t)std::min(255.0f, (c * reciprocalA + 0.5f)); 149*c8dee2aaSAndroid Build Coastguard Worker} 150*c8dee2aaSAndroid Build Coastguard Worker 151*c8dee2aaSAndroid Build Coastguard Worker// Similar to unpremul but simulates Raster Pipeline by normalizing the pixel on the interval 152*c8dee2aaSAndroid Build Coastguard Worker// [0, 1] and uses round-to-even in most cases instead of round-up. 153*c8dee2aaSAndroid Build Coastguard WorkerSI uint32_t unpremul_simulating_RP(float reciprocalA, float c) { 154*c8dee2aaSAndroid Build Coastguard Worker const float normalizedC = c * (1.0f / 255.0f); 155*c8dee2aaSAndroid Build Coastguard Worker const float answer = std::min(255.0f, normalizedC * reciprocalA * 255.0f); 156*c8dee2aaSAndroid Build Coastguard Worker return pixel_round_as_RP(answer); 157*c8dee2aaSAndroid Build Coastguard Worker} 158*c8dee2aaSAndroid Build Coastguard Worker 159*c8dee2aaSAndroid Build Coastguard WorkerSI uint32_t rgbA_to_CCCA(float c00, float c08, float c16, float a) { 160*c8dee2aaSAndroid Build Coastguard Worker if constexpr (kFastUnpremul) { 161*c8dee2aaSAndroid Build Coastguard Worker const float reciprocalA = reciprocal_alpha_times_255(a); 162*c8dee2aaSAndroid Build Coastguard Worker auto unpremul = [reciprocalA](float c) -> uint32_t { 163*c8dee2aaSAndroid Build Coastguard Worker return unpremul_quick(reciprocalA, c); 164*c8dee2aaSAndroid Build Coastguard Worker }; 165*c8dee2aaSAndroid Build Coastguard Worker return (uint32_t) a << 24 166*c8dee2aaSAndroid Build Coastguard Worker | unpremul(c16) << 16 167*c8dee2aaSAndroid Build Coastguard Worker | unpremul(c08) << 8 168*c8dee2aaSAndroid 
Build Coastguard Worker | unpremul(c00) << 0; 169*c8dee2aaSAndroid Build Coastguard Worker } else { 170*c8dee2aaSAndroid Build Coastguard Worker const float normalizedA = a * (1.0f / 255.0f); 171*c8dee2aaSAndroid Build Coastguard Worker const float reciprocalA = reciprocal_alpha(normalizedA); 172*c8dee2aaSAndroid Build Coastguard Worker auto unpremul = [reciprocalA](float c) -> uint32_t { 173*c8dee2aaSAndroid Build Coastguard Worker return unpremul_simulating_RP(reciprocalA, c); 174*c8dee2aaSAndroid Build Coastguard Worker }; 175*c8dee2aaSAndroid Build Coastguard Worker return (uint32_t) a << 24 176*c8dee2aaSAndroid Build Coastguard Worker | unpremul(c16) << 16 177*c8dee2aaSAndroid Build Coastguard Worker | unpremul(c08) << 8 178*c8dee2aaSAndroid Build Coastguard Worker | unpremul(c00) << 0; 179*c8dee2aaSAndroid Build Coastguard Worker } 180*c8dee2aaSAndroid Build Coastguard Worker} 181*c8dee2aaSAndroid Build Coastguard Worker 182*c8dee2aaSAndroid Build Coastguard Workerstatic void rgbA_to_RGBA_portable(uint32_t* dst, const uint32_t* src, int count) { 183*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < count; i++) { 184*c8dee2aaSAndroid Build Coastguard Worker const uint32_t p = src[i]; 185*c8dee2aaSAndroid Build Coastguard Worker 186*c8dee2aaSAndroid Build Coastguard Worker const float a = (p >> 24) & 0xFF, 187*c8dee2aaSAndroid Build Coastguard Worker b = (p >> 16) & 0xFF, 188*c8dee2aaSAndroid Build Coastguard Worker g = (p >> 8) & 0xFF, 189*c8dee2aaSAndroid Build Coastguard Worker r = (p >> 0) & 0xFF; 190*c8dee2aaSAndroid Build Coastguard Worker 191*c8dee2aaSAndroid Build Coastguard Worker dst[i] = rgbA_to_CCCA(r, g, b, a); 192*c8dee2aaSAndroid Build Coastguard Worker } 193*c8dee2aaSAndroid Build Coastguard Worker} 194*c8dee2aaSAndroid Build Coastguard Worker 195*c8dee2aaSAndroid Build Coastguard Workerstatic void rgbA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) { 196*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < 
count; i++) { 197*c8dee2aaSAndroid Build Coastguard Worker const uint32_t p = src[i]; 198*c8dee2aaSAndroid Build Coastguard Worker 199*c8dee2aaSAndroid Build Coastguard Worker const uint32_t a = (p >> 24) & 0xFF, 200*c8dee2aaSAndroid Build Coastguard Worker b = (p >> 16) & 0xFF, 201*c8dee2aaSAndroid Build Coastguard Worker g = (p >> 8) & 0xFF, 202*c8dee2aaSAndroid Build Coastguard Worker r = (p >> 0) & 0xFF; 203*c8dee2aaSAndroid Build Coastguard Worker 204*c8dee2aaSAndroid Build Coastguard Worker dst[i] = rgbA_to_CCCA(b, g, r, a); 205*c8dee2aaSAndroid Build Coastguard Worker } 206*c8dee2aaSAndroid Build Coastguard Worker} 207*c8dee2aaSAndroid Build Coastguard Worker 208*c8dee2aaSAndroid Build Coastguard Workerstatic void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) { 209*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < count; i++) { 210*c8dee2aaSAndroid Build Coastguard Worker uint8_t a = (src[i] >> 24) & 0xFF, 211*c8dee2aaSAndroid Build Coastguard Worker b = (src[i] >> 16) & 0xFF, 212*c8dee2aaSAndroid Build Coastguard Worker g = (src[i] >> 8) & 0xFF, 213*c8dee2aaSAndroid Build Coastguard Worker r = (src[i] >> 0) & 0xFF; 214*c8dee2aaSAndroid Build Coastguard Worker b = (b*a+127)/255; 215*c8dee2aaSAndroid Build Coastguard Worker g = (g*a+127)/255; 216*c8dee2aaSAndroid Build Coastguard Worker r = (r*a+127)/255; 217*c8dee2aaSAndroid Build Coastguard Worker dst[i] = (uint32_t)a << 24 218*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)r << 16 219*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 8 220*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)b << 0; 221*c8dee2aaSAndroid Build Coastguard Worker } 222*c8dee2aaSAndroid Build Coastguard Worker} 223*c8dee2aaSAndroid Build Coastguard Worker 224*c8dee2aaSAndroid Build Coastguard Workerstatic void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) { 225*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < count; i++) { 226*c8dee2aaSAndroid 
Build Coastguard Worker uint8_t a = (src[i] >> 24) & 0xFF, 227*c8dee2aaSAndroid Build Coastguard Worker b = (src[i] >> 16) & 0xFF, 228*c8dee2aaSAndroid Build Coastguard Worker g = (src[i] >> 8) & 0xFF, 229*c8dee2aaSAndroid Build Coastguard Worker r = (src[i] >> 0) & 0xFF; 230*c8dee2aaSAndroid Build Coastguard Worker dst[i] = (uint32_t)a << 24 231*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)r << 16 232*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 8 233*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)b << 0; 234*c8dee2aaSAndroid Build Coastguard Worker } 235*c8dee2aaSAndroid Build Coastguard Worker} 236*c8dee2aaSAndroid Build Coastguard Worker 237*c8dee2aaSAndroid Build Coastguard Workerstatic void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) { 238*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < count; i++) { 239*c8dee2aaSAndroid Build Coastguard Worker uint8_t g = src[0], 240*c8dee2aaSAndroid Build Coastguard Worker a = src[1]; 241*c8dee2aaSAndroid Build Coastguard Worker src += 2; 242*c8dee2aaSAndroid Build Coastguard Worker dst[i] = (uint32_t)a << 24 243*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 16 244*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 8 245*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 0; 246*c8dee2aaSAndroid Build Coastguard Worker } 247*c8dee2aaSAndroid Build Coastguard Worker} 248*c8dee2aaSAndroid Build Coastguard Worker 249*c8dee2aaSAndroid Build Coastguard Workerstatic void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) { 250*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < count; i++) { 251*c8dee2aaSAndroid Build Coastguard Worker uint8_t g = src[0], 252*c8dee2aaSAndroid Build Coastguard Worker a = src[1]; 253*c8dee2aaSAndroid Build Coastguard Worker src += 2; 254*c8dee2aaSAndroid Build Coastguard Worker g = (g*a+127)/255; 255*c8dee2aaSAndroid Build Coastguard Worker dst[i] = (uint32_t)a << 24 
256*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 16 257*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 8 258*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 0; 259*c8dee2aaSAndroid Build Coastguard Worker } 260*c8dee2aaSAndroid Build Coastguard Worker} 261*c8dee2aaSAndroid Build Coastguard Worker 262*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) { 263*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < count; i++) { 264*c8dee2aaSAndroid Build Coastguard Worker uint8_t k = (src[i] >> 24) & 0xFF, 265*c8dee2aaSAndroid Build Coastguard Worker y = (src[i] >> 16) & 0xFF, 266*c8dee2aaSAndroid Build Coastguard Worker m = (src[i] >> 8) & 0xFF, 267*c8dee2aaSAndroid Build Coastguard Worker c = (src[i] >> 0) & 0xFF; 268*c8dee2aaSAndroid Build Coastguard Worker // See comments in SkSwizzler.cpp for details on the conversion formula. 269*c8dee2aaSAndroid Build Coastguard Worker uint8_t b = (y*k+127)/255, 270*c8dee2aaSAndroid Build Coastguard Worker g = (m*k+127)/255, 271*c8dee2aaSAndroid Build Coastguard Worker r = (c*k+127)/255; 272*c8dee2aaSAndroid Build Coastguard Worker dst[i] = (uint32_t)0xFF << 24 273*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t) b << 16 274*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t) g << 8 275*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t) r << 0; 276*c8dee2aaSAndroid Build Coastguard Worker } 277*c8dee2aaSAndroid Build Coastguard Worker} 278*c8dee2aaSAndroid Build Coastguard Worker 279*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) { 280*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < count; i++) { 281*c8dee2aaSAndroid Build Coastguard Worker uint8_t k = (src[i] >> 24) & 0xFF, 282*c8dee2aaSAndroid Build Coastguard Worker y = (src[i] >> 16) & 0xFF, 283*c8dee2aaSAndroid Build Coastguard Worker m = (src[i] >> 8) 
& 0xFF, 284*c8dee2aaSAndroid Build Coastguard Worker c = (src[i] >> 0) & 0xFF; 285*c8dee2aaSAndroid Build Coastguard Worker uint8_t b = (y*k+127)/255, 286*c8dee2aaSAndroid Build Coastguard Worker g = (m*k+127)/255, 287*c8dee2aaSAndroid Build Coastguard Worker r = (c*k+127)/255; 288*c8dee2aaSAndroid Build Coastguard Worker dst[i] = (uint32_t)0xFF << 24 289*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t) r << 16 290*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t) g << 8 291*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t) b << 0; 292*c8dee2aaSAndroid Build Coastguard Worker } 293*c8dee2aaSAndroid Build Coastguard Worker} 294*c8dee2aaSAndroid Build Coastguard Worker 295*c8dee2aaSAndroid Build Coastguard Worker#if defined(SK_ARM_HAS_NEON) 296*c8dee2aaSAndroid Build Coastguard Worker// -- NEON ----------------------------------------------------------------------------------------- 297*c8dee2aaSAndroid Build Coastguard Worker// Rounded divide by 255, (x + 127) / 255 298*c8dee2aaSAndroid Build Coastguard WorkerSI uint8x8_t div255_round(uint16x8_t x) { 299*c8dee2aaSAndroid Build Coastguard Worker // result = (x + 127) / 255 300*c8dee2aaSAndroid Build Coastguard Worker // result = (x + 127) / 256 + error1 301*c8dee2aaSAndroid Build Coastguard Worker // 302*c8dee2aaSAndroid Build Coastguard Worker // error1 = (x + 127) / (255 * 256) 303*c8dee2aaSAndroid Build Coastguard Worker // error1 = (x + 127) / (256 * 256) + error2 304*c8dee2aaSAndroid Build Coastguard Worker // 305*c8dee2aaSAndroid Build Coastguard Worker // error2 = (x + 127) / (255 * 256 * 256) 306*c8dee2aaSAndroid Build Coastguard Worker // 307*c8dee2aaSAndroid Build Coastguard Worker // The maximum value of error2 is too small to matter. 
Thus: 308*c8dee2aaSAndroid Build Coastguard Worker // result = (x + 127) / 256 + (x + 127) / (256 * 256) 309*c8dee2aaSAndroid Build Coastguard Worker // result = ((x + 127) / 256 + x + 127) / 256 310*c8dee2aaSAndroid Build Coastguard Worker // result = ((x + 127) >> 8 + x + 127) >> 8 311*c8dee2aaSAndroid Build Coastguard Worker // 312*c8dee2aaSAndroid Build Coastguard Worker // Use >>> to represent "rounded right shift" which, conveniently, 313*c8dee2aaSAndroid Build Coastguard Worker // NEON supports in one instruction. 314*c8dee2aaSAndroid Build Coastguard Worker // result = ((x >>> 8) + x) >>> 8 315*c8dee2aaSAndroid Build Coastguard Worker // 316*c8dee2aaSAndroid Build Coastguard Worker // Note that the second right shift is actually performed as an 317*c8dee2aaSAndroid Build Coastguard Worker // "add, round, and narrow back to 8-bits" instruction. 318*c8dee2aaSAndroid Build Coastguard Worker return vraddhn_u16(x, vrshrq_n_u16(x, 8)); 319*c8dee2aaSAndroid Build Coastguard Worker} 320*c8dee2aaSAndroid Build Coastguard Worker 321*c8dee2aaSAndroid Build Coastguard Worker// Scale a byte by another, (x * y + 127) / 255 322*c8dee2aaSAndroid Build Coastguard WorkerSI uint8x8_t scale(uint8x8_t x, uint8x8_t y) { 323*c8dee2aaSAndroid Build Coastguard Worker return div255_round(vmull_u8(x, y)); 324*c8dee2aaSAndroid Build Coastguard Worker} 325*c8dee2aaSAndroid Build Coastguard Worker 326*c8dee2aaSAndroid Build Coastguard Workerstatic void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) { 327*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 328*c8dee2aaSAndroid Build Coastguard Worker // Load 8 pixels. 
329*c8dee2aaSAndroid Build Coastguard Worker uint8x8x4_t rgba = vld4_u8((const uint8_t*) src); 330*c8dee2aaSAndroid Build Coastguard Worker 331*c8dee2aaSAndroid Build Coastguard Worker uint8x8_t a = rgba.val[3], 332*c8dee2aaSAndroid Build Coastguard Worker b = rgba.val[2], 333*c8dee2aaSAndroid Build Coastguard Worker g = rgba.val[1], 334*c8dee2aaSAndroid Build Coastguard Worker r = rgba.val[0]; 335*c8dee2aaSAndroid Build Coastguard Worker 336*c8dee2aaSAndroid Build Coastguard Worker // Premultiply. 337*c8dee2aaSAndroid Build Coastguard Worker b = scale(b, a); 338*c8dee2aaSAndroid Build Coastguard Worker g = scale(g, a); 339*c8dee2aaSAndroid Build Coastguard Worker r = scale(r, a); 340*c8dee2aaSAndroid Build Coastguard Worker 341*c8dee2aaSAndroid Build Coastguard Worker // Store 8 premultiplied pixels. 342*c8dee2aaSAndroid Build Coastguard Worker if (kSwapRB) { 343*c8dee2aaSAndroid Build Coastguard Worker rgba.val[2] = r; 344*c8dee2aaSAndroid Build Coastguard Worker rgba.val[1] = g; 345*c8dee2aaSAndroid Build Coastguard Worker rgba.val[0] = b; 346*c8dee2aaSAndroid Build Coastguard Worker } else { 347*c8dee2aaSAndroid Build Coastguard Worker rgba.val[2] = b; 348*c8dee2aaSAndroid Build Coastguard Worker rgba.val[1] = g; 349*c8dee2aaSAndroid Build Coastguard Worker rgba.val[0] = r; 350*c8dee2aaSAndroid Build Coastguard Worker } 351*c8dee2aaSAndroid Build Coastguard Worker vst4_u8((uint8_t*) dst, rgba); 352*c8dee2aaSAndroid Build Coastguard Worker src += 8; 353*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 354*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 355*c8dee2aaSAndroid Build Coastguard Worker } 356*c8dee2aaSAndroid Build Coastguard Worker 357*c8dee2aaSAndroid Build Coastguard Worker // Call portable code to finish up the tail of [0,8) pixels. 358*c8dee2aaSAndroid Build Coastguard Worker auto proc = kSwapRB ? 
RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 359*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 360*c8dee2aaSAndroid Build Coastguard Worker} 361*c8dee2aaSAndroid Build Coastguard Worker 362*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 363*c8dee2aaSAndroid Build Coastguard Worker premul_should_swapRB(false, dst, src, count); 364*c8dee2aaSAndroid Build Coastguard Worker} 365*c8dee2aaSAndroid Build Coastguard Worker 366*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 367*c8dee2aaSAndroid Build Coastguard Worker premul_should_swapRB(true, dst, src, count); 368*c8dee2aaSAndroid Build Coastguard Worker} 369*c8dee2aaSAndroid Build Coastguard Worker 370*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 371*c8dee2aaSAndroid Build Coastguard Worker using std::swap; 372*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 373*c8dee2aaSAndroid Build Coastguard Worker // Load 16 pixels. 374*c8dee2aaSAndroid Build Coastguard Worker uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src); 375*c8dee2aaSAndroid Build Coastguard Worker 376*c8dee2aaSAndroid Build Coastguard Worker // Swap r and b. 377*c8dee2aaSAndroid Build Coastguard Worker swap(rgba.val[0], rgba.val[2]); 378*c8dee2aaSAndroid Build Coastguard Worker 379*c8dee2aaSAndroid Build Coastguard Worker // Store 16 pixels. 380*c8dee2aaSAndroid Build Coastguard Worker vst4q_u8((uint8_t*) dst, rgba); 381*c8dee2aaSAndroid Build Coastguard Worker src += 16; 382*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 383*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 384*c8dee2aaSAndroid Build Coastguard Worker } 385*c8dee2aaSAndroid Build Coastguard Worker 386*c8dee2aaSAndroid Build Coastguard Worker if (count >= 8) { 387*c8dee2aaSAndroid Build Coastguard Worker // Load 8 pixels. 
388*c8dee2aaSAndroid Build Coastguard Worker uint8x8x4_t rgba = vld4_u8((const uint8_t*) src); 389*c8dee2aaSAndroid Build Coastguard Worker 390*c8dee2aaSAndroid Build Coastguard Worker // Swap r and b. 391*c8dee2aaSAndroid Build Coastguard Worker swap(rgba.val[0], rgba.val[2]); 392*c8dee2aaSAndroid Build Coastguard Worker 393*c8dee2aaSAndroid Build Coastguard Worker // Store 8 pixels. 394*c8dee2aaSAndroid Build Coastguard Worker vst4_u8((uint8_t*) dst, rgba); 395*c8dee2aaSAndroid Build Coastguard Worker src += 8; 396*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 397*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 398*c8dee2aaSAndroid Build Coastguard Worker } 399*c8dee2aaSAndroid Build Coastguard Worker 400*c8dee2aaSAndroid Build Coastguard Worker RGBA_to_BGRA_portable(dst, src, count); 401*c8dee2aaSAndroid Build Coastguard Worker} 402*c8dee2aaSAndroid Build Coastguard Worker 403*c8dee2aaSAndroid Build Coastguard Workerstatic void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) { 404*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 405*c8dee2aaSAndroid Build Coastguard Worker // Load 16 pixels. 406*c8dee2aaSAndroid Build Coastguard Worker uint8x16x2_t ga = vld2q_u8(src); 407*c8dee2aaSAndroid Build Coastguard Worker 408*c8dee2aaSAndroid Build Coastguard Worker // Premultiply if requested. 409*c8dee2aaSAndroid Build Coastguard Worker if (kPremul) { 410*c8dee2aaSAndroid Build Coastguard Worker ga.val[0] = vcombine_u8( 411*c8dee2aaSAndroid Build Coastguard Worker scale(vget_low_u8(ga.val[0]), vget_low_u8(ga.val[1])), 412*c8dee2aaSAndroid Build Coastguard Worker scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1]))); 413*c8dee2aaSAndroid Build Coastguard Worker } 414*c8dee2aaSAndroid Build Coastguard Worker 415*c8dee2aaSAndroid Build Coastguard Worker // Set each of the color channels. 
416*c8dee2aaSAndroid Build Coastguard Worker uint8x16x4_t rgba; 417*c8dee2aaSAndroid Build Coastguard Worker rgba.val[0] = ga.val[0]; 418*c8dee2aaSAndroid Build Coastguard Worker rgba.val[1] = ga.val[0]; 419*c8dee2aaSAndroid Build Coastguard Worker rgba.val[2] = ga.val[0]; 420*c8dee2aaSAndroid Build Coastguard Worker rgba.val[3] = ga.val[1]; 421*c8dee2aaSAndroid Build Coastguard Worker 422*c8dee2aaSAndroid Build Coastguard Worker // Store 16 pixels. 423*c8dee2aaSAndroid Build Coastguard Worker vst4q_u8((uint8_t*) dst, rgba); 424*c8dee2aaSAndroid Build Coastguard Worker src += 16*2; 425*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 426*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 427*c8dee2aaSAndroid Build Coastguard Worker } 428*c8dee2aaSAndroid Build Coastguard Worker 429*c8dee2aaSAndroid Build Coastguard Worker if (count >= 8) { 430*c8dee2aaSAndroid Build Coastguard Worker // Load 8 pixels. 431*c8dee2aaSAndroid Build Coastguard Worker uint8x8x2_t ga = vld2_u8(src); 432*c8dee2aaSAndroid Build Coastguard Worker 433*c8dee2aaSAndroid Build Coastguard Worker // Premultiply if requested. 434*c8dee2aaSAndroid Build Coastguard Worker if (kPremul) { 435*c8dee2aaSAndroid Build Coastguard Worker ga.val[0] = scale(ga.val[0], ga.val[1]); 436*c8dee2aaSAndroid Build Coastguard Worker } 437*c8dee2aaSAndroid Build Coastguard Worker 438*c8dee2aaSAndroid Build Coastguard Worker // Set each of the color channels. 439*c8dee2aaSAndroid Build Coastguard Worker uint8x8x4_t rgba; 440*c8dee2aaSAndroid Build Coastguard Worker rgba.val[0] = ga.val[0]; 441*c8dee2aaSAndroid Build Coastguard Worker rgba.val[1] = ga.val[0]; 442*c8dee2aaSAndroid Build Coastguard Worker rgba.val[2] = ga.val[0]; 443*c8dee2aaSAndroid Build Coastguard Worker rgba.val[3] = ga.val[1]; 444*c8dee2aaSAndroid Build Coastguard Worker 445*c8dee2aaSAndroid Build Coastguard Worker // Store 8 pixels. 
446*c8dee2aaSAndroid Build Coastguard Worker vst4_u8((uint8_t*) dst, rgba); 447*c8dee2aaSAndroid Build Coastguard Worker src += 8*2; 448*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 449*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 450*c8dee2aaSAndroid Build Coastguard Worker } 451*c8dee2aaSAndroid Build Coastguard Worker 452*c8dee2aaSAndroid Build Coastguard Worker auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable; 453*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 454*c8dee2aaSAndroid Build Coastguard Worker} 455*c8dee2aaSAndroid Build Coastguard Worker 456*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 457*c8dee2aaSAndroid Build Coastguard Worker expand_grayA(false, dst, src, count); 458*c8dee2aaSAndroid Build Coastguard Worker} 459*c8dee2aaSAndroid Build Coastguard Worker 460*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 461*c8dee2aaSAndroid Build Coastguard Worker expand_grayA(true, dst, src, count); 462*c8dee2aaSAndroid Build Coastguard Worker} 463*c8dee2aaSAndroid Build Coastguard Worker 464*c8dee2aaSAndroid Build Coastguard Workerenum Format { kRGB1, kBGR1 }; 465*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) { 466*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 467*c8dee2aaSAndroid Build Coastguard Worker // Load 8 cmyk pixels. 
468*c8dee2aaSAndroid Build Coastguard Worker uint8x8x4_t pixels = vld4_u8((const uint8_t*) src); 469*c8dee2aaSAndroid Build Coastguard Worker 470*c8dee2aaSAndroid Build Coastguard Worker uint8x8_t k = pixels.val[3], 471*c8dee2aaSAndroid Build Coastguard Worker y = pixels.val[2], 472*c8dee2aaSAndroid Build Coastguard Worker m = pixels.val[1], 473*c8dee2aaSAndroid Build Coastguard Worker c = pixels.val[0]; 474*c8dee2aaSAndroid Build Coastguard Worker 475*c8dee2aaSAndroid Build Coastguard Worker // Scale to r, g, b. 476*c8dee2aaSAndroid Build Coastguard Worker uint8x8_t b = scale(y, k); 477*c8dee2aaSAndroid Build Coastguard Worker uint8x8_t g = scale(m, k); 478*c8dee2aaSAndroid Build Coastguard Worker uint8x8_t r = scale(c, k); 479*c8dee2aaSAndroid Build Coastguard Worker 480*c8dee2aaSAndroid Build Coastguard Worker // Store 8 rgba pixels. 481*c8dee2aaSAndroid Build Coastguard Worker if (kBGR1 == format) { 482*c8dee2aaSAndroid Build Coastguard Worker pixels.val[3] = vdup_n_u8(0xFF); 483*c8dee2aaSAndroid Build Coastguard Worker pixels.val[2] = r; 484*c8dee2aaSAndroid Build Coastguard Worker pixels.val[1] = g; 485*c8dee2aaSAndroid Build Coastguard Worker pixels.val[0] = b; 486*c8dee2aaSAndroid Build Coastguard Worker } else { 487*c8dee2aaSAndroid Build Coastguard Worker pixels.val[3] = vdup_n_u8(0xFF); 488*c8dee2aaSAndroid Build Coastguard Worker pixels.val[2] = b; 489*c8dee2aaSAndroid Build Coastguard Worker pixels.val[1] = g; 490*c8dee2aaSAndroid Build Coastguard Worker pixels.val[0] = r; 491*c8dee2aaSAndroid Build Coastguard Worker } 492*c8dee2aaSAndroid Build Coastguard Worker vst4_u8((uint8_t*) dst, pixels); 493*c8dee2aaSAndroid Build Coastguard Worker src += 8; 494*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 495*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 496*c8dee2aaSAndroid Build Coastguard Worker } 497*c8dee2aaSAndroid Build Coastguard Worker 498*c8dee2aaSAndroid Build Coastguard Worker auto proc = (kBGR1 == format) ? 
inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable; 499*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 500*c8dee2aaSAndroid Build Coastguard Worker} 501*c8dee2aaSAndroid Build Coastguard Worker 502*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) { 503*c8dee2aaSAndroid Build Coastguard Worker inverted_cmyk_to(kRGB1, dst, src, count); 504*c8dee2aaSAndroid Build Coastguard Worker} 505*c8dee2aaSAndroid Build Coastguard Worker 506*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) { 507*c8dee2aaSAndroid Build Coastguard Worker inverted_cmyk_to(kBGR1, dst, src, count); 508*c8dee2aaSAndroid Build Coastguard Worker} 509*c8dee2aaSAndroid Build Coastguard Worker 510*c8dee2aaSAndroid Build Coastguard Workertemplate <bool swapRB> 511*c8dee2aaSAndroid Build Coastguard Workerstatic void common_rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 512*c8dee2aaSAndroid Build Coastguard Worker 513*c8dee2aaSAndroid Build Coastguard Worker // Only use the SIMD code if simulating RP, otherwise the quick code auto-vectorizes will 514*c8dee2aaSAndroid Build Coastguard Worker // enough on ARM to not need a SIMD implementation. 
515*c8dee2aaSAndroid Build Coastguard Worker if constexpr (!kFastUnpremul) { 516*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 517*c8dee2aaSAndroid Build Coastguard Worker const uint8x8x4_t in = vld4_u8((const uint8_t*)src); 518*c8dee2aaSAndroid Build Coastguard Worker 519*c8dee2aaSAndroid Build Coastguard Worker auto round = [](float32x4_t v) -> uint32x4_t { 520*c8dee2aaSAndroid Build Coastguard Worker #if defined(SK_CPU_ARM64) 521*c8dee2aaSAndroid Build Coastguard Worker return vcvtnq_u32_f32(v); 522*c8dee2aaSAndroid Build Coastguard Worker #else 523*c8dee2aaSAndroid Build Coastguard Worker return vcvtq_u32_f32(v + 0.5f); 524*c8dee2aaSAndroid Build Coastguard Worker #endif 525*c8dee2aaSAndroid Build Coastguard Worker }; 526*c8dee2aaSAndroid Build Coastguard Worker 527*c8dee2aaSAndroid Build Coastguard Worker static constexpr float kN = 1.0f / 255.0f; 528*c8dee2aaSAndroid Build Coastguard Worker auto toNormalized = [](uint16x4_t v) -> float32x4_t { 529*c8dee2aaSAndroid Build Coastguard Worker return vcvtq_f32_u32(vmovl_u16(v)) * kN; 530*c8dee2aaSAndroid Build Coastguard Worker }; 531*c8dee2aaSAndroid Build Coastguard Worker 532*c8dee2aaSAndroid Build Coastguard Worker auto unpremulHalf = 533*c8dee2aaSAndroid Build Coastguard Worker [toNormalized, round](float32x4_t invA, uint16x4_t v) -> uint16x4_t { 534*c8dee2aaSAndroid Build Coastguard Worker const float32x4_t normalizedV = toNormalized(v); 535*c8dee2aaSAndroid Build Coastguard Worker const float32x4_t divided = invA * normalizedV; 536*c8dee2aaSAndroid Build Coastguard Worker const float32x4_t denormalized = divided * 255.0f; 537*c8dee2aaSAndroid Build Coastguard Worker const uint32x4_t rounded = round(denormalized); 538*c8dee2aaSAndroid Build Coastguard Worker return vqmovn_u32(rounded); 539*c8dee2aaSAndroid Build Coastguard Worker }; 540*c8dee2aaSAndroid Build Coastguard Worker 541*c8dee2aaSAndroid Build Coastguard Worker auto reciprocal = [](float32x4_t a) -> float32x4_t { 542*c8dee2aaSAndroid 
Build Coastguard Worker uint32x4_t mask = sk_bit_cast<uint32x4_t>(a != float32x4_t{0, 0, 0, 0}); 543*c8dee2aaSAndroid Build Coastguard Worker auto recip = 1.0f / a; 544*c8dee2aaSAndroid Build Coastguard Worker return sk_bit_cast<float32x4_t>(mask & sk_bit_cast<uint32x4_t>(recip)); 545*c8dee2aaSAndroid Build Coastguard Worker }; 546*c8dee2aaSAndroid Build Coastguard Worker 547*c8dee2aaSAndroid Build Coastguard Worker const uint8x8_t a = in.val[3]; 548*c8dee2aaSAndroid Build Coastguard Worker const uint16x8_t intA = vmovl_u8(a); 549*c8dee2aaSAndroid Build Coastguard Worker const float32x4_t invALow = reciprocal(toNormalized(vget_low_u16(intA))); 550*c8dee2aaSAndroid Build Coastguard Worker const float32x4_t invAHigh = reciprocal(toNormalized(vget_high_u16(intA))); 551*c8dee2aaSAndroid Build Coastguard Worker 552*c8dee2aaSAndroid Build Coastguard Worker auto unpremul = [unpremulHalf, invALow, invAHigh](uint8x8_t v) -> uint8x8_t { 553*c8dee2aaSAndroid Build Coastguard Worker const uint16x8_t to16 = vmovl_u8(v); 554*c8dee2aaSAndroid Build Coastguard Worker 555*c8dee2aaSAndroid Build Coastguard Worker const uint16x4_t low = unpremulHalf(invALow, vget_low_u16(to16)); 556*c8dee2aaSAndroid Build Coastguard Worker const uint16x4_t high = unpremulHalf(invAHigh, vget_high_u16(to16)); 557*c8dee2aaSAndroid Build Coastguard Worker 558*c8dee2aaSAndroid Build Coastguard Worker const uint16x8_t combined = vcombine_u16(low, high); 559*c8dee2aaSAndroid Build Coastguard Worker return vqmovn_u16(combined); 560*c8dee2aaSAndroid Build Coastguard Worker }; 561*c8dee2aaSAndroid Build Coastguard Worker 562*c8dee2aaSAndroid Build Coastguard Worker const uint8x8_t b = unpremul(in.val[2]); 563*c8dee2aaSAndroid Build Coastguard Worker const uint8x8_t g = unpremul(in.val[1]); 564*c8dee2aaSAndroid Build Coastguard Worker const uint8x8_t r = unpremul(in.val[0]); 565*c8dee2aaSAndroid Build Coastguard Worker 566*c8dee2aaSAndroid Build Coastguard Worker if constexpr (swapRB) { 567*c8dee2aaSAndroid 
Build Coastguard Worker const uint8x8x4_t out{b, g, r, a}; 568*c8dee2aaSAndroid Build Coastguard Worker vst4_u8((uint8_t*)dst, out); 569*c8dee2aaSAndroid Build Coastguard Worker } else { 570*c8dee2aaSAndroid Build Coastguard Worker const uint8x8x4_t out{r, g, b, a}; 571*c8dee2aaSAndroid Build Coastguard Worker vst4_u8((uint8_t*)dst, out); 572*c8dee2aaSAndroid Build Coastguard Worker } 573*c8dee2aaSAndroid Build Coastguard Worker 574*c8dee2aaSAndroid Build Coastguard Worker src += 8; 575*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 576*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 577*c8dee2aaSAndroid Build Coastguard Worker } 578*c8dee2aaSAndroid Build Coastguard Worker } 579*c8dee2aaSAndroid Build Coastguard Worker 580*c8dee2aaSAndroid Build Coastguard Worker // Handle the tail. Count will be < 8. 581*c8dee2aaSAndroid Build Coastguard Worker if constexpr (swapRB) { 582*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_BGRA_portable(dst, src, count); 583*c8dee2aaSAndroid Build Coastguard Worker } else { 584*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_RGBA_portable(dst, src, count); 585*c8dee2aaSAndroid Build Coastguard Worker } 586*c8dee2aaSAndroid Build Coastguard Worker} 587*c8dee2aaSAndroid Build Coastguard Worker 588*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 589*c8dee2aaSAndroid Build Coastguard Worker common_rgbA_to_RGBA</*swapRB=*/false>(dst, src, count); 590*c8dee2aaSAndroid Build Coastguard Worker} 591*c8dee2aaSAndroid Build Coastguard Worker 592*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 593*c8dee2aaSAndroid Build Coastguard Worker common_rgbA_to_RGBA</*swapRB=*/true>(dst, src, count); 594*c8dee2aaSAndroid Build Coastguard Worker} 595*c8dee2aaSAndroid Build Coastguard Worker 596*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 597*c8dee2aaSAndroid Build Coastguard Worker// 
-- AVX2 ----------------------------------------------------------------------------------------- 598*c8dee2aaSAndroid Build Coastguard Worker 599*c8dee2aaSAndroid Build Coastguard Worker// Scale a byte by another. 600*c8dee2aaSAndroid Build Coastguard Worker// Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 601*c8dee2aaSAndroid Build Coastguard Workerstatic __m256i scale(__m256i x, __m256i y) { 602*c8dee2aaSAndroid Build Coastguard Worker const __m256i _128 = _mm256_set1_epi16(128); 603*c8dee2aaSAndroid Build Coastguard Worker const __m256i _257 = _mm256_set1_epi16(257); 604*c8dee2aaSAndroid Build Coastguard Worker 605*c8dee2aaSAndroid Build Coastguard Worker // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. 606*c8dee2aaSAndroid Build Coastguard Worker return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257); 607*c8dee2aaSAndroid Build Coastguard Worker} 608*c8dee2aaSAndroid Build Coastguard Worker 609*c8dee2aaSAndroid Build Coastguard Workerstatic void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) { 610*c8dee2aaSAndroid Build Coastguard Worker 611*c8dee2aaSAndroid Build Coastguard Worker auto premul8 = [=](__m256i* lo, __m256i* hi) { 612*c8dee2aaSAndroid Build Coastguard Worker const __m256i zeros = _mm256_setzero_si256(); 613*c8dee2aaSAndroid Build Coastguard Worker __m256i planar; 614*c8dee2aaSAndroid Build Coastguard Worker if (kSwapRB) { 615*c8dee2aaSAndroid Build Coastguard Worker planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15, 616*c8dee2aaSAndroid Build Coastguard Worker 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); 617*c8dee2aaSAndroid Build Coastguard Worker } else { 618*c8dee2aaSAndroid Build Coastguard Worker planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15, 619*c8dee2aaSAndroid Build Coastguard Worker 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); 620*c8dee2aaSAndroid Build Coastguard Worker } 621*c8dee2aaSAndroid Build Coastguard 
Worker 622*c8dee2aaSAndroid Build Coastguard Worker // Swizzle the pixels to 8-bit planar. 623*c8dee2aaSAndroid Build Coastguard Worker *lo = _mm256_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa 624*c8dee2aaSAndroid Build Coastguard Worker *hi = _mm256_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA 625*c8dee2aaSAndroid Build Coastguard Worker __m256i rg = _mm256_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG 626*c8dee2aaSAndroid Build Coastguard Worker ba = _mm256_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA 627*c8dee2aaSAndroid Build Coastguard Worker 628*c8dee2aaSAndroid Build Coastguard Worker // Unpack to 16-bit planar. 629*c8dee2aaSAndroid Build Coastguard Worker __m256i r = _mm256_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_ 630*c8dee2aaSAndroid Build Coastguard Worker g = _mm256_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_ 631*c8dee2aaSAndroid Build Coastguard Worker b = _mm256_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_ 632*c8dee2aaSAndroid Build Coastguard Worker a = _mm256_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_ 633*c8dee2aaSAndroid Build Coastguard Worker 634*c8dee2aaSAndroid Build Coastguard Worker // Premultiply! 635*c8dee2aaSAndroid Build Coastguard Worker r = scale(r, a); 636*c8dee2aaSAndroid Build Coastguard Worker g = scale(g, a); 637*c8dee2aaSAndroid Build Coastguard Worker b = scale(b, a); 638*c8dee2aaSAndroid Build Coastguard Worker 639*c8dee2aaSAndroid Build Coastguard Worker // Repack into interlaced pixels. 
640*c8dee2aaSAndroid Build Coastguard Worker rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG 641*c8dee2aaSAndroid Build Coastguard Worker ba = _mm256_or_si256(b, _mm256_slli_epi16(a, 8)); // babababa BABABABA babababa BABABABA 642*c8dee2aaSAndroid Build Coastguard Worker *lo = _mm256_unpacklo_epi16(rg, ba); // rgbargba rgbargba rgbargba rgbargba 643*c8dee2aaSAndroid Build Coastguard Worker *hi = _mm256_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA RGBARGBA RGBARGBA 644*c8dee2aaSAndroid Build Coastguard Worker }; 645*c8dee2aaSAndroid Build Coastguard Worker 646*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 647*c8dee2aaSAndroid Build Coastguard Worker __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)), 648*c8dee2aaSAndroid Build Coastguard Worker hi = _mm256_loadu_si256((const __m256i*) (src + 8)); 649*c8dee2aaSAndroid Build Coastguard Worker 650*c8dee2aaSAndroid Build Coastguard Worker premul8(&lo, &hi); 651*c8dee2aaSAndroid Build Coastguard Worker 652*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) (dst + 0), lo); 653*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) (dst + 8), hi); 654*c8dee2aaSAndroid Build Coastguard Worker 655*c8dee2aaSAndroid Build Coastguard Worker src += 16; 656*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 657*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 658*c8dee2aaSAndroid Build Coastguard Worker } 659*c8dee2aaSAndroid Build Coastguard Worker 660*c8dee2aaSAndroid Build Coastguard Worker if (count >= 8) { 661*c8dee2aaSAndroid Build Coastguard Worker __m256i lo = _mm256_loadu_si256((const __m256i*) src), 662*c8dee2aaSAndroid Build Coastguard Worker hi = _mm256_setzero_si256(); 663*c8dee2aaSAndroid Build Coastguard Worker 664*c8dee2aaSAndroid Build Coastguard Worker premul8(&lo, &hi); 665*c8dee2aaSAndroid Build Coastguard Worker 666*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) dst, lo); 
667*c8dee2aaSAndroid Build Coastguard Worker 668*c8dee2aaSAndroid Build Coastguard Worker src += 8; 669*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 670*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 671*c8dee2aaSAndroid Build Coastguard Worker } 672*c8dee2aaSAndroid Build Coastguard Worker 673*c8dee2aaSAndroid Build Coastguard Worker // Call portable code to finish up the tail of [0,8) pixels. 674*c8dee2aaSAndroid Build Coastguard Worker auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 675*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 676*c8dee2aaSAndroid Build Coastguard Worker} 677*c8dee2aaSAndroid Build Coastguard Worker 678*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 679*c8dee2aaSAndroid Build Coastguard Worker premul_should_swapRB(false, dst, src, count); 680*c8dee2aaSAndroid Build Coastguard Worker} 681*c8dee2aaSAndroid Build Coastguard Worker 682*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 683*c8dee2aaSAndroid Build Coastguard Worker premul_should_swapRB(true, dst, src, count); 684*c8dee2aaSAndroid Build Coastguard Worker} 685*c8dee2aaSAndroid Build Coastguard Worker 686*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 687*c8dee2aaSAndroid Build Coastguard Worker const __m256i swapRB = _mm256_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15, 688*c8dee2aaSAndroid Build Coastguard Worker 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); 689*c8dee2aaSAndroid Build Coastguard Worker 690*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 691*c8dee2aaSAndroid Build Coastguard Worker __m256i rgba = _mm256_loadu_si256((const __m256i*) src); 692*c8dee2aaSAndroid Build Coastguard Worker __m256i bgra = _mm256_shuffle_epi8(rgba, swapRB); 693*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) dst, bgra); 
694*c8dee2aaSAndroid Build Coastguard Worker 695*c8dee2aaSAndroid Build Coastguard Worker src += 8; 696*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 697*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 698*c8dee2aaSAndroid Build Coastguard Worker } 699*c8dee2aaSAndroid Build Coastguard Worker 700*c8dee2aaSAndroid Build Coastguard Worker RGBA_to_BGRA_portable(dst, src, count); 701*c8dee2aaSAndroid Build Coastguard Worker} 702*c8dee2aaSAndroid Build Coastguard Worker 703*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 704*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 705*c8dee2aaSAndroid Build Coastguard Worker __m256i ga = _mm256_loadu_si256((const __m256i*) src); 706*c8dee2aaSAndroid Build Coastguard Worker 707*c8dee2aaSAndroid Build Coastguard Worker __m256i gg = _mm256_or_si256(_mm256_and_si256(ga, _mm256_set1_epi16(0x00FF)), 708*c8dee2aaSAndroid Build Coastguard Worker _mm256_slli_epi16(ga, 8)); 709*c8dee2aaSAndroid Build Coastguard Worker 710*c8dee2aaSAndroid Build Coastguard Worker __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga); 711*c8dee2aaSAndroid Build Coastguard Worker __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga); 712*c8dee2aaSAndroid Build Coastguard Worker 713*c8dee2aaSAndroid Build Coastguard Worker // Shuffle for pixel reorder 714*c8dee2aaSAndroid Build Coastguard Worker // Note. 
'p' stands for 'ggga' 715*c8dee2aaSAndroid Build Coastguard Worker // Before shuffle: 716*c8dee2aaSAndroid Build Coastguard Worker // ggga_lo = p0 p1 p2 p3 | p8 p9 p10 p11 717*c8dee2aaSAndroid Build Coastguard Worker // ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15 718*c8dee2aaSAndroid Build Coastguard Worker // 719*c8dee2aaSAndroid Build Coastguard Worker // After shuffle: 720*c8dee2aaSAndroid Build Coastguard Worker // ggga_lo_shuffle = p0 p1 p2 p3 | p4 p5 p6 p7 721*c8dee2aaSAndroid Build Coastguard Worker // ggga_hi_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15 722*c8dee2aaSAndroid Build Coastguard Worker __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20), 723*c8dee2aaSAndroid Build Coastguard Worker ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31); 724*c8dee2aaSAndroid Build Coastguard Worker 725*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle); 726*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle); 727*c8dee2aaSAndroid Build Coastguard Worker 728*c8dee2aaSAndroid Build Coastguard Worker src += 16*2; 729*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 730*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 731*c8dee2aaSAndroid Build Coastguard Worker } 732*c8dee2aaSAndroid Build Coastguard Worker 733*c8dee2aaSAndroid Build Coastguard Worker grayA_to_RGBA_portable(dst, src, count); 734*c8dee2aaSAndroid Build Coastguard Worker} 735*c8dee2aaSAndroid Build Coastguard Worker 736*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 737*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 738*c8dee2aaSAndroid Build Coastguard Worker __m256i grayA = _mm256_loadu_si256((const __m256i*) src); 739*c8dee2aaSAndroid Build Coastguard Worker 740*c8dee2aaSAndroid Build Coastguard Worker __m256i g0 = _mm256_and_si256(grayA, _mm256_set1_epi16(0x00FF)); 741*c8dee2aaSAndroid 
Build Coastguard Worker __m256i a0 = _mm256_srli_epi16(grayA, 8); 742*c8dee2aaSAndroid Build Coastguard Worker 743*c8dee2aaSAndroid Build Coastguard Worker // Premultiply 744*c8dee2aaSAndroid Build Coastguard Worker g0 = scale(g0, a0); 745*c8dee2aaSAndroid Build Coastguard Worker 746*c8dee2aaSAndroid Build Coastguard Worker __m256i gg = _mm256_or_si256(g0, _mm256_slli_epi16(g0, 8)); 747*c8dee2aaSAndroid Build Coastguard Worker __m256i ga = _mm256_or_si256(g0, _mm256_slli_epi16(a0, 8)); 748*c8dee2aaSAndroid Build Coastguard Worker 749*c8dee2aaSAndroid Build Coastguard Worker __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga); 750*c8dee2aaSAndroid Build Coastguard Worker __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga); 751*c8dee2aaSAndroid Build Coastguard Worker 752*c8dee2aaSAndroid Build Coastguard Worker // Shuffle for pixel reorder, similar as grayA_to_RGBA 753*c8dee2aaSAndroid Build Coastguard Worker __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20), 754*c8dee2aaSAndroid Build Coastguard Worker ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31); 755*c8dee2aaSAndroid Build Coastguard Worker 756*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle); 757*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle); 758*c8dee2aaSAndroid Build Coastguard Worker 759*c8dee2aaSAndroid Build Coastguard Worker src += 16*2; 760*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 761*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 762*c8dee2aaSAndroid Build Coastguard Worker } 763*c8dee2aaSAndroid Build Coastguard Worker 764*c8dee2aaSAndroid Build Coastguard Worker grayA_to_rgbA_portable(dst, src, count); 765*c8dee2aaSAndroid Build Coastguard Worker} 766*c8dee2aaSAndroid Build Coastguard Worker 767*c8dee2aaSAndroid Build Coastguard Workerenum Format { kRGB1, kBGR1 }; 768*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_cmyk_to(Format 
format, uint32_t* dst, const uint32_t* src, int count) { 769*c8dee2aaSAndroid Build Coastguard Worker auto convert8 = [=](__m256i* lo, __m256i* hi) { 770*c8dee2aaSAndroid Build Coastguard Worker const __m256i zeros = _mm256_setzero_si256(); 771*c8dee2aaSAndroid Build Coastguard Worker __m256i planar; 772*c8dee2aaSAndroid Build Coastguard Worker if (kBGR1 == format) { 773*c8dee2aaSAndroid Build Coastguard Worker planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15, 774*c8dee2aaSAndroid Build Coastguard Worker 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); 775*c8dee2aaSAndroid Build Coastguard Worker } else { 776*c8dee2aaSAndroid Build Coastguard Worker planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15, 777*c8dee2aaSAndroid Build Coastguard Worker 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); 778*c8dee2aaSAndroid Build Coastguard Worker } 779*c8dee2aaSAndroid Build Coastguard Worker 780*c8dee2aaSAndroid Build Coastguard Worker // Swizzle the pixels to 8-bit planar. 781*c8dee2aaSAndroid Build Coastguard Worker *lo = _mm256_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk ccccmmmm yyyykkkk 782*c8dee2aaSAndroid Build Coastguard Worker *hi = _mm256_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK 783*c8dee2aaSAndroid Build Coastguard Worker __m256i cm = _mm256_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM 784*c8dee2aaSAndroid Build Coastguard Worker yk = _mm256_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK 785*c8dee2aaSAndroid Build Coastguard Worker 786*c8dee2aaSAndroid Build Coastguard Worker // Unpack to 16-bit planar. 
787*c8dee2aaSAndroid Build Coastguard Worker __m256i c = _mm256_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_ 788*c8dee2aaSAndroid Build Coastguard Worker m = _mm256_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_ 789*c8dee2aaSAndroid Build Coastguard Worker y = _mm256_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_ 790*c8dee2aaSAndroid Build Coastguard Worker k = _mm256_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_ 791*c8dee2aaSAndroid Build Coastguard Worker 792*c8dee2aaSAndroid Build Coastguard Worker // Scale to r, g, b. 793*c8dee2aaSAndroid Build Coastguard Worker __m256i r = scale(c, k), 794*c8dee2aaSAndroid Build Coastguard Worker g = scale(m, k), 795*c8dee2aaSAndroid Build Coastguard Worker b = scale(y, k); 796*c8dee2aaSAndroid Build Coastguard Worker 797*c8dee2aaSAndroid Build Coastguard Worker // Repack into interlaced pixels: 798*c8dee2aaSAndroid Build Coastguard Worker // rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG 799*c8dee2aaSAndroid Build Coastguard Worker // ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1 800*c8dee2aaSAndroid Build Coastguard Worker __m256i rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)), 801*c8dee2aaSAndroid Build Coastguard Worker ba = _mm256_or_si256(b, _mm256_set1_epi16((uint16_t) 0xFF00)); 802*c8dee2aaSAndroid Build Coastguard Worker *lo = _mm256_unpacklo_epi16(rg, ba); // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1 803*c8dee2aaSAndroid Build Coastguard Worker *hi = _mm256_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1 804*c8dee2aaSAndroid Build Coastguard Worker }; 805*c8dee2aaSAndroid Build Coastguard Worker 806*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 807*c8dee2aaSAndroid Build Coastguard Worker __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)), 808*c8dee2aaSAndroid Build Coastguard Worker hi = _mm256_loadu_si256((const __m256i*) (src + 8)); 809*c8dee2aaSAndroid Build Coastguard Worker 810*c8dee2aaSAndroid Build Coastguard 
Worker convert8(&lo, &hi); 811*c8dee2aaSAndroid Build Coastguard Worker 812*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) (dst + 0), lo); 813*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) (dst + 8), hi); 814*c8dee2aaSAndroid Build Coastguard Worker 815*c8dee2aaSAndroid Build Coastguard Worker src += 16; 816*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 817*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 818*c8dee2aaSAndroid Build Coastguard Worker } 819*c8dee2aaSAndroid Build Coastguard Worker 820*c8dee2aaSAndroid Build Coastguard Worker if (count >= 8) { 821*c8dee2aaSAndroid Build Coastguard Worker __m256i lo = _mm256_loadu_si256((const __m256i*) src), 822*c8dee2aaSAndroid Build Coastguard Worker hi = _mm256_setzero_si256(); 823*c8dee2aaSAndroid Build Coastguard Worker 824*c8dee2aaSAndroid Build Coastguard Worker convert8(&lo, &hi); 825*c8dee2aaSAndroid Build Coastguard Worker 826*c8dee2aaSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i*) dst, lo); 827*c8dee2aaSAndroid Build Coastguard Worker 828*c8dee2aaSAndroid Build Coastguard Worker src += 8; 829*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 830*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 831*c8dee2aaSAndroid Build Coastguard Worker } 832*c8dee2aaSAndroid Build Coastguard Worker 833*c8dee2aaSAndroid Build Coastguard Worker auto proc = (kBGR1 == format) ? 
inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable; 834*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 835*c8dee2aaSAndroid Build Coastguard Worker} 836*c8dee2aaSAndroid Build Coastguard Worker 837*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) { 838*c8dee2aaSAndroid Build Coastguard Worker inverted_cmyk_to(kRGB1, dst, src, count); 839*c8dee2aaSAndroid Build Coastguard Worker} 840*c8dee2aaSAndroid Build Coastguard Worker 841*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) { 842*c8dee2aaSAndroid Build Coastguard Worker inverted_cmyk_to(kBGR1, dst, src, count); 843*c8dee2aaSAndroid Build Coastguard Worker} 844*c8dee2aaSAndroid Build Coastguard Worker 845*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 846*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_RGBA_portable(dst, src, count); 847*c8dee2aaSAndroid Build Coastguard Worker} 848*c8dee2aaSAndroid Build Coastguard Worker 849*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 850*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_BGRA_portable(dst, src, count); 851*c8dee2aaSAndroid Build Coastguard Worker} 852*c8dee2aaSAndroid Build Coastguard Worker 853*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 854*c8dee2aaSAndroid Build Coastguard Worker// -- SSSE3 ---------------------------------------------------------------------------------------- 855*c8dee2aaSAndroid Build Coastguard Worker 856*c8dee2aaSAndroid Build Coastguard Worker// Scale a byte by another. 857*c8dee2aaSAndroid Build Coastguard Worker// Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 
858*c8dee2aaSAndroid Build Coastguard Workerstatic __m128i scale(__m128i x, __m128i y) { 859*c8dee2aaSAndroid Build Coastguard Worker const __m128i _128 = _mm_set1_epi16(128); 860*c8dee2aaSAndroid Build Coastguard Worker const __m128i _257 = _mm_set1_epi16(257); 861*c8dee2aaSAndroid Build Coastguard Worker 862*c8dee2aaSAndroid Build Coastguard Worker // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. 863*c8dee2aaSAndroid Build Coastguard Worker return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); 864*c8dee2aaSAndroid Build Coastguard Worker} 865*c8dee2aaSAndroid Build Coastguard Worker 866*c8dee2aaSAndroid Build Coastguard Workerstatic void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) { 867*c8dee2aaSAndroid Build Coastguard Worker 868*c8dee2aaSAndroid Build Coastguard Worker auto premul8 = [=](__m128i* lo, __m128i* hi) { 869*c8dee2aaSAndroid Build Coastguard Worker const __m128i zeros = _mm_setzero_si128(); 870*c8dee2aaSAndroid Build Coastguard Worker __m128i planar; 871*c8dee2aaSAndroid Build Coastguard Worker if (kSwapRB) { 872*c8dee2aaSAndroid Build Coastguard Worker planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); 873*c8dee2aaSAndroid Build Coastguard Worker } else { 874*c8dee2aaSAndroid Build Coastguard Worker planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); 875*c8dee2aaSAndroid Build Coastguard Worker } 876*c8dee2aaSAndroid Build Coastguard Worker 877*c8dee2aaSAndroid Build Coastguard Worker // Swizzle the pixels to 8-bit planar. 
878*c8dee2aaSAndroid Build Coastguard Worker *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa 879*c8dee2aaSAndroid Build Coastguard Worker *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA 880*c8dee2aaSAndroid Build Coastguard Worker __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG 881*c8dee2aaSAndroid Build Coastguard Worker ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA 882*c8dee2aaSAndroid Build Coastguard Worker 883*c8dee2aaSAndroid Build Coastguard Worker // Unpack to 16-bit planar. 884*c8dee2aaSAndroid Build Coastguard Worker __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ 885*c8dee2aaSAndroid Build Coastguard Worker g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ 886*c8dee2aaSAndroid Build Coastguard Worker b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ 887*c8dee2aaSAndroid Build Coastguard Worker a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ 888*c8dee2aaSAndroid Build Coastguard Worker 889*c8dee2aaSAndroid Build Coastguard Worker // Premultiply! 890*c8dee2aaSAndroid Build Coastguard Worker r = scale(r, a); 891*c8dee2aaSAndroid Build Coastguard Worker g = scale(g, a); 892*c8dee2aaSAndroid Build Coastguard Worker b = scale(b, a); 893*c8dee2aaSAndroid Build Coastguard Worker 894*c8dee2aaSAndroid Build Coastguard Worker // Repack into interlaced pixels. 
895*c8dee2aaSAndroid Build Coastguard Worker rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG 896*c8dee2aaSAndroid Build Coastguard Worker ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA 897*c8dee2aaSAndroid Build Coastguard Worker *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba 898*c8dee2aaSAndroid Build Coastguard Worker *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA 899*c8dee2aaSAndroid Build Coastguard Worker }; 900*c8dee2aaSAndroid Build Coastguard Worker 901*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 902*c8dee2aaSAndroid Build Coastguard Worker __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), 903*c8dee2aaSAndroid Build Coastguard Worker hi = _mm_loadu_si128((const __m128i*) (src + 4)); 904*c8dee2aaSAndroid Build Coastguard Worker 905*c8dee2aaSAndroid Build Coastguard Worker premul8(&lo, &hi); 906*c8dee2aaSAndroid Build Coastguard Worker 907*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (dst + 0), lo); 908*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (dst + 4), hi); 909*c8dee2aaSAndroid Build Coastguard Worker 910*c8dee2aaSAndroid Build Coastguard Worker src += 8; 911*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 912*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 913*c8dee2aaSAndroid Build Coastguard Worker } 914*c8dee2aaSAndroid Build Coastguard Worker 915*c8dee2aaSAndroid Build Coastguard Worker if (count >= 4) { 916*c8dee2aaSAndroid Build Coastguard Worker __m128i lo = _mm_loadu_si128((const __m128i*) src), 917*c8dee2aaSAndroid Build Coastguard Worker hi = _mm_setzero_si128(); 918*c8dee2aaSAndroid Build Coastguard Worker 919*c8dee2aaSAndroid Build Coastguard Worker premul8(&lo, &hi); 920*c8dee2aaSAndroid Build Coastguard Worker 921*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) dst, lo); 922*c8dee2aaSAndroid Build Coastguard Worker 923*c8dee2aaSAndroid Build Coastguard Worker src += 4; 
924*c8dee2aaSAndroid Build Coastguard Worker dst += 4; 925*c8dee2aaSAndroid Build Coastguard Worker count -= 4; 926*c8dee2aaSAndroid Build Coastguard Worker } 927*c8dee2aaSAndroid Build Coastguard Worker 928*c8dee2aaSAndroid Build Coastguard Worker // Call portable code to finish up the tail of [0,4) pixels. 929*c8dee2aaSAndroid Build Coastguard Worker auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 930*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 931*c8dee2aaSAndroid Build Coastguard Worker} 932*c8dee2aaSAndroid Build Coastguard Worker 933*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 934*c8dee2aaSAndroid Build Coastguard Worker premul_should_swapRB(false, dst, src, count); 935*c8dee2aaSAndroid Build Coastguard Worker} 936*c8dee2aaSAndroid Build Coastguard Worker 937*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 938*c8dee2aaSAndroid Build Coastguard Worker premul_should_swapRB(true, dst, src, count); 939*c8dee2aaSAndroid Build Coastguard Worker} 940*c8dee2aaSAndroid Build Coastguard Worker 941*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 942*c8dee2aaSAndroid Build Coastguard Worker const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); 943*c8dee2aaSAndroid Build Coastguard Worker 944*c8dee2aaSAndroid Build Coastguard Worker while (count >= 4) { 945*c8dee2aaSAndroid Build Coastguard Worker __m128i rgba = _mm_loadu_si128((const __m128i*) src); 946*c8dee2aaSAndroid Build Coastguard Worker __m128i bgra = _mm_shuffle_epi8(rgba, swapRB); 947*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) dst, bgra); 948*c8dee2aaSAndroid Build Coastguard Worker 949*c8dee2aaSAndroid Build Coastguard Worker src += 4; 950*c8dee2aaSAndroid Build Coastguard Worker dst += 4; 951*c8dee2aaSAndroid Build Coastguard Worker count -= 
4; 952*c8dee2aaSAndroid Build Coastguard Worker } 953*c8dee2aaSAndroid Build Coastguard Worker 954*c8dee2aaSAndroid Build Coastguard Worker RGBA_to_BGRA_portable(dst, src, count); 955*c8dee2aaSAndroid Build Coastguard Worker} 956*c8dee2aaSAndroid Build Coastguard Worker 957*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 958*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 959*c8dee2aaSAndroid Build Coastguard Worker __m128i ga = _mm_loadu_si128((const __m128i*) src); 960*c8dee2aaSAndroid Build Coastguard Worker 961*c8dee2aaSAndroid Build Coastguard Worker __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)), 962*c8dee2aaSAndroid Build Coastguard Worker _mm_slli_epi16(ga, 8)); 963*c8dee2aaSAndroid Build Coastguard Worker 964*c8dee2aaSAndroid Build Coastguard Worker __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga); 965*c8dee2aaSAndroid Build Coastguard Worker __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga); 966*c8dee2aaSAndroid Build Coastguard Worker 967*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo); 968*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); 969*c8dee2aaSAndroid Build Coastguard Worker 970*c8dee2aaSAndroid Build Coastguard Worker src += 8*2; 971*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 972*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 973*c8dee2aaSAndroid Build Coastguard Worker } 974*c8dee2aaSAndroid Build Coastguard Worker 975*c8dee2aaSAndroid Build Coastguard Worker grayA_to_RGBA_portable(dst, src, count); 976*c8dee2aaSAndroid Build Coastguard Worker} 977*c8dee2aaSAndroid Build Coastguard Worker 978*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 979*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 980*c8dee2aaSAndroid Build Coastguard Worker __m128i grayA = _mm_loadu_si128((const __m128i*) src); 
981*c8dee2aaSAndroid Build Coastguard Worker 982*c8dee2aaSAndroid Build Coastguard Worker __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF)); 983*c8dee2aaSAndroid Build Coastguard Worker __m128i a0 = _mm_srli_epi16(grayA, 8); 984*c8dee2aaSAndroid Build Coastguard Worker 985*c8dee2aaSAndroid Build Coastguard Worker // Premultiply 986*c8dee2aaSAndroid Build Coastguard Worker g0 = scale(g0, a0); 987*c8dee2aaSAndroid Build Coastguard Worker 988*c8dee2aaSAndroid Build Coastguard Worker __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8)); 989*c8dee2aaSAndroid Build Coastguard Worker __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8)); 990*c8dee2aaSAndroid Build Coastguard Worker 991*c8dee2aaSAndroid Build Coastguard Worker 992*c8dee2aaSAndroid Build Coastguard Worker __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga); 993*c8dee2aaSAndroid Build Coastguard Worker __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga); 994*c8dee2aaSAndroid Build Coastguard Worker 995*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo); 996*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); 997*c8dee2aaSAndroid Build Coastguard Worker 998*c8dee2aaSAndroid Build Coastguard Worker src += 8*2; 999*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1000*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1001*c8dee2aaSAndroid Build Coastguard Worker } 1002*c8dee2aaSAndroid Build Coastguard Worker 1003*c8dee2aaSAndroid Build Coastguard Worker grayA_to_rgbA_portable(dst, src, count); 1004*c8dee2aaSAndroid Build Coastguard Worker} 1005*c8dee2aaSAndroid Build Coastguard Worker 1006*c8dee2aaSAndroid Build Coastguard Workerenum Format { kRGB1, kBGR1 }; 1007*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) { 1008*c8dee2aaSAndroid Build Coastguard Worker auto convert8 = [=](__m128i* lo, __m128i* hi) { 1009*c8dee2aaSAndroid Build Coastguard Worker const 
__m128i zeros = _mm_setzero_si128(); 1010*c8dee2aaSAndroid Build Coastguard Worker __m128i planar; 1011*c8dee2aaSAndroid Build Coastguard Worker if (kBGR1 == format) { 1012*c8dee2aaSAndroid Build Coastguard Worker planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); 1013*c8dee2aaSAndroid Build Coastguard Worker } else { 1014*c8dee2aaSAndroid Build Coastguard Worker planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); 1015*c8dee2aaSAndroid Build Coastguard Worker } 1016*c8dee2aaSAndroid Build Coastguard Worker 1017*c8dee2aaSAndroid Build Coastguard Worker // Swizzle the pixels to 8-bit planar. 1018*c8dee2aaSAndroid Build Coastguard Worker *lo = _mm_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk 1019*c8dee2aaSAndroid Build Coastguard Worker *hi = _mm_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK 1020*c8dee2aaSAndroid Build Coastguard Worker __m128i cm = _mm_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM 1021*c8dee2aaSAndroid Build Coastguard Worker yk = _mm_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK 1022*c8dee2aaSAndroid Build Coastguard Worker 1023*c8dee2aaSAndroid Build Coastguard Worker // Unpack to 16-bit planar. 1024*c8dee2aaSAndroid Build Coastguard Worker __m128i c = _mm_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_ 1025*c8dee2aaSAndroid Build Coastguard Worker m = _mm_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_ 1026*c8dee2aaSAndroid Build Coastguard Worker y = _mm_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_ 1027*c8dee2aaSAndroid Build Coastguard Worker k = _mm_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_ 1028*c8dee2aaSAndroid Build Coastguard Worker 1029*c8dee2aaSAndroid Build Coastguard Worker // Scale to r, g, b. 
1030*c8dee2aaSAndroid Build Coastguard Worker __m128i r = scale(c, k), 1031*c8dee2aaSAndroid Build Coastguard Worker g = scale(m, k), 1032*c8dee2aaSAndroid Build Coastguard Worker b = scale(y, k); 1033*c8dee2aaSAndroid Build Coastguard Worker 1034*c8dee2aaSAndroid Build Coastguard Worker // Repack into interlaced pixels. 1035*c8dee2aaSAndroid Build Coastguard Worker __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)), // rgrgrgrg RGRGRGRG 1036*c8dee2aaSAndroid Build Coastguard Worker ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1 1037*c8dee2aaSAndroid Build Coastguard Worker *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba 1038*c8dee2aaSAndroid Build Coastguard Worker *hi = _mm_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1 1039*c8dee2aaSAndroid Build Coastguard Worker }; 1040*c8dee2aaSAndroid Build Coastguard Worker 1041*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 1042*c8dee2aaSAndroid Build Coastguard Worker __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), 1043*c8dee2aaSAndroid Build Coastguard Worker hi = _mm_loadu_si128((const __m128i*) (src + 4)); 1044*c8dee2aaSAndroid Build Coastguard Worker 1045*c8dee2aaSAndroid Build Coastguard Worker convert8(&lo, &hi); 1046*c8dee2aaSAndroid Build Coastguard Worker 1047*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (dst + 0), lo); 1048*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (dst + 4), hi); 1049*c8dee2aaSAndroid Build Coastguard Worker 1050*c8dee2aaSAndroid Build Coastguard Worker src += 8; 1051*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1052*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1053*c8dee2aaSAndroid Build Coastguard Worker } 1054*c8dee2aaSAndroid Build Coastguard Worker 1055*c8dee2aaSAndroid Build Coastguard Worker if (count >= 4) { 1056*c8dee2aaSAndroid Build Coastguard Worker __m128i lo = _mm_loadu_si128((const __m128i*) src), 1057*c8dee2aaSAndroid Build Coastguard Worker hi = 
_mm_setzero_si128(); 1058*c8dee2aaSAndroid Build Coastguard Worker 1059*c8dee2aaSAndroid Build Coastguard Worker convert8(&lo, &hi); 1060*c8dee2aaSAndroid Build Coastguard Worker 1061*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) dst, lo); 1062*c8dee2aaSAndroid Build Coastguard Worker 1063*c8dee2aaSAndroid Build Coastguard Worker src += 4; 1064*c8dee2aaSAndroid Build Coastguard Worker dst += 4; 1065*c8dee2aaSAndroid Build Coastguard Worker count -= 4; 1066*c8dee2aaSAndroid Build Coastguard Worker } 1067*c8dee2aaSAndroid Build Coastguard Worker 1068*c8dee2aaSAndroid Build Coastguard Worker auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable; 1069*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 1070*c8dee2aaSAndroid Build Coastguard Worker} 1071*c8dee2aaSAndroid Build Coastguard Worker 1072*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) { 1073*c8dee2aaSAndroid Build Coastguard Worker inverted_cmyk_to(kRGB1, dst, src, count); 1074*c8dee2aaSAndroid Build Coastguard Worker} 1075*c8dee2aaSAndroid Build Coastguard Worker 1076*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) { 1077*c8dee2aaSAndroid Build Coastguard Worker inverted_cmyk_to(kBGR1, dst, src, count); 1078*c8dee2aaSAndroid Build Coastguard Worker} 1079*c8dee2aaSAndroid Build Coastguard Worker 1080*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 1081*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_RGBA_portable(dst, src, count); 1082*c8dee2aaSAndroid Build Coastguard Worker} 1083*c8dee2aaSAndroid Build Coastguard Worker 1084*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 1085*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_BGRA_portable(dst, src, count); 1086*c8dee2aaSAndroid 
Build Coastguard Worker} 1087*c8dee2aaSAndroid Build Coastguard Worker 1088*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX 1089*c8dee2aaSAndroid Build Coastguard Worker// -- LASX ---------------------------------------------------------------------------------------- 1090*c8dee2aaSAndroid Build Coastguard Worker 1091*c8dee2aaSAndroid Build Coastguard Worker// Scale a byte by another. 1092*c8dee2aaSAndroid Build Coastguard Worker// Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 1093*c8dee2aaSAndroid Build Coastguard Worker// (x+127)/255 == ((x+128)*257)>>16 1094*c8dee2aaSAndroid Build Coastguard WorkerSI __m256i scale(__m256i x, __m256i y) { 1095*c8dee2aaSAndroid Build Coastguard Worker const __m256i _128 = __lasx_xvreplgr2vr_h(128); 1096*c8dee2aaSAndroid Build Coastguard Worker const __m256i _257 = __lasx_xvreplgr2vr_h(257); 1097*c8dee2aaSAndroid Build Coastguard Worker 1098*c8dee2aaSAndroid Build Coastguard Worker // (x+127)/255 == ((x+128)*257)>>16 1099*c8dee2aaSAndroid Build Coastguard Worker return __lasx_xvmuh_hu(__lasx_xvadd_h(__lasx_xvmul_h(x, y), _128), _257); 1100*c8dee2aaSAndroid Build Coastguard Worker} 1101*c8dee2aaSAndroid Build Coastguard Worker 1102*c8dee2aaSAndroid Build Coastguard Workerstatic void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) { 1103*c8dee2aaSAndroid Build Coastguard Worker auto premul8 = [=](__m256i* lo, __m256i* hi) { 1104*c8dee2aaSAndroid Build Coastguard Worker const __m256i zeros = __lasx_xvldi(0); 1105*c8dee2aaSAndroid Build Coastguard Worker __m256i planar = __lasx_xvldi(0); 1106*c8dee2aaSAndroid Build Coastguard Worker if (kSwapRB) { 1107*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,0); 1108*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,1); 1109*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 
0x0d0905010e0a0602 ,2); 1110*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,3); 1111*c8dee2aaSAndroid Build Coastguard Worker } else { 1112*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,0); 1113*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,1); 1114*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,2); 1115*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,3); 1116*c8dee2aaSAndroid Build Coastguard Worker } 1117*c8dee2aaSAndroid Build Coastguard Worker 1118*c8dee2aaSAndroid Build Coastguard Worker // Swizzle the pixels to 8-bit planar. 1119*c8dee2aaSAndroid Build Coastguard Worker *lo = __lasx_xvshuf_b(zeros, *lo, planar); // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa 1120*c8dee2aaSAndroid Build Coastguard Worker *hi = __lasx_xvshuf_b(zeros, *hi, planar); // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA 1121*c8dee2aaSAndroid Build Coastguard Worker __m256i rg = __lasx_xvilvl_w(*hi, *lo), // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG 1122*c8dee2aaSAndroid Build Coastguard Worker ba = __lasx_xvilvh_w(*hi, *lo); // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA 1123*c8dee2aaSAndroid Build Coastguard Worker 1124*c8dee2aaSAndroid Build Coastguard Worker // Unpack to 16-bit planar. 1125*c8dee2aaSAndroid Build Coastguard Worker __m256i r = __lasx_xvilvl_b(zeros, rg), // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_ 1126*c8dee2aaSAndroid Build Coastguard Worker g = __lasx_xvilvh_b(zeros, rg), // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_ 1127*c8dee2aaSAndroid Build Coastguard Worker b = __lasx_xvilvl_b(zeros, ba), // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_ 1128*c8dee2aaSAndroid Build Coastguard Worker a = __lasx_xvilvh_b(zeros, ba); // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_ 1129*c8dee2aaSAndroid Build Coastguard Worker 1130*c8dee2aaSAndroid Build Coastguard Worker // Premultiply! 
1131*c8dee2aaSAndroid Build Coastguard Worker r = scale(r, a); 1132*c8dee2aaSAndroid Build Coastguard Worker g = scale(g, a); 1133*c8dee2aaSAndroid Build Coastguard Worker b = scale(b, a); 1134*c8dee2aaSAndroid Build Coastguard Worker 1135*c8dee2aaSAndroid Build Coastguard Worker // Repack into interlaced pixels. 1136*c8dee2aaSAndroid Build Coastguard Worker rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8)); // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG 1137*c8dee2aaSAndroid Build Coastguard Worker ba = __lasx_xvor_v(b, __lasx_xvslli_h(a, 8)); // babababa BABABABA babababa BABABABA 1138*c8dee2aaSAndroid Build Coastguard Worker *lo = __lasx_xvilvl_h(ba, rg); // rgbargba rgbargba rgbargba rgbargba 1139*c8dee2aaSAndroid Build Coastguard Worker *hi = __lasx_xvilvh_h(ba, rg); // RGBARGBA RGBARGBA RGBARGBA RGBARGBA 1140*c8dee2aaSAndroid Build Coastguard Worker }; 1141*c8dee2aaSAndroid Build Coastguard Worker 1142*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 1143*c8dee2aaSAndroid Build Coastguard Worker __m256i lo = __lasx_xvld(src, 0), 1144*c8dee2aaSAndroid Build Coastguard Worker hi = __lasx_xvld(src, 32); 1145*c8dee2aaSAndroid Build Coastguard Worker 1146*c8dee2aaSAndroid Build Coastguard Worker premul8(&lo, &hi); 1147*c8dee2aaSAndroid Build Coastguard Worker 1148*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(lo, dst, 0); 1149*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(hi, dst, 32); 1150*c8dee2aaSAndroid Build Coastguard Worker 1151*c8dee2aaSAndroid Build Coastguard Worker src += 16; 1152*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 1153*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 1154*c8dee2aaSAndroid Build Coastguard Worker } 1155*c8dee2aaSAndroid Build Coastguard Worker 1156*c8dee2aaSAndroid Build Coastguard Worker if (count >= 8) { 1157*c8dee2aaSAndroid Build Coastguard Worker __m256i lo = __lasx_xvld(src, 0), 1158*c8dee2aaSAndroid Build Coastguard Worker hi = __lasx_xvldi(0); 1159*c8dee2aaSAndroid Build Coastguard Worker 
1160*c8dee2aaSAndroid Build Coastguard Worker premul8(&lo, &hi); 1161*c8dee2aaSAndroid Build Coastguard Worker 1162*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(lo, dst, 0); 1163*c8dee2aaSAndroid Build Coastguard Worker 1164*c8dee2aaSAndroid Build Coastguard Worker src += 8; 1165*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1166*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1167*c8dee2aaSAndroid Build Coastguard Worker } 1168*c8dee2aaSAndroid Build Coastguard Worker 1169*c8dee2aaSAndroid Build Coastguard Worker // Call portable code to finish up the tail of [0,4) pixels. 1170*c8dee2aaSAndroid Build Coastguard Worker auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 1171*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 1172*c8dee2aaSAndroid Build Coastguard Worker} 1173*c8dee2aaSAndroid Build Coastguard Worker 1174*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 1175*c8dee2aaSAndroid Build Coastguard Worker premul_should_swapRB(false, dst, src, count); 1176*c8dee2aaSAndroid Build Coastguard Worker} 1177*c8dee2aaSAndroid Build Coastguard Worker 1178*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 1179*c8dee2aaSAndroid Build Coastguard Worker premul_should_swapRB(true, dst, src, count); 1180*c8dee2aaSAndroid Build Coastguard Worker} 1181*c8dee2aaSAndroid Build Coastguard Worker 1182*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 1183*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 1184*c8dee2aaSAndroid Build Coastguard Worker __m256i rgba = __lasx_xvld(src, 0); 1185*c8dee2aaSAndroid Build Coastguard Worker __m256i bgra = __lasx_xvshuf4i_b(rgba, 0xC6); 1186*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(bgra, dst, 0); 1187*c8dee2aaSAndroid Build Coastguard 
Worker 1188*c8dee2aaSAndroid Build Coastguard Worker src += 8; 1189*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1190*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1191*c8dee2aaSAndroid Build Coastguard Worker } 1192*c8dee2aaSAndroid Build Coastguard Worker 1193*c8dee2aaSAndroid Build Coastguard Worker RGBA_to_BGRA_portable(dst, src, count); 1194*c8dee2aaSAndroid Build Coastguard Worker} 1195*c8dee2aaSAndroid Build Coastguard Worker 1196*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 1197*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 1198*c8dee2aaSAndroid Build Coastguard Worker __m256i ga = __lasx_xvld(src, 0); 1199*c8dee2aaSAndroid Build Coastguard Worker 1200*c8dee2aaSAndroid Build Coastguard Worker __m256i gg = __lasx_xvor_v(__lasx_xvand_v(ga, __lasx_xvreplgr2vr_h(0x00FF)), 1201*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvslli_h(ga, 8)); 1202*c8dee2aaSAndroid Build Coastguard Worker 1203*c8dee2aaSAndroid Build Coastguard Worker __m256i ggga_lo = __lasx_xvilvl_h(ga, gg); 1204*c8dee2aaSAndroid Build Coastguard Worker __m256i ggga_hi = __lasx_xvilvh_h(ga, gg); 1205*c8dee2aaSAndroid Build Coastguard Worker 1206*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02), dst, 0); 1207*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13), dst, 32); 1208*c8dee2aaSAndroid Build Coastguard Worker 1209*c8dee2aaSAndroid Build Coastguard Worker src += 16*2; 1210*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 1211*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 1212*c8dee2aaSAndroid Build Coastguard Worker } 1213*c8dee2aaSAndroid Build Coastguard Worker 1214*c8dee2aaSAndroid Build Coastguard Worker grayA_to_RGBA_portable(dst, src, count); 1215*c8dee2aaSAndroid Build Coastguard Worker} 1216*c8dee2aaSAndroid Build Coastguard Worker 1217*c8dee2aaSAndroid Build Coastguard 
Worker/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 1218*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 1219*c8dee2aaSAndroid Build Coastguard Worker __m256i grayA = __lasx_xvld(src, 0); 1220*c8dee2aaSAndroid Build Coastguard Worker 1221*c8dee2aaSAndroid Build Coastguard Worker __m256i val = __lasx_xvreplgr2vr_h(0x00FF); 1222*c8dee2aaSAndroid Build Coastguard Worker 1223*c8dee2aaSAndroid Build Coastguard Worker __m256i g0 = __lasx_xvand_v(grayA, val); 1224*c8dee2aaSAndroid Build Coastguard Worker __m256i a0 = __lasx_xvsrli_h(grayA, 8); 1225*c8dee2aaSAndroid Build Coastguard Worker 1226*c8dee2aaSAndroid Build Coastguard Worker // Premultiply 1227*c8dee2aaSAndroid Build Coastguard Worker g0 = scale(g0, a0); 1228*c8dee2aaSAndroid Build Coastguard Worker 1229*c8dee2aaSAndroid Build Coastguard Worker __m256i gg = __lasx_xvor_v(g0, __lasx_xvslli_h(g0, 8)); 1230*c8dee2aaSAndroid Build Coastguard Worker __m256i ga = __lasx_xvor_v(g0, __lasx_xvslli_h(a0, 8)); 1231*c8dee2aaSAndroid Build Coastguard Worker 1232*c8dee2aaSAndroid Build Coastguard Worker __m256i ggga_lo = __lasx_xvilvl_h(ga, gg); 1233*c8dee2aaSAndroid Build Coastguard Worker __m256i ggga_hi = __lasx_xvilvh_h(ga, gg); 1234*c8dee2aaSAndroid Build Coastguard Worker 1235*c8dee2aaSAndroid Build Coastguard Worker val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02); 1236*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(val, dst, 0); 1237*c8dee2aaSAndroid Build Coastguard Worker 1238*c8dee2aaSAndroid Build Coastguard Worker val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13); 1239*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(val, dst, 32); 1240*c8dee2aaSAndroid Build Coastguard Worker 1241*c8dee2aaSAndroid Build Coastguard Worker src += 16*2; 1242*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 1243*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 1244*c8dee2aaSAndroid Build Coastguard Worker } 1245*c8dee2aaSAndroid Build Coastguard Worker 
1246*c8dee2aaSAndroid Build Coastguard Worker grayA_to_rgbA_portable(dst, src, count); 1247*c8dee2aaSAndroid Build Coastguard Worker} 1248*c8dee2aaSAndroid Build Coastguard Worker 1249*c8dee2aaSAndroid Build Coastguard Workerenum Format { kRGB1, kBGR1 }; 1250*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) { 1251*c8dee2aaSAndroid Build Coastguard Worker auto convert8 = [=](__m256i *lo, __m256i* hi) { 1252*c8dee2aaSAndroid Build Coastguard Worker const __m256i zeros = __lasx_xvldi(0); 1253*c8dee2aaSAndroid Build Coastguard Worker __m256i planar = __lasx_xvldi(0); 1254*c8dee2aaSAndroid Build Coastguard Worker if (kBGR1 == format) { 1255*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,0); 1256*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,1); 1257*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,2); 1258*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,3); 1259*c8dee2aaSAndroid Build Coastguard Worker } else { 1260*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,0); 1261*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,1); 1262*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,2); 1263*c8dee2aaSAndroid Build Coastguard Worker planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,3); 1264*c8dee2aaSAndroid Build Coastguard Worker } 1265*c8dee2aaSAndroid Build Coastguard Worker 1266*c8dee2aaSAndroid Build Coastguard Worker // Swizzle the pixels to 8-bit planar. 
1267*c8dee2aaSAndroid Build Coastguard Worker *lo = __lasx_xvshuf_b(zeros, *lo, planar); // ccccmmmm yyyykkkk ccccmmmm yyyykkkk 1268*c8dee2aaSAndroid Build Coastguard Worker *hi = __lasx_xvshuf_b(zeros, *hi, planar); // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK 1269*c8dee2aaSAndroid Build Coastguard Worker __m256i cm = __lasx_xvilvl_w(*hi, *lo), // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM 1270*c8dee2aaSAndroid Build Coastguard Worker yk = __lasx_xvilvh_w(*hi, *lo); // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK 1271*c8dee2aaSAndroid Build Coastguard Worker 1272*c8dee2aaSAndroid Build Coastguard Worker // Unpack to 16-bit planar. 1273*c8dee2aaSAndroid Build Coastguard Worker __m256i c = __lasx_xvilvl_b(zeros, cm), // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_ 1274*c8dee2aaSAndroid Build Coastguard Worker m = __lasx_xvilvh_b(zeros, cm), // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_ 1275*c8dee2aaSAndroid Build Coastguard Worker y = __lasx_xvilvl_b(zeros, yk), // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_ 1276*c8dee2aaSAndroid Build Coastguard Worker k = __lasx_xvilvh_b(zeros, yk); // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_ 1277*c8dee2aaSAndroid Build Coastguard Worker 1278*c8dee2aaSAndroid Build Coastguard Worker // Scale to r, g, b. 
1279*c8dee2aaSAndroid Build Coastguard Worker __m256i r = scale(c, k), 1280*c8dee2aaSAndroid Build Coastguard Worker g = scale(m, k), 1281*c8dee2aaSAndroid Build Coastguard Worker b = scale(y, k); 1282*c8dee2aaSAndroid Build Coastguard Worker 1283*c8dee2aaSAndroid Build Coastguard Worker // Repack into interlaced pixels: 1284*c8dee2aaSAndroid Build Coastguard Worker // rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG 1285*c8dee2aaSAndroid Build Coastguard Worker // ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1 1286*c8dee2aaSAndroid Build Coastguard Worker __m256i rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8)), 1287*c8dee2aaSAndroid Build Coastguard Worker ba = __lasx_xvor_v(b, __lasx_xvreplgr2vr_h(0xff00)); 1288*c8dee2aaSAndroid Build Coastguard Worker *lo = __lasx_xvilvl_h(ba, rg); // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1 1289*c8dee2aaSAndroid Build Coastguard Worker *hi = __lasx_xvilvh_h(ba, rg); // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1 1290*c8dee2aaSAndroid Build Coastguard Worker }; 1291*c8dee2aaSAndroid Build Coastguard Worker 1292*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 1293*c8dee2aaSAndroid Build Coastguard Worker __m256i lo = __lasx_xvld(src, 0), 1294*c8dee2aaSAndroid Build Coastguard Worker hi = __lasx_xvld(src, 32); 1295*c8dee2aaSAndroid Build Coastguard Worker 1296*c8dee2aaSAndroid Build Coastguard Worker convert8(&lo, &hi); 1297*c8dee2aaSAndroid Build Coastguard Worker 1298*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(lo, dst, 0); 1299*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(hi, dst, 32); 1300*c8dee2aaSAndroid Build Coastguard Worker 1301*c8dee2aaSAndroid Build Coastguard Worker src += 16; 1302*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 1303*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 1304*c8dee2aaSAndroid Build Coastguard Worker } 1305*c8dee2aaSAndroid Build Coastguard Worker 1306*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 1307*c8dee2aaSAndroid Build Coastguard Worker __m256i lo = 
__lasx_xvld(src, 0), 1308*c8dee2aaSAndroid Build Coastguard Worker hi = __lasx_xvldi(0); 1309*c8dee2aaSAndroid Build Coastguard Worker 1310*c8dee2aaSAndroid Build Coastguard Worker convert8(&lo, &hi); 1311*c8dee2aaSAndroid Build Coastguard Worker 1312*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(lo, dst, 0); 1313*c8dee2aaSAndroid Build Coastguard Worker 1314*c8dee2aaSAndroid Build Coastguard Worker src += 8; 1315*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1316*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1317*c8dee2aaSAndroid Build Coastguard Worker } 1318*c8dee2aaSAndroid Build Coastguard Worker 1319*c8dee2aaSAndroid Build Coastguard Worker auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable; 1320*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 1321*c8dee2aaSAndroid Build Coastguard Worker} 1322*c8dee2aaSAndroid Build Coastguard Worker 1323*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) { 1324*c8dee2aaSAndroid Build Coastguard Worker inverted_cmyk_to(kRGB1, dst, src, count); 1325*c8dee2aaSAndroid Build Coastguard Worker} 1326*c8dee2aaSAndroid Build Coastguard Worker 1327*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) { 1328*c8dee2aaSAndroid Build Coastguard Worker inverted_cmyk_to(kBGR1, dst, src, count); 1329*c8dee2aaSAndroid Build Coastguard Worker} 1330*c8dee2aaSAndroid Build Coastguard Worker 1331*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 1332*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_RGBA_portable(dst, src, count); 1333*c8dee2aaSAndroid Build Coastguard Worker} 1334*c8dee2aaSAndroid Build Coastguard Worker 1335*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void rgbA_to_BGRA(uint32_t* dst, 
const uint32_t* src, int count) { 1336*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_BGRA_portable(dst, src, count); 1337*c8dee2aaSAndroid Build Coastguard Worker} 1338*c8dee2aaSAndroid Build Coastguard Worker 1339*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX 1340*c8dee2aaSAndroid Build Coastguard Worker// -- LSX ----------------------------------------------------------------------------------------- 1341*c8dee2aaSAndroid Build Coastguard Worker 1342*c8dee2aaSAndroid Build Coastguard Worker// Scale a byte by another. 1343*c8dee2aaSAndroid Build Coastguard Worker// Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 1344*c8dee2aaSAndroid Build Coastguard WorkerSI __m128i scale(__m128i x, __m128i y) { 1345*c8dee2aaSAndroid Build Coastguard Worker const __m128i _128 = __lsx_vreplgr2vr_h(128); 1346*c8dee2aaSAndroid Build Coastguard Worker const __m128i _257 = __lsx_vreplgr2vr_h(257); 1347*c8dee2aaSAndroid Build Coastguard Worker 1348*c8dee2aaSAndroid Build Coastguard Worker // (x+127)/255 == ((x+128)*257)>>16 1349*c8dee2aaSAndroid Build Coastguard Worker return __lsx_vmuh_hu(__lsx_vadd_h(__lsx_vmul_h(x, y), _128), _257); 1350*c8dee2aaSAndroid Build Coastguard Worker} 1351*c8dee2aaSAndroid Build Coastguard Worker 1352*c8dee2aaSAndroid Build Coastguard Workerstatic void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) { 1353*c8dee2aaSAndroid Build Coastguard Worker 1354*c8dee2aaSAndroid Build Coastguard Worker auto premul8 = [=](__m128i *lo, __m128i *hi){ 1355*c8dee2aaSAndroid Build Coastguard Worker const __m128i zeros = __lsx_vldi(0); 1356*c8dee2aaSAndroid Build Coastguard Worker __m128i planar = __lsx_vldi(0); 1357*c8dee2aaSAndroid Build Coastguard Worker if (kSwapRB) { 1358*c8dee2aaSAndroid Build Coastguard Worker planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0); 1359*c8dee2aaSAndroid Build Coastguard Worker planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1); 
1360*c8dee2aaSAndroid Build Coastguard Worker } else { 1361*c8dee2aaSAndroid Build Coastguard Worker planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0); 1362*c8dee2aaSAndroid Build Coastguard Worker planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1); 1363*c8dee2aaSAndroid Build Coastguard Worker } 1364*c8dee2aaSAndroid Build Coastguard Worker 1365*c8dee2aaSAndroid Build Coastguard Worker // Swizzle the pixels to 8-bit planar. 1366*c8dee2aaSAndroid Build Coastguard Worker *lo = __lsx_vshuf_b(zeros, *lo, planar); // rrrrgggg bbbbaaaa 1367*c8dee2aaSAndroid Build Coastguard Worker *hi = __lsx_vshuf_b(zeros, *hi, planar); // RRRRGGGG BBBBAAAA 1368*c8dee2aaSAndroid Build Coastguard Worker __m128i rg = __lsx_vilvl_w(*hi, *lo), // rrrrRRRR ggggGGGG 1369*c8dee2aaSAndroid Build Coastguard Worker ba = __lsx_vilvh_w(*hi, *lo); // bbbbBBBB aaaaAAAA 1370*c8dee2aaSAndroid Build Coastguard Worker 1371*c8dee2aaSAndroid Build Coastguard Worker // Unpack to 16-bit planar. 1372*c8dee2aaSAndroid Build Coastguard Worker __m128i r = __lsx_vilvl_b(zeros, rg), // r_r_r_r_ R_R_R_R_ 1373*c8dee2aaSAndroid Build Coastguard Worker g = __lsx_vilvh_b(zeros, rg), // g_g_g_g_ G_G_G_G_ 1374*c8dee2aaSAndroid Build Coastguard Worker b = __lsx_vilvl_b(zeros, ba), // b_b_b_b_ B_B_B_B_ 1375*c8dee2aaSAndroid Build Coastguard Worker a = __lsx_vilvh_b(zeros, ba); // a_a_a_a_ A_A_A_A_ 1376*c8dee2aaSAndroid Build Coastguard Worker 1377*c8dee2aaSAndroid Build Coastguard Worker // Premultiply! 1378*c8dee2aaSAndroid Build Coastguard Worker r = scale(r, a); 1379*c8dee2aaSAndroid Build Coastguard Worker g = scale(g, a); 1380*c8dee2aaSAndroid Build Coastguard Worker b = scale(b, a); 1381*c8dee2aaSAndroid Build Coastguard Worker 1382*c8dee2aaSAndroid Build Coastguard Worker // Repack into interlaced pixels. 
1383*c8dee2aaSAndroid Build Coastguard Worker rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8)); // rgrgrgrg RGRGRGRG 1384*c8dee2aaSAndroid Build Coastguard Worker ba = __lsx_vor_v(b, __lsx_vslli_h(a, 8)); // babababa BABABABA 1385*c8dee2aaSAndroid Build Coastguard Worker *lo = __lsx_vilvl_h(ba, rg); // rgbargba rgbargba 1386*c8dee2aaSAndroid Build Coastguard Worker *hi = __lsx_vilvh_h(ba, rg); // RGBARGBA RGBARGBA 1387*c8dee2aaSAndroid Build Coastguard Worker }; 1388*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 1389*c8dee2aaSAndroid Build Coastguard Worker __m128i lo = __lsx_vld(src ,0), 1390*c8dee2aaSAndroid Build Coastguard Worker hi = __lsx_vld(src ,16); 1391*c8dee2aaSAndroid Build Coastguard Worker 1392*c8dee2aaSAndroid Build Coastguard Worker premul8(&lo, &hi); 1393*c8dee2aaSAndroid Build Coastguard Worker 1394*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(lo, dst, 0); 1395*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(hi, dst, 16); 1396*c8dee2aaSAndroid Build Coastguard Worker 1397*c8dee2aaSAndroid Build Coastguard Worker src += 8; 1398*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1399*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1400*c8dee2aaSAndroid Build Coastguard Worker } 1401*c8dee2aaSAndroid Build Coastguard Worker 1402*c8dee2aaSAndroid Build Coastguard Worker if (count >= 4) { 1403*c8dee2aaSAndroid Build Coastguard Worker __m128i lo = __lsx_vld(src, 0), 1404*c8dee2aaSAndroid Build Coastguard Worker hi = __lsx_vldi(0); 1405*c8dee2aaSAndroid Build Coastguard Worker 1406*c8dee2aaSAndroid Build Coastguard Worker premul8(&lo, &hi); 1407*c8dee2aaSAndroid Build Coastguard Worker 1408*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(lo, dst, 0); 1409*c8dee2aaSAndroid Build Coastguard Worker 1410*c8dee2aaSAndroid Build Coastguard Worker src += 4; 1411*c8dee2aaSAndroid Build Coastguard Worker dst += 4; 1412*c8dee2aaSAndroid Build Coastguard Worker count -= 4; 1413*c8dee2aaSAndroid Build Coastguard Worker } 
1414*c8dee2aaSAndroid Build Coastguard Worker 1415*c8dee2aaSAndroid Build Coastguard Worker // Call portable code to finish up the tail of [0,4) pixels. 1416*c8dee2aaSAndroid Build Coastguard Worker auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 1417*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 1418*c8dee2aaSAndroid Build Coastguard Worker} 1419*c8dee2aaSAndroid Build Coastguard Worker 1420*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 1421*c8dee2aaSAndroid Build Coastguard Worker premul_should_swapRB(false, dst, src, count); 1422*c8dee2aaSAndroid Build Coastguard Worker} 1423*c8dee2aaSAndroid Build Coastguard Worker 1424*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 1425*c8dee2aaSAndroid Build Coastguard Worker premul_should_swapRB(true, dst, src, count); 1426*c8dee2aaSAndroid Build Coastguard Worker} 1427*c8dee2aaSAndroid Build Coastguard Worker 1428*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 1429*c8dee2aaSAndroid Build Coastguard Worker __m128i swapRB = __lsx_vldi(0); 1430*c8dee2aaSAndroid Build Coastguard Worker swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0704050603000102, 0); 1431*c8dee2aaSAndroid Build Coastguard Worker swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0f0c0d0e0b08090a, 1); 1432*c8dee2aaSAndroid Build Coastguard Worker 1433*c8dee2aaSAndroid Build Coastguard Worker while (count >= 4) { 1434*c8dee2aaSAndroid Build Coastguard Worker __m128i rgba = __lsx_vld(src, 0); 1435*c8dee2aaSAndroid Build Coastguard Worker __m128i bgra = __lsx_vshuf4i_b(rgba, 0xC6); 1436*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(bgra, dst, 0); 1437*c8dee2aaSAndroid Build Coastguard Worker 1438*c8dee2aaSAndroid Build Coastguard Worker src += 4; 1439*c8dee2aaSAndroid Build Coastguard Worker dst += 
4; 1440*c8dee2aaSAndroid Build Coastguard Worker count -= 4; 1441*c8dee2aaSAndroid Build Coastguard Worker } 1442*c8dee2aaSAndroid Build Coastguard Worker 1443*c8dee2aaSAndroid Build Coastguard Worker RGBA_to_BGRA_portable(dst, src, count); 1444*c8dee2aaSAndroid Build Coastguard Worker} 1445*c8dee2aaSAndroid Build Coastguard Worker 1446*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 1447*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 1448*c8dee2aaSAndroid Build Coastguard Worker __m128i ga = __lsx_vld(src, 0); 1449*c8dee2aaSAndroid Build Coastguard Worker 1450*c8dee2aaSAndroid Build Coastguard Worker __m128i gg = __lsx_vor_v(__lsx_vand_v(ga, __lsx_vreplgr2vr_h(0x00FF)), 1451*c8dee2aaSAndroid Build Coastguard Worker __lsx_vslli_h(ga, 8)); 1452*c8dee2aaSAndroid Build Coastguard Worker 1453*c8dee2aaSAndroid Build Coastguard Worker __m128i ggga_lo = __lsx_vilvl_h(ga, gg); 1454*c8dee2aaSAndroid Build Coastguard Worker __m128i ggga_hi = __lsx_vilvh_h(ga, gg); 1455*c8dee2aaSAndroid Build Coastguard Worker 1456*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(ggga_lo, dst, 0); 1457*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(ggga_hi, dst, 16); 1458*c8dee2aaSAndroid Build Coastguard Worker 1459*c8dee2aaSAndroid Build Coastguard Worker src += 8*2; 1460*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1461*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1462*c8dee2aaSAndroid Build Coastguard Worker } 1463*c8dee2aaSAndroid Build Coastguard Worker 1464*c8dee2aaSAndroid Build Coastguard Worker grayA_to_RGBA_portable(dst, src, count); 1465*c8dee2aaSAndroid Build Coastguard Worker} 1466*c8dee2aaSAndroid Build Coastguard Worker 1467*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 1468*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 1469*c8dee2aaSAndroid Build Coastguard 
Worker __m128i grayA = __lsx_vld(src, 0); 1470*c8dee2aaSAndroid Build Coastguard Worker 1471*c8dee2aaSAndroid Build Coastguard Worker __m128i g0 = __lsx_vand_v(grayA, __lsx_vreplgr2vr_h(0x00FF)); 1472*c8dee2aaSAndroid Build Coastguard Worker __m128i a0 = __lsx_vsrli_h(grayA, 8); 1473*c8dee2aaSAndroid Build Coastguard Worker 1474*c8dee2aaSAndroid Build Coastguard Worker // Premultiply 1475*c8dee2aaSAndroid Build Coastguard Worker g0 = scale(g0, a0); 1476*c8dee2aaSAndroid Build Coastguard Worker 1477*c8dee2aaSAndroid Build Coastguard Worker __m128i gg = __lsx_vor_v(g0, __lsx_vslli_h(g0, 8)); 1478*c8dee2aaSAndroid Build Coastguard Worker __m128i ga = __lsx_vor_v(g0, __lsx_vslli_h(a0, 8)); 1479*c8dee2aaSAndroid Build Coastguard Worker 1480*c8dee2aaSAndroid Build Coastguard Worker __m128i ggga_lo = __lsx_vilvl_h(ga, gg); 1481*c8dee2aaSAndroid Build Coastguard Worker __m128i ggga_hi = __lsx_vilvh_h(ga, gg); 1482*c8dee2aaSAndroid Build Coastguard Worker 1483*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(ggga_lo, dst, 0); 1484*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(ggga_hi, dst, 16); 1485*c8dee2aaSAndroid Build Coastguard Worker 1486*c8dee2aaSAndroid Build Coastguard Worker src += 8*2; 1487*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1488*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1489*c8dee2aaSAndroid Build Coastguard Worker } 1490*c8dee2aaSAndroid Build Coastguard Worker 1491*c8dee2aaSAndroid Build Coastguard Worker grayA_to_rgbA_portable(dst, src, count); 1492*c8dee2aaSAndroid Build Coastguard Worker} 1493*c8dee2aaSAndroid Build Coastguard Worker 1494*c8dee2aaSAndroid Build Coastguard Workerenum Format { kRGB1, kBGR1 }; 1495*c8dee2aaSAndroid Build Coastguard Workerstatic void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) { 1496*c8dee2aaSAndroid Build Coastguard Worker auto convert8 = [=](__m128i *lo, __m128i* hi) { 1497*c8dee2aaSAndroid Build Coastguard Worker const __m128i zeros = __lsx_vldi(0); 
1498*c8dee2aaSAndroid Build Coastguard Worker __m128i planar = __lsx_vldi(0); 1499*c8dee2aaSAndroid Build Coastguard Worker if (kBGR1 == format) { 1500*c8dee2aaSAndroid Build Coastguard Worker planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0); 1501*c8dee2aaSAndroid Build Coastguard Worker planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1); 1502*c8dee2aaSAndroid Build Coastguard Worker } else { 1503*c8dee2aaSAndroid Build Coastguard Worker planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0); 1504*c8dee2aaSAndroid Build Coastguard Worker planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1); 1505*c8dee2aaSAndroid Build Coastguard Worker } 1506*c8dee2aaSAndroid Build Coastguard Worker 1507*c8dee2aaSAndroid Build Coastguard Worker // Swizzle the pixels to 8-bit planar. 1508*c8dee2aaSAndroid Build Coastguard Worker *lo = __lsx_vshuf_b(zeros, *lo, planar); // ccccmmmm yyyykkkk 1509*c8dee2aaSAndroid Build Coastguard Worker *hi = __lsx_vshuf_b(zeros, *hi, planar); // CCCCMMMM YYYYKKKK 1510*c8dee2aaSAndroid Build Coastguard Worker __m128i cm = __lsx_vilvl_w(*hi, *lo), // ccccCCCC mmmmMMMM 1511*c8dee2aaSAndroid Build Coastguard Worker yk = __lsx_vilvh_w(*hi, *lo); // yyyyYYYY kkkkKKKK 1512*c8dee2aaSAndroid Build Coastguard Worker 1513*c8dee2aaSAndroid Build Coastguard Worker // Unpack to 16-bit planar. 1514*c8dee2aaSAndroid Build Coastguard Worker __m128i c = __lsx_vilvl_b(zeros, cm), // c_c_c_c_ C_C_C_C_ 1515*c8dee2aaSAndroid Build Coastguard Worker m = __lsx_vilvh_b(zeros, cm), // m_m_m_m_ M_M_M_M_ 1516*c8dee2aaSAndroid Build Coastguard Worker y = __lsx_vilvl_b(zeros, yk), // y_y_y_y_ Y_Y_Y_Y_ 1517*c8dee2aaSAndroid Build Coastguard Worker k = __lsx_vilvh_b(zeros, yk); // k_k_k_k_ K_K_K_K_ 1518*c8dee2aaSAndroid Build Coastguard Worker 1519*c8dee2aaSAndroid Build Coastguard Worker // Scale to r, g, b. 
1520*c8dee2aaSAndroid Build Coastguard Worker __m128i r = scale(c, k), 1521*c8dee2aaSAndroid Build Coastguard Worker g = scale(m, k), 1522*c8dee2aaSAndroid Build Coastguard Worker b = scale(y, k); 1523*c8dee2aaSAndroid Build Coastguard Worker 1524*c8dee2aaSAndroid Build Coastguard Worker // Repack into interlaced pixels. 1525*c8dee2aaSAndroid Build Coastguard Worker // rgrgrgrg RGRGRGRG 1526*c8dee2aaSAndroid Build Coastguard Worker // b1b1b1b1 B1B1B1B1 1527*c8dee2aaSAndroid Build Coastguard Worker __m128i rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8)), 1528*c8dee2aaSAndroid Build Coastguard Worker ba = __lsx_vor_v(b, __lsx_vreplgr2vr_h(0xff00)); 1529*c8dee2aaSAndroid Build Coastguard Worker *lo = __lsx_vilvl_h(ba, rg); // rgbargba rgbargba 1530*c8dee2aaSAndroid Build Coastguard Worker *hi = __lsx_vilvl_h(ba, rg); // RGB1RGB1 RGB1RGB1 1531*c8dee2aaSAndroid Build Coastguard Worker }; 1532*c8dee2aaSAndroid Build Coastguard Worker 1533*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 1534*c8dee2aaSAndroid Build Coastguard Worker __m128i lo = __lsx_vld(src, 0), 1535*c8dee2aaSAndroid Build Coastguard Worker hi = __lsx_vld(src, 16); 1536*c8dee2aaSAndroid Build Coastguard Worker 1537*c8dee2aaSAndroid Build Coastguard Worker convert8(&lo, &hi); 1538*c8dee2aaSAndroid Build Coastguard Worker 1539*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(lo, dst, 0); 1540*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(hi, dst, 16); 1541*c8dee2aaSAndroid Build Coastguard Worker 1542*c8dee2aaSAndroid Build Coastguard Worker src += 8; 1543*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1544*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1545*c8dee2aaSAndroid Build Coastguard Worker } 1546*c8dee2aaSAndroid Build Coastguard Worker 1547*c8dee2aaSAndroid Build Coastguard Worker if (count >= 4) { 1548*c8dee2aaSAndroid Build Coastguard Worker __m128i lo = __lsx_vld(src, 0), 1549*c8dee2aaSAndroid Build Coastguard Worker hi = __lsx_vldi(0); 1550*c8dee2aaSAndroid Build 
Coastguard Worker 1551*c8dee2aaSAndroid Build Coastguard Worker convert8(&lo, &hi); 1552*c8dee2aaSAndroid Build Coastguard Worker 1553*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(lo, dst, 0); 1554*c8dee2aaSAndroid Build Coastguard Worker 1555*c8dee2aaSAndroid Build Coastguard Worker src += 4; 1556*c8dee2aaSAndroid Build Coastguard Worker dst += 4; 1557*c8dee2aaSAndroid Build Coastguard Worker count -= 4; 1558*c8dee2aaSAndroid Build Coastguard Worker } 1559*c8dee2aaSAndroid Build Coastguard Worker 1560*c8dee2aaSAndroid Build Coastguard Worker auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable; 1561*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 1562*c8dee2aaSAndroid Build Coastguard Worker} 1563*c8dee2aaSAndroid Build Coastguard Worker 1564*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) { 1565*c8dee2aaSAndroid Build Coastguard Worker inverted_cmyk_to(kRGB1, dst, src, count); 1566*c8dee2aaSAndroid Build Coastguard Worker} 1567*c8dee2aaSAndroid Build Coastguard Worker 1568*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) { 1569*c8dee2aaSAndroid Build Coastguard Worker inverted_cmyk_to(kBGR1, dst, src, count); 1570*c8dee2aaSAndroid Build Coastguard Worker} 1571*c8dee2aaSAndroid Build Coastguard Worker 1572*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 1573*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_RGBA_portable(dst, src, count); 1574*c8dee2aaSAndroid Build Coastguard Worker} 1575*c8dee2aaSAndroid Build Coastguard Worker 1576*c8dee2aaSAndroid Build Coastguard Worker/*not static*/ inline void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 1577*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_BGRA_portable(dst, src, count); 
1578*c8dee2aaSAndroid Build Coastguard Worker} 1579*c8dee2aaSAndroid Build Coastguard Worker 1580*c8dee2aaSAndroid Build Coastguard Worker#else 1581*c8dee2aaSAndroid Build Coastguard Worker// -- No Opts -------------------------------------------------------------------------------------- 1582*c8dee2aaSAndroid Build Coastguard Worker 1583*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 1584*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_RGBA_portable(dst, src, count); 1585*c8dee2aaSAndroid Build Coastguard Worker} 1586*c8dee2aaSAndroid Build Coastguard Worker 1587*c8dee2aaSAndroid Build Coastguard Workervoid rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 1588*c8dee2aaSAndroid Build Coastguard Worker rgbA_to_BGRA_portable(dst, src, count); 1589*c8dee2aaSAndroid Build Coastguard Worker} 1590*c8dee2aaSAndroid Build Coastguard Worker 1591*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 1592*c8dee2aaSAndroid Build Coastguard Worker RGBA_to_rgbA_portable(dst, src, count); 1593*c8dee2aaSAndroid Build Coastguard Worker} 1594*c8dee2aaSAndroid Build Coastguard Worker 1595*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 1596*c8dee2aaSAndroid Build Coastguard Worker RGBA_to_bgrA_portable(dst, src, count); 1597*c8dee2aaSAndroid Build Coastguard Worker} 1598*c8dee2aaSAndroid Build Coastguard Worker 1599*c8dee2aaSAndroid Build Coastguard Workervoid RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 1600*c8dee2aaSAndroid Build Coastguard Worker RGBA_to_BGRA_portable(dst, src, count); 1601*c8dee2aaSAndroid Build Coastguard Worker} 1602*c8dee2aaSAndroid Build Coastguard Worker 1603*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 1604*c8dee2aaSAndroid Build Coastguard Worker grayA_to_RGBA_portable(dst, src, count); 
1605*c8dee2aaSAndroid Build Coastguard Worker} 1606*c8dee2aaSAndroid Build Coastguard Worker 1607*c8dee2aaSAndroid Build Coastguard Workervoid grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 1608*c8dee2aaSAndroid Build Coastguard Worker grayA_to_rgbA_portable(dst, src, count); 1609*c8dee2aaSAndroid Build Coastguard Worker} 1610*c8dee2aaSAndroid Build Coastguard Worker 1611*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) { 1612*c8dee2aaSAndroid Build Coastguard Worker inverted_CMYK_to_RGB1_portable(dst, src, count); 1613*c8dee2aaSAndroid Build Coastguard Worker} 1614*c8dee2aaSAndroid Build Coastguard Worker 1615*c8dee2aaSAndroid Build Coastguard Workervoid inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) { 1616*c8dee2aaSAndroid Build Coastguard Worker inverted_CMYK_to_BGR1_portable(dst, src, count); 1617*c8dee2aaSAndroid Build Coastguard Worker} 1618*c8dee2aaSAndroid Build Coastguard Worker#endif 1619*c8dee2aaSAndroid Build Coastguard Worker 1620*c8dee2aaSAndroid Build Coastguard Worker// Basically as above, but we found no benefit from AVX-512 for gray_to_RGB1. 
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    // One output pixel per input byte: the gray value fills the three low bytes and the
    // top byte is set to 0xFF.
    for (int i = 0; i < count; i++) {
        const uint32_t gray = src[i];
        dst[i] = (uint32_t)0xFF << 24 | gray << 16 | gray << 8 | gray;
    }
}
#if defined(SK_ARM_HAS_NEON)
    // NEON: 16 pixels at a time, then at most one 8-pixel step, then the portable tail.
    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        for (; count >= 16; src += 16, dst += 16, count -= 16) {
            const uint8x16_t g = vld1q_u8(src);

            // Three copies of gray plus a constant 0xFF plane; vst4q_u8 writes them interleaved.
            uint8x16x4_t px;
            px.val[0] = g;
            px.val[1] = g;
            px.val[2] = g;
            px.val[3] = vdupq_n_u8(0xFF);

            vst4q_u8((uint8_t*) dst, px);
        }
        if (count >= 8) {
            const uint8x8_t g = vld1_u8(src);

            uint8x8x4_t px;
            px.val[0] = g;
            px.val[1] = g;
            px.val[2] = g;
            px.val[3] = vdup_n_u8(0xFF);

            vst4_u8((uint8_t*) dst, px);
            src   += 8;
            dst   += 8;
            count -= 8;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    // AVX2: 32 pixels at a time, then the portable tail.
    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m256i opaque = _mm256_set1_epi8((uint8_t) 0xFF);
        for (; count >= 32; src += 32, dst += 32, count -= 32) {
            const __m256i g = _mm256_loadu_si256((const __m256i*) src);

            // Byte interleave: gray/gray pairs and gray/0xFF pairs.
            const __m256i ggLo = _mm256_unpacklo_epi8(g, g);
            const __m256i ggHi = _mm256_unpackhi_epi8(g, g);
            const __m256i gaLo = _mm256_unpacklo_epi8(g, opaque);
            const __m256i gaHi = _mm256_unpackhi_epi8(g, opaque);

            // 16-bit interleave: every 32-bit element is now one finished pixel 'p'.
            const __m256i q0 = _mm256_unpacklo_epi16(ggLo, gaLo);
            const __m256i q1 = _mm256_unpackhi_epi16(ggLo, gaLo);
            const __m256i q2 = _mm256_unpacklo_epi16(ggHi, gaHi);
            const __m256i q3 = _mm256_unpackhi_epi16(ggHi, gaHi);

            // The unpacks work per 128-bit half, so the pixels come out as
            //     q0 = p0  .. p3  | p16 .. p19
            //     q1 = p4  .. p7  | p20 .. p23
            //     q2 = p8  .. p11 | p24 .. p27
            //     q3 = p12 .. p15 | p28 .. p31
            // Recombine the halves so each store writes eight consecutive pixels.
            _mm256_storeu_si256((__m256i*) (dst +  0), _mm256_permute2x128_si256(q0, q1, 0x20));
            _mm256_storeu_si256((__m256i*) (dst +  8), _mm256_permute2x128_si256(q2, q3, 0x20));
            _mm256_storeu_si256((__m256i*) (dst + 16), _mm256_permute2x128_si256(q0, q1, 0x31));
            _mm256_storeu_si256((__m256i*) (dst + 24), _mm256_permute2x128_si256(q2, q3, 0x31));
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3  // TODO: just check >= SSE2?
    // SSE: 16 pixels at a time, then the portable tail.
    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m128i opaque = _mm_set1_epi8((uint8_t) 0xFF);
        for (; count >= 16; src += 16, dst += 16, count -= 16) {
            const __m128i g = _mm_loadu_si128((const __m128i*) src);

            const __m128i ggLo = _mm_unpacklo_epi8(g, g);
            const __m128i ggHi = _mm_unpackhi_epi8(g, g);
            const __m128i gaLo = _mm_unpacklo_epi8(g, opaque);
            const __m128i gaHi = _mm_unpackhi_epi8(g, opaque);

            _mm_storeu_si128((__m128i*) (dst +  0), _mm_unpacklo_epi16(ggLo, gaLo));
            _mm_storeu_si128((__m128i*) (dst +  4), _mm_unpackhi_epi16(ggLo, gaLo));
            _mm_storeu_si128((__m128i*) (dst +  8), _mm_unpacklo_epi16(ggHi, gaHi));
            _mm_storeu_si128((__m128i*) (dst + 12), _mm_unpackhi_epi16(ggHi, gaHi));
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    // LASX: 32 pixels at a time, then the portable tail.
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m256i opaque = __lasx_xvreplgr2vr_b(0xFF);
        for (; count >= 32; src += 32, dst += 32, count -= 32) {
            const __m256i g = __lasx_xvld(src, 0);

            const __m256i ggLo = __lasx_xvilvl_b(g, g);
            const __m256i ggHi = __lasx_xvilvh_b(g, g);
            const __m256i gaLo = __lasx_xvilvl_b(opaque, g);
            const __m256i gaHi = __lasx_xvilvh_b(opaque, g);

            const __m256i q0 = __lasx_xvilvl_h(gaLo, ggLo);
            const __m256i q1 = __lasx_xvilvh_h(gaLo, ggLo);
            const __m256i q2 = __lasx_xvilvl_h(gaHi, ggHi);
            const __m256i q3 = __lasx_xvilvh_h(gaHi, ggHi);

            // Regroup 128-bit halves before storing; store offsets are in bytes.
            const __m256i out0 = __lasx_xvpermi_q(q0, q1, 0x02);
            const __m256i out1 = __lasx_xvpermi_q(q2, q3, 0x02);
            const __m256i out2 = __lasx_xvpermi_q(q0, q1, 0x13);
            const __m256i out3 = __lasx_xvpermi_q(q2, q3, 0x13);

            __lasx_xvst(out0, dst,  0);
            __lasx_xvst(out1, dst, 32);
            __lasx_xvst(out2, dst, 64);
            __lasx_xvst(out3, dst, 96);
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    // LSX: 16 pixels at a time, then the portable tail.
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m128i opaque = __lsx_vreplgr2vr_b(0xFF);
        for (; count >= 16; src += 16, dst += 16, count -= 16) {
            const __m128i g = __lsx_vld(src, 0);

            const __m128i ggLo = __lsx_vilvl_b(g, g);
            const __m128i ggHi = __lsx_vilvh_b(g, g);
            const __m128i gaLo = __lsx_vilvl_b(opaque, g);
            const __m128i gaHi = __lsx_vilvh_b(opaque, g);

            const __m128i q0 = __lsx_vilvl_h(gaLo, ggLo);
            const __m128i q1 = __lsx_vilvh_h(gaLo, ggLo);
            const __m128i q2 = __lsx_vilvl_h(gaHi, ggHi);
            const __m128i q3 = __lsx_vilvh_h(gaHi, ggHi);

            // Store offsets are in bytes.
            __lsx_vst(q0, dst,  0);
            __lsx_vst(q1, dst, 16);
            __lsx_vst(q2, dst, 32);
            __lsx_vst(q3, dst, 48);
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#else
    // No SIMD specialization: the portable routine does all the work.
    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        gray_to_RGB1_portable(dst, src, count);
    }
#endif

// Again as above, this time not even finding benefit from AVX2 for RGB_to_{RGB,BGR}1.
1805*c8dee2aaSAndroid Build Coastguard Workerstatic void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) { 1806*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < count; i++) { 1807*c8dee2aaSAndroid Build Coastguard Worker uint8_t r = src[0], 1808*c8dee2aaSAndroid Build Coastguard Worker g = src[1], 1809*c8dee2aaSAndroid Build Coastguard Worker b = src[2]; 1810*c8dee2aaSAndroid Build Coastguard Worker src += 3; 1811*c8dee2aaSAndroid Build Coastguard Worker dst[i] = (uint32_t)0xFF << 24 1812*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)b << 16 1813*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 8 1814*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)r << 0; 1815*c8dee2aaSAndroid Build Coastguard Worker } 1816*c8dee2aaSAndroid Build Coastguard Worker} 1817*c8dee2aaSAndroid Build Coastguard Workerstatic void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) { 1818*c8dee2aaSAndroid Build Coastguard Worker for (int i = 0; i < count; i++) { 1819*c8dee2aaSAndroid Build Coastguard Worker uint8_t r = src[0], 1820*c8dee2aaSAndroid Build Coastguard Worker g = src[1], 1821*c8dee2aaSAndroid Build Coastguard Worker b = src[2]; 1822*c8dee2aaSAndroid Build Coastguard Worker src += 3; 1823*c8dee2aaSAndroid Build Coastguard Worker dst[i] = (uint32_t)0xFF << 24 1824*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)r << 16 1825*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)g << 8 1826*c8dee2aaSAndroid Build Coastguard Worker | (uint32_t)b << 0; 1827*c8dee2aaSAndroid Build Coastguard Worker } 1828*c8dee2aaSAndroid Build Coastguard Worker} 1829*c8dee2aaSAndroid Build Coastguard Worker#if defined(SK_ARM_HAS_NEON) 1830*c8dee2aaSAndroid Build Coastguard Worker static void insert_alpha_should_swaprb(bool kSwapRB, 1831*c8dee2aaSAndroid Build Coastguard Worker uint32_t dst[], const uint8_t* src, int count) { 1832*c8dee2aaSAndroid Build Coastguard Worker while (count >= 16) { 1833*c8dee2aaSAndroid Build 
Coastguard Worker // Load 16 pixels. 1834*c8dee2aaSAndroid Build Coastguard Worker uint8x16x3_t rgb = vld3q_u8(src); 1835*c8dee2aaSAndroid Build Coastguard Worker 1836*c8dee2aaSAndroid Build Coastguard Worker // Insert an opaque alpha channel and swap if needed. 1837*c8dee2aaSAndroid Build Coastguard Worker uint8x16x4_t rgba; 1838*c8dee2aaSAndroid Build Coastguard Worker if (kSwapRB) { 1839*c8dee2aaSAndroid Build Coastguard Worker rgba.val[0] = rgb.val[2]; 1840*c8dee2aaSAndroid Build Coastguard Worker rgba.val[2] = rgb.val[0]; 1841*c8dee2aaSAndroid Build Coastguard Worker } else { 1842*c8dee2aaSAndroid Build Coastguard Worker rgba.val[0] = rgb.val[0]; 1843*c8dee2aaSAndroid Build Coastguard Worker rgba.val[2] = rgb.val[2]; 1844*c8dee2aaSAndroid Build Coastguard Worker } 1845*c8dee2aaSAndroid Build Coastguard Worker rgba.val[1] = rgb.val[1]; 1846*c8dee2aaSAndroid Build Coastguard Worker rgba.val[3] = vdupq_n_u8(0xFF); 1847*c8dee2aaSAndroid Build Coastguard Worker 1848*c8dee2aaSAndroid Build Coastguard Worker // Store 16 pixels. 1849*c8dee2aaSAndroid Build Coastguard Worker vst4q_u8((uint8_t*) dst, rgba); 1850*c8dee2aaSAndroid Build Coastguard Worker src += 16*3; 1851*c8dee2aaSAndroid Build Coastguard Worker dst += 16; 1852*c8dee2aaSAndroid Build Coastguard Worker count -= 16; 1853*c8dee2aaSAndroid Build Coastguard Worker } 1854*c8dee2aaSAndroid Build Coastguard Worker 1855*c8dee2aaSAndroid Build Coastguard Worker if (count >= 8) { 1856*c8dee2aaSAndroid Build Coastguard Worker // Load 8 pixels. 1857*c8dee2aaSAndroid Build Coastguard Worker uint8x8x3_t rgb = vld3_u8(src); 1858*c8dee2aaSAndroid Build Coastguard Worker 1859*c8dee2aaSAndroid Build Coastguard Worker // Insert an opaque alpha channel and swap if needed. 
1860*c8dee2aaSAndroid Build Coastguard Worker uint8x8x4_t rgba; 1861*c8dee2aaSAndroid Build Coastguard Worker if (kSwapRB) { 1862*c8dee2aaSAndroid Build Coastguard Worker rgba.val[0] = rgb.val[2]; 1863*c8dee2aaSAndroid Build Coastguard Worker rgba.val[2] = rgb.val[0]; 1864*c8dee2aaSAndroid Build Coastguard Worker } else { 1865*c8dee2aaSAndroid Build Coastguard Worker rgba.val[0] = rgb.val[0]; 1866*c8dee2aaSAndroid Build Coastguard Worker rgba.val[2] = rgb.val[2]; 1867*c8dee2aaSAndroid Build Coastguard Worker } 1868*c8dee2aaSAndroid Build Coastguard Worker rgba.val[1] = rgb.val[1]; 1869*c8dee2aaSAndroid Build Coastguard Worker rgba.val[3] = vdup_n_u8(0xFF); 1870*c8dee2aaSAndroid Build Coastguard Worker 1871*c8dee2aaSAndroid Build Coastguard Worker // Store 8 pixels. 1872*c8dee2aaSAndroid Build Coastguard Worker vst4_u8((uint8_t*) dst, rgba); 1873*c8dee2aaSAndroid Build Coastguard Worker src += 8*3; 1874*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1875*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1876*c8dee2aaSAndroid Build Coastguard Worker } 1877*c8dee2aaSAndroid Build Coastguard Worker 1878*c8dee2aaSAndroid Build Coastguard Worker // Call portable code to finish up the tail of [0,8) pixels. 1879*c8dee2aaSAndroid Build Coastguard Worker auto proc = kSwapRB ? 
RGB_to_BGR1_portable : RGB_to_RGB1_portable; 1880*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 1881*c8dee2aaSAndroid Build Coastguard Worker } 1882*c8dee2aaSAndroid Build Coastguard Worker 1883*c8dee2aaSAndroid Build Coastguard Worker void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) { 1884*c8dee2aaSAndroid Build Coastguard Worker insert_alpha_should_swaprb(false, dst, src, count); 1885*c8dee2aaSAndroid Build Coastguard Worker } 1886*c8dee2aaSAndroid Build Coastguard Worker void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) { 1887*c8dee2aaSAndroid Build Coastguard Worker insert_alpha_should_swaprb(true, dst, src, count); 1888*c8dee2aaSAndroid Build Coastguard Worker } 1889*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 1890*c8dee2aaSAndroid Build Coastguard Worker static void insert_alpha_should_swaprb(bool kSwapRB, 1891*c8dee2aaSAndroid Build Coastguard Worker uint32_t dst[], const uint8_t* src, int count) { 1892*c8dee2aaSAndroid Build Coastguard Worker const __m128i alphaMask = _mm_set1_epi32(0xFF000000); 1893*c8dee2aaSAndroid Build Coastguard Worker __m128i expand; 1894*c8dee2aaSAndroid Build Coastguard Worker const uint8_t X = 0xFF; // Used a placeholder. The value of X is irrelevant. 1895*c8dee2aaSAndroid Build Coastguard Worker if (kSwapRB) { 1896*c8dee2aaSAndroid Build Coastguard Worker expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X); 1897*c8dee2aaSAndroid Build Coastguard Worker } else { 1898*c8dee2aaSAndroid Build Coastguard Worker expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X); 1899*c8dee2aaSAndroid Build Coastguard Worker } 1900*c8dee2aaSAndroid Build Coastguard Worker 1901*c8dee2aaSAndroid Build Coastguard Worker while (count >= 6) { 1902*c8dee2aaSAndroid Build Coastguard Worker // Load a vector. 
While this actually contains 5 pixels plus an 1903*c8dee2aaSAndroid Build Coastguard Worker // extra component, we will discard all but the first four pixels on 1904*c8dee2aaSAndroid Build Coastguard Worker // this iteration. 1905*c8dee2aaSAndroid Build Coastguard Worker __m128i rgb = _mm_loadu_si128((const __m128i*) src); 1906*c8dee2aaSAndroid Build Coastguard Worker 1907*c8dee2aaSAndroid Build Coastguard Worker // Expand the first four pixels to RGBX and then mask to RGB(FF). 1908*c8dee2aaSAndroid Build Coastguard Worker __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask); 1909*c8dee2aaSAndroid Build Coastguard Worker 1910*c8dee2aaSAndroid Build Coastguard Worker // Store 4 pixels. 1911*c8dee2aaSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) dst, rgba); 1912*c8dee2aaSAndroid Build Coastguard Worker 1913*c8dee2aaSAndroid Build Coastguard Worker src += 4*3; 1914*c8dee2aaSAndroid Build Coastguard Worker dst += 4; 1915*c8dee2aaSAndroid Build Coastguard Worker count -= 4; 1916*c8dee2aaSAndroid Build Coastguard Worker } 1917*c8dee2aaSAndroid Build Coastguard Worker 1918*c8dee2aaSAndroid Build Coastguard Worker // Call portable code to finish up the tail of [0,4) pixels. 1919*c8dee2aaSAndroid Build Coastguard Worker auto proc = kSwapRB ? 
RGB_to_BGR1_portable : RGB_to_RGB1_portable; 1920*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 1921*c8dee2aaSAndroid Build Coastguard Worker } 1922*c8dee2aaSAndroid Build Coastguard Worker 1923*c8dee2aaSAndroid Build Coastguard Worker void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) { 1924*c8dee2aaSAndroid Build Coastguard Worker insert_alpha_should_swaprb(false, dst, src, count); 1925*c8dee2aaSAndroid Build Coastguard Worker } 1926*c8dee2aaSAndroid Build Coastguard Worker void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) { 1927*c8dee2aaSAndroid Build Coastguard Worker insert_alpha_should_swaprb(true, dst, src, count); 1928*c8dee2aaSAndroid Build Coastguard Worker } 1929*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX 1930*c8dee2aaSAndroid Build Coastguard Worker static void insert_alpha_should_swaprb(bool kSwapRB, 1931*c8dee2aaSAndroid Build Coastguard Worker uint32_t dst[], const uint8_t* src, int count) { 1932*c8dee2aaSAndroid Build Coastguard Worker const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xFF000000); 1933*c8dee2aaSAndroid Build Coastguard Worker 1934*c8dee2aaSAndroid Build Coastguard Worker __m256i expand = __lasx_xvldi(0); 1935*c8dee2aaSAndroid Build Coastguard Worker if (kSwapRB) { 1936*c8dee2aaSAndroid Build Coastguard Worker expand = __lasx_xvinsgr2vr_d(expand, 0x0503040502000102, 0); 1937*c8dee2aaSAndroid Build Coastguard Worker expand = __lasx_xvinsgr2vr_d(expand, 0x0b090a0b08060708, 1); 1938*c8dee2aaSAndroid Build Coastguard Worker expand = __lasx_xvinsgr2vr_d(expand, 0x110f10110e0c0d0e, 2); 1939*c8dee2aaSAndroid Build Coastguard Worker expand = __lasx_xvinsgr2vr_d(expand, 0x1715161714121314, 3); 1940*c8dee2aaSAndroid Build Coastguard Worker } else { 1941*c8dee2aaSAndroid Build Coastguard Worker expand = __lasx_xvinsgr2vr_d(expand, 0x0505040302020100, 0); 1942*c8dee2aaSAndroid Build Coastguard Worker expand = __lasx_xvinsgr2vr_d(expand, 0x0b0b0a0908080706, 1); 
1943*c8dee2aaSAndroid Build Coastguard Worker expand = __lasx_xvinsgr2vr_d(expand, 0x1111100f0e0e0d0c, 2); 1944*c8dee2aaSAndroid Build Coastguard Worker expand = __lasx_xvinsgr2vr_d(expand, 0x1717161514141312, 3); 1945*c8dee2aaSAndroid Build Coastguard Worker } 1946*c8dee2aaSAndroid Build Coastguard Worker 1947*c8dee2aaSAndroid Build Coastguard Worker while (count >= 8) { 1948*c8dee2aaSAndroid Build Coastguard Worker // Load a vector. While this actually contains 5 pixels plus an 1949*c8dee2aaSAndroid Build Coastguard Worker // extra component, we will discard all but the first four pixels on 1950*c8dee2aaSAndroid Build Coastguard Worker // this iteration. 1951*c8dee2aaSAndroid Build Coastguard Worker __m256i rgb = __lasx_xvld(src, 0); 1952*c8dee2aaSAndroid Build Coastguard Worker __m256i rgb_l = __lasx_xvpermi_d(rgb, 0x44); 1953*c8dee2aaSAndroid Build Coastguard Worker __m256i rgb_h = __lasx_xvpermi_d(rgb, 0xEE); 1954*c8dee2aaSAndroid Build Coastguard Worker 1955*c8dee2aaSAndroid Build Coastguard Worker // Expand the first four pixels to RGBX and then mask to RGB(FF). 1956*c8dee2aaSAndroid Build Coastguard Worker __m256i rgba = __lasx_xvor_v(__lasx_xvshuf_b(rgb_h, rgb_l, expand), alphaMask); 1957*c8dee2aaSAndroid Build Coastguard Worker 1958*c8dee2aaSAndroid Build Coastguard Worker // Store 8 pixels. 1959*c8dee2aaSAndroid Build Coastguard Worker __lasx_xvst(rgba, dst, 0); 1960*c8dee2aaSAndroid Build Coastguard Worker 1961*c8dee2aaSAndroid Build Coastguard Worker src += 4*6; 1962*c8dee2aaSAndroid Build Coastguard Worker dst += 8; 1963*c8dee2aaSAndroid Build Coastguard Worker count -= 8; 1964*c8dee2aaSAndroid Build Coastguard Worker } 1965*c8dee2aaSAndroid Build Coastguard Worker 1966*c8dee2aaSAndroid Build Coastguard Worker // Call portable code to finish up the tail of [0,4) pixels. 1967*c8dee2aaSAndroid Build Coastguard Worker auto proc = kSwapRB ? 
RGB_to_BGR1_portable : RGB_to_RGB1_portable; 1968*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 1969*c8dee2aaSAndroid Build Coastguard Worker } 1970*c8dee2aaSAndroid Build Coastguard Worker /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) { 1971*c8dee2aaSAndroid Build Coastguard Worker insert_alpha_should_swaprb(false, dst, src, count); 1972*c8dee2aaSAndroid Build Coastguard Worker } 1973*c8dee2aaSAndroid Build Coastguard Worker /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) { 1974*c8dee2aaSAndroid Build Coastguard Worker insert_alpha_should_swaprb(true, dst, src, count); 1975*c8dee2aaSAndroid Build Coastguard Worker } 1976*c8dee2aaSAndroid Build Coastguard Worker#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX 1977*c8dee2aaSAndroid Build Coastguard Worker static void insert_alpha_should_swaprb(bool kSwapRB, 1978*c8dee2aaSAndroid Build Coastguard Worker uint32_t dst[], const uint8_t* src, int count) { 1979*c8dee2aaSAndroid Build Coastguard Worker const __m128i alphaMask = __lsx_vreplgr2vr_w(0xFF000000); 1980*c8dee2aaSAndroid Build Coastguard Worker 1981*c8dee2aaSAndroid Build Coastguard Worker __m128i expand = __lsx_vldi(0); 1982*c8dee2aaSAndroid Build Coastguard Worker if (kSwapRB) { 1983*c8dee2aaSAndroid Build Coastguard Worker expand = __lsx_vinsgr2vr_d(expand, 0x0503040502000102, 0); 1984*c8dee2aaSAndroid Build Coastguard Worker expand = __lsx_vinsgr2vr_d(expand, 0x0b090a0b08060708, 1); 1985*c8dee2aaSAndroid Build Coastguard Worker } else { 1986*c8dee2aaSAndroid Build Coastguard Worker expand = __lsx_vinsgr2vr_d(expand, 0x0505040302020100, 0); 1987*c8dee2aaSAndroid Build Coastguard Worker expand = __lsx_vinsgr2vr_d(expand, 0x0b0b0a0908080706, 1); 1988*c8dee2aaSAndroid Build Coastguard Worker } 1989*c8dee2aaSAndroid Build Coastguard Worker 1990*c8dee2aaSAndroid Build Coastguard Worker while (count >= 6) { 1991*c8dee2aaSAndroid Build Coastguard Worker // Load a vector. 
While this actually contains 5 pixels plus an 1992*c8dee2aaSAndroid Build Coastguard Worker // extra component, we will discard all but the first four pixels on 1993*c8dee2aaSAndroid Build Coastguard Worker // this iteration. 1994*c8dee2aaSAndroid Build Coastguard Worker __m128i rgb = __lsx_vld(src, 0); 1995*c8dee2aaSAndroid Build Coastguard Worker 1996*c8dee2aaSAndroid Build Coastguard Worker // Expand the first four pixels to RGBX and then mask to RGB(FF). 1997*c8dee2aaSAndroid Build Coastguard Worker __m128i rgba = __lsx_vor_v(__lsx_vshuf_b(rgb, rgb, expand), alphaMask); 1998*c8dee2aaSAndroid Build Coastguard Worker 1999*c8dee2aaSAndroid Build Coastguard Worker // Store 4 pixels. 2000*c8dee2aaSAndroid Build Coastguard Worker __lsx_vst(rgba, dst, 0); 2001*c8dee2aaSAndroid Build Coastguard Worker 2002*c8dee2aaSAndroid Build Coastguard Worker src += 4*3; 2003*c8dee2aaSAndroid Build Coastguard Worker dst += 4; 2004*c8dee2aaSAndroid Build Coastguard Worker count -= 4; 2005*c8dee2aaSAndroid Build Coastguard Worker } 2006*c8dee2aaSAndroid Build Coastguard Worker 2007*c8dee2aaSAndroid Build Coastguard Worker // Call portable code to finish up the tail of [0,4) pixels. 2008*c8dee2aaSAndroid Build Coastguard Worker auto proc = kSwapRB ? 
RGB_to_BGR1_portable : RGB_to_RGB1_portable; 2009*c8dee2aaSAndroid Build Coastguard Worker proc(dst, src, count); 2010*c8dee2aaSAndroid Build Coastguard Worker } 2011*c8dee2aaSAndroid Build Coastguard Worker /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) { 2012*c8dee2aaSAndroid Build Coastguard Worker insert_alpha_should_swaprb(false, dst, src, count); 2013*c8dee2aaSAndroid Build Coastguard Worker } 2014*c8dee2aaSAndroid Build Coastguard Worker /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) { 2015*c8dee2aaSAndroid Build Coastguard Worker insert_alpha_should_swaprb(true, dst, src, count); 2016*c8dee2aaSAndroid Build Coastguard Worker } 2017*c8dee2aaSAndroid Build Coastguard Worker#else 2018*c8dee2aaSAndroid Build Coastguard Worker void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) { 2019*c8dee2aaSAndroid Build Coastguard Worker RGB_to_RGB1_portable(dst, src, count); 2020*c8dee2aaSAndroid Build Coastguard Worker } 2021*c8dee2aaSAndroid Build Coastguard Worker void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) { 2022*c8dee2aaSAndroid Build Coastguard Worker RGB_to_BGR1_portable(dst, src, count); 2023*c8dee2aaSAndroid Build Coastguard Worker } 2024*c8dee2aaSAndroid Build Coastguard Worker#endif 2025*c8dee2aaSAndroid Build Coastguard Worker 2026*c8dee2aaSAndroid Build Coastguard Worker} // namespace SK_OPTS_NS 2027*c8dee2aaSAndroid Build Coastguard Worker 2028*c8dee2aaSAndroid Build Coastguard Worker#undef SI 2029