1*b2055c35SXin Li // Copyright 2016 Google Inc. All Rights Reserved.
2*b2055c35SXin Li //
3*b2055c35SXin Li // Use of this source code is governed by a BSD-style license
4*b2055c35SXin Li // that can be found in the COPYING file in the root of the source
5*b2055c35SXin Li // tree. An additional intellectual property rights grant can be found
6*b2055c35SXin Li // in the file PATENTS. All contributing project authors may
7*b2055c35SXin Li // be found in the AUTHORS file in the root of the source tree.
8*b2055c35SXin Li // -----------------------------------------------------------------------------
9*b2055c35SXin Li //
10*b2055c35SXin Li // MSA version of rescaling functions
11*b2055c35SXin Li //
12*b2055c35SXin Li // Author: Prashant Patil ([email protected])
13*b2055c35SXin Li
14*b2055c35SXin Li #include "src/dsp/dsp.h"
15*b2055c35SXin Li
16*b2055c35SXin Li #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
17*b2055c35SXin Li
18*b2055c35SXin Li #include <assert.h>
19*b2055c35SXin Li
20*b2055c35SXin Li #include "src/utils/rescaler_utils.h"
21*b2055c35SXin Li #include "src/dsp/msa_macro.h"
22*b2055c35SXin Li
// Fixed-point helpers used by the scalar tail loops. Products are formed in
// 64 bits and brought back down by WEBP_RESCALER_RFIX bits.

// Half of WEBP_RESCALER_ONE: the bias added before a shift to round to nearest.
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
// x * y scaled back down without rounding (the discarded bits are dropped).
#define MULT_FIX_FLOOR(x, y) \
  (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
// x * y scaled back down, rounded to nearest thanks to the ROUNDER bias.
#define MULT_FIX(x, y) \
  ((((uint64_t)(x) * (y)) + ROUNDER) >> WEBP_RESCALER_RFIX)
26*b2055c35SXin Li
// Scales sixteen 32-bit lanes (in0..in3, four lanes each) by 'scale' and
// narrows the sixteen results into the byte vector 'dst'.
// Each input lane is interleaved with a word taken from 'zero', so the
// unsigned dot product against 'scale' yields lane * scale as a 64-bit value.
// That product is shifted right by 'shift' with rounding (SRAR). Three rounds
// of even-byte packing then keep only the lowest byte of every result: values
// above 255 wrap, they are not saturated.
// A zero vector named 'zero' must be in scope where the macro is expanded.
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {       \
  v4u32 wa_m, wb_m, wc_m, wd_m;      /* inputs widened with zero words */  \
  v2u64 pa_m, pb_m, pc_m, pd_m;      /* 64-bit scaled products */          \
  v16u8 q0_m, q1_m, q2_m, q3_m;      /* after the first packing round */   \
  v16u8 lo_m, hi_m;                  /* after the second packing round */  \
  /* in0 / in1 -> q0_m / q1_m */                                           \
  ILVRL_W2_UW(zero, in0, wa_m, wb_m);                                      \
  ILVRL_W2_UW(zero, in1, wc_m, wd_m);                                      \
  DOTP_UW2_UD(wa_m, wb_m, scale, scale, pa_m, pb_m);                       \
  DOTP_UW2_UD(wc_m, wd_m, scale, scale, pc_m, pd_m);                       \
  SRAR_D4_UD(pa_m, pb_m, pc_m, pd_m, shift);                               \
  PCKEV_B2_UB(pb_m, pa_m, pd_m, pc_m, q0_m, q1_m);                         \
  /* in2 / in3 -> q2_m / q3_m */                                           \
  ILVRL_W2_UW(zero, in2, wa_m, wb_m);                                      \
  ILVRL_W2_UW(zero, in3, wc_m, wd_m);                                      \
  DOTP_UW2_UD(wa_m, wb_m, scale, scale, pa_m, pb_m);                       \
  DOTP_UW2_UD(wc_m, wd_m, scale, scale, pc_m, pd_m);                       \
  SRAR_D4_UD(pa_m, pb_m, pc_m, pd_m, shift);                               \
  PCKEV_B2_UB(pb_m, pa_m, pd_m, pc_m, q2_m, q3_m);                         \
  /* Merge the four partial vectors down to one byte per result. */        \
  PCKEV_B2_UB(q1_m, q0_m, q3_m, q2_m, lo_m, hi_m);                         \
  dst = (v16u8)__msa_pckev_b((v16i8)hi_m, (v16i8)lo_m);                    \
} while (0)
46*b2055c35SXin Li
// Four-lane variant of CALC_MULT_FIX_16: scales the four 32-bit lanes of 'in0'
// by 'scale' (64-bit product, rounded right shift by 'shift') and returns the
// lowest byte of each result packed into the 32-bit scalar 'dst'.
// A zero vector named 'zero' must be in scope where the macro is expanded.
#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {                       \
  v4u32 wlo_m, whi_m;                /* input widened with zero words */   \
  v2u64 plo_m, phi_m;                /* 64-bit scaled products */          \
  v16i8 pk8_m, pk4_m, pk2_m;         /* successive even-byte packings */   \
  ILVRL_W2_UW(zero, in0, wlo_m, whi_m);                                    \
  DOTP_UW2_UD(wlo_m, whi_m, scale, scale, plo_m, phi_m);                   \
  SRAR_D2_UD(plo_m, phi_m, shift);                                         \
  pk8_m = __msa_pckev_b((v16i8)phi_m, (v16i8)plo_m);                       \
  pk4_m = __msa_pckev_b(pk8_m, pk8_m);                                     \
  pk2_m = __msa_pckev_b(pk4_m, pk4_m);                                     \
  /* The four result bytes now sit in word 0. */                           \
  dst = __msa_copy_s_w((v4i32)pk2_m, 0);                                   \
} while (0)
59*b2055c35SXin Li
// Scales sixteen 32-bit lanes (in0..in3) by 'fyscale' and keeps the results at
// 32-bit width: dst0..dst3 receive the low word of each rounded 64-bit
// product, in the same lane order as in0..in3.
// A zero vector named 'zero' must be in scope where the macro is expanded.
#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,              \
                          dst0, dst1, dst2, dst3) do {                     \
  v4u32 wa_m, wb_m, wc_m, wd_m;      /* inputs widened with zero words */  \
  v2u64 pa_m, pb_m, pc_m, pd_m;      /* 64-bit scaled products */          \
  /* in0 / in1 -> dst0 / dst1 */                                           \
  ILVRL_W2_UW(zero, in0, wa_m, wb_m);                                      \
  ILVRL_W2_UW(zero, in1, wc_m, wd_m);                                      \
  DOTP_UW2_UD(wa_m, wb_m, fyscale, fyscale, pa_m, pb_m);                   \
  DOTP_UW2_UD(wc_m, wd_m, fyscale, fyscale, pc_m, pd_m);                   \
  SRAR_D4_UD(pa_m, pb_m, pc_m, pd_m, shift);                               \
  PCKEV_W2_UW(pb_m, pa_m, pd_m, pc_m, dst0, dst1);                         \
  /* in2 / in3 -> dst2 / dst3 */                                           \
  ILVRL_W2_UW(zero, in2, wa_m, wb_m);                                      \
  ILVRL_W2_UW(zero, in3, wc_m, wd_m);                                      \
  DOTP_UW2_UD(wa_m, wb_m, fyscale, fyscale, pa_m, pb_m);                   \
  DOTP_UW2_UD(wc_m, wd_m, fyscale, fyscale, pc_m, pd_m);                   \
  SRAR_D4_UD(pa_m, pb_m, pc_m, pd_m, shift);                               \
  PCKEV_W2_UW(pb_m, pa_m, pd_m, pc_m, dst2, dst3);                         \
} while (0)
77*b2055c35SXin Li
// Four-lane variant of CALC_MULT_FIX1_16: 'dst' receives, as a v4u32, the low
// word of each rounded 64-bit product in0[i] * scale.
// A zero vector named 'zero' must be in scope where the macro is expanded.
#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {                      \
  v4u32 wlo_m, whi_m;                /* input widened with zero words */   \
  v2u64 plo_m, phi_m;                /* 64-bit scaled products */          \
  ILVRL_W2_UW(zero, in0, wlo_m, whi_m);                                    \
  DOTP_UW2_UD(wlo_m, whi_m, scale, scale, plo_m, phi_m);                   \
  SRAR_D2_UD(plo_m, phi_m, shift);                                         \
  dst = (v4u32)__msa_pckev_w((v4i32)phi_m, (v4i32)plo_m);                  \
} while (0)
86*b2055c35SXin Li
// Blends two sources and scales the blend, eight lanes at a time.
// Lanes of in0 are paired with the matching lanes of in2 (and in1 with in3).
// The dot product of each pair with the weight pairs held in 'mult' gives a
// 64-bit weighted sum, which is shifted right by 'shift' with rounding. The
// intermediate is then multiplied by 'scale' through a second dot product and
// shifted again.
// NOTE(review): the second dot product reads each 64-bit intermediate as two
// 32-bit lanes, so it equals intermediate * scale only while the intermediate
// fits in 32 bits -- confirm this always holds for the rescaler's ranges.
// dst0 (from in0/in2) and dst1 (from in1/in3) hold the results after ONE
// round of even-byte packing; the caller finishes the narrowing to bytes.
#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,          \
                          dst0, dst1) do {                                 \
  v4u32 pair0_m, pair1_m, pair2_m, pair3_m;   /* interleaved sources */    \
  v2u64 acc0_m, acc1_m, acc2_m, acc3_m;       /* 64-bit accumulators */    \
  ILVRL_W2_UW(in0, in2, pair0_m, pair1_m);                                 \
  ILVRL_W2_UW(in1, in3, pair2_m, pair3_m);                                 \
  /* Weighted sum of the two sources. */                                   \
  DOTP_UW2_UD(pair0_m, pair1_m, mult, mult, acc0_m, acc1_m);               \
  DOTP_UW2_UD(pair2_m, pair3_m, mult, mult, acc2_m, acc3_m);               \
  SRAR_D4_UD(acc0_m, acc1_m, acc2_m, acc3_m, shift);                       \
  /* Final scaling of the blended value. */                                \
  DOTP_UW2_UD(acc0_m, acc1_m, scale, scale, acc0_m, acc1_m);               \
  DOTP_UW2_UD(acc2_m, acc3_m, scale, scale, acc2_m, acc3_m);               \
  SRAR_D4_UD(acc0_m, acc1_m, acc2_m, acc3_m, shift);                       \
  PCKEV_B2_UB(acc1_m, acc0_m, acc3_m, acc2_m, dst0, dst1);                 \
} while (0)
101*b2055c35SXin Li
// Four-lane variant of CALC_MULT_FIX2_16 that also completes the narrowing:
// blends the lanes of 'in0' and 'in1' with the weight pairs in 'mult', applies
// a rounded shift, multiplies by 'scale', applies a second rounded shift, and
// returns the lowest byte of each of the four results packed into the 32-bit
// scalar 'dst'.
#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {           \
  v4u32 pair_lo_m, pair_hi_m;        /* interleaved sources */             \
  v2u64 acc_lo_m, acc_hi_m;          /* 64-bit accumulators */             \
  v16i8 pk8_m, pk4_m, pk2_m;         /* successive even-byte packings */   \
  ILVRL_W2_UW(in0, in1, pair_lo_m, pair_hi_m);                             \
  DOTP_UW2_UD(pair_lo_m, pair_hi_m, mult, mult, acc_lo_m, acc_hi_m);       \
  SRAR_D2_UD(acc_lo_m, acc_hi_m, shift);                                   \
  DOTP_UW2_UD(acc_lo_m, acc_hi_m, scale, scale, acc_lo_m, acc_hi_m);       \
  SRAR_D2_UD(acc_lo_m, acc_hi_m, shift);                                   \
  pk8_m = __msa_pckev_b((v16i8)acc_hi_m, (v16i8)acc_lo_m);                 \
  pk4_m = __msa_pckev_b(pk8_m, pk8_m);                                     \
  pk2_m = __msa_pckev_b(pk4_m, pk4_m);                                     \
  /* The four result bytes now sit in word 0. */                           \
  dst = __msa_copy_s_w((v4i32)pk2_m, 0);                                   \
} while (0)
116*b2055c35SXin Li
ExportRowExpand_0(const uint32_t * frow,uint8_t * dst,int length,WebPRescaler * const wrk)117*b2055c35SXin Li static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
118*b2055c35SXin Li int length,
119*b2055c35SXin Li WebPRescaler* const wrk) {
120*b2055c35SXin Li const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
121*b2055c35SXin Li const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
122*b2055c35SXin Li const v4i32 zero = { 0 };
123*b2055c35SXin Li
124*b2055c35SXin Li while (length >= 16) {
125*b2055c35SXin Li v4u32 src0, src1, src2, src3;
126*b2055c35SXin Li v16u8 out;
127*b2055c35SXin Li LD_UW4(frow, 4, src0, src1, src2, src3);
128*b2055c35SXin Li CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
129*b2055c35SXin Li ST_UB(out, dst);
130*b2055c35SXin Li length -= 16;
131*b2055c35SXin Li frow += 16;
132*b2055c35SXin Li dst += 16;
133*b2055c35SXin Li }
134*b2055c35SXin Li if (length > 0) {
135*b2055c35SXin Li int x_out;
136*b2055c35SXin Li if (length >= 12) {
137*b2055c35SXin Li uint32_t val0_m, val1_m, val2_m;
138*b2055c35SXin Li v4u32 src0, src1, src2;
139*b2055c35SXin Li LD_UW3(frow, 4, src0, src1, src2);
140*b2055c35SXin Li CALC_MULT_FIX_4(src0, scale, shift, val0_m);
141*b2055c35SXin Li CALC_MULT_FIX_4(src1, scale, shift, val1_m);
142*b2055c35SXin Li CALC_MULT_FIX_4(src2, scale, shift, val2_m);
143*b2055c35SXin Li SW3(val0_m, val1_m, val2_m, dst, 4);
144*b2055c35SXin Li length -= 12;
145*b2055c35SXin Li frow += 12;
146*b2055c35SXin Li dst += 12;
147*b2055c35SXin Li } else if (length >= 8) {
148*b2055c35SXin Li uint32_t val0_m, val1_m;
149*b2055c35SXin Li v4u32 src0, src1;
150*b2055c35SXin Li LD_UW2(frow, 4, src0, src1);
151*b2055c35SXin Li CALC_MULT_FIX_4(src0, scale, shift, val0_m);
152*b2055c35SXin Li CALC_MULT_FIX_4(src1, scale, shift, val1_m);
153*b2055c35SXin Li SW2(val0_m, val1_m, dst, 4);
154*b2055c35SXin Li length -= 8;
155*b2055c35SXin Li frow += 8;
156*b2055c35SXin Li dst += 8;
157*b2055c35SXin Li } else if (length >= 4) {
158*b2055c35SXin Li uint32_t val0_m;
159*b2055c35SXin Li const v4u32 src0 = LD_UW(frow);
160*b2055c35SXin Li CALC_MULT_FIX_4(src0, scale, shift, val0_m);
161*b2055c35SXin Li SW(val0_m, dst);
162*b2055c35SXin Li length -= 4;
163*b2055c35SXin Li frow += 4;
164*b2055c35SXin Li dst += 4;
165*b2055c35SXin Li }
166*b2055c35SXin Li for (x_out = 0; x_out < length; ++x_out) {
167*b2055c35SXin Li const uint32_t J = frow[x_out];
168*b2055c35SXin Li const int v = (int)MULT_FIX(J, wrk->fy_scale);
169*b2055c35SXin Li dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
170*b2055c35SXin Li }
171*b2055c35SXin Li }
172*b2055c35SXin Li }
173*b2055c35SXin Li
ExportRowExpand_1(const uint32_t * frow,uint32_t * irow,uint8_t * dst,int length,WebPRescaler * const wrk)174*b2055c35SXin Li static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
175*b2055c35SXin Li uint8_t* dst, int length,
176*b2055c35SXin Li WebPRescaler* const wrk) {
177*b2055c35SXin Li const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
178*b2055c35SXin Li const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
179*b2055c35SXin Li const v4i32 B1 = __msa_fill_w(B);
180*b2055c35SXin Li const v4i32 A1 = __msa_fill_w(A);
181*b2055c35SXin Li const v4i32 AB = __msa_ilvr_w(A1, B1);
182*b2055c35SXin Li const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
183*b2055c35SXin Li const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
184*b2055c35SXin Li
185*b2055c35SXin Li while (length >= 16) {
186*b2055c35SXin Li v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
187*b2055c35SXin Li v16u8 t0, t1, t2, t3, t4, t5;
188*b2055c35SXin Li LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
189*b2055c35SXin Li LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
190*b2055c35SXin Li CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
191*b2055c35SXin Li CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
192*b2055c35SXin Li PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
193*b2055c35SXin Li t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
194*b2055c35SXin Li ST_UB(t0, dst);
195*b2055c35SXin Li frow += 16;
196*b2055c35SXin Li irow += 16;
197*b2055c35SXin Li dst += 16;
198*b2055c35SXin Li length -= 16;
199*b2055c35SXin Li }
200*b2055c35SXin Li if (length > 0) {
201*b2055c35SXin Li int x_out;
202*b2055c35SXin Li if (length >= 12) {
203*b2055c35SXin Li uint32_t val0_m, val1_m, val2_m;
204*b2055c35SXin Li v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
205*b2055c35SXin Li LD_UW3(frow, 4, frow0, frow1, frow2);
206*b2055c35SXin Li LD_UW3(irow, 4, irow0, irow1, irow2);
207*b2055c35SXin Li CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
208*b2055c35SXin Li CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
209*b2055c35SXin Li CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
210*b2055c35SXin Li SW3(val0_m, val1_m, val2_m, dst, 4);
211*b2055c35SXin Li frow += 12;
212*b2055c35SXin Li irow += 12;
213*b2055c35SXin Li dst += 12;
214*b2055c35SXin Li length -= 12;
215*b2055c35SXin Li } else if (length >= 8) {
216*b2055c35SXin Li uint32_t val0_m, val1_m;
217*b2055c35SXin Li v4u32 frow0, frow1, irow0, irow1;
218*b2055c35SXin Li LD_UW2(frow, 4, frow0, frow1);
219*b2055c35SXin Li LD_UW2(irow, 4, irow0, irow1);
220*b2055c35SXin Li CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
221*b2055c35SXin Li CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
222*b2055c35SXin Li SW2(val0_m, val1_m, dst, 4);
223*b2055c35SXin Li frow += 4;
224*b2055c35SXin Li irow += 4;
225*b2055c35SXin Li dst += 4;
226*b2055c35SXin Li length -= 4;
227*b2055c35SXin Li } else if (length >= 4) {
228*b2055c35SXin Li uint32_t val0_m;
229*b2055c35SXin Li const v4u32 frow0 = LD_UW(frow + 0);
230*b2055c35SXin Li const v4u32 irow0 = LD_UW(irow + 0);
231*b2055c35SXin Li CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
232*b2055c35SXin Li SW(val0_m, dst);
233*b2055c35SXin Li frow += 4;
234*b2055c35SXin Li irow += 4;
235*b2055c35SXin Li dst += 4;
236*b2055c35SXin Li length -= 4;
237*b2055c35SXin Li }
238*b2055c35SXin Li for (x_out = 0; x_out < length; ++x_out) {
239*b2055c35SXin Li const uint64_t I = (uint64_t)A * frow[x_out]
240*b2055c35SXin Li + (uint64_t)B * irow[x_out];
241*b2055c35SXin Li const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
242*b2055c35SXin Li const int v = (int)MULT_FIX(J, wrk->fy_scale);
243*b2055c35SXin Li dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
244*b2055c35SXin Li }
245*b2055c35SXin Li }
246*b2055c35SXin Li }
247*b2055c35SXin Li
// MSA row exporter for the vertical-expand case. It produces
// dst_width * num_channels samples into wrk->dst, either by scaling wrk->frow
// alone (y_accum == 0) or by blending wrk->frow with wrk->irow.
// Despite the "_MIPSdspR2" suffix this is the MSA implementation; the name is
// kept because WebPRescalerDspInitMSA() refers to it.
static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const rescaler_t* const frow = wrk->frow;
  rescaler_t* const irow = wrk->irow;
  uint8_t* const dst = wrk->dst;

  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(wrk->y_expand);
  assert(wrk->y_sub != 0);

  if (wrk->y_accum != 0) {
    ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
  } else {
    ExportRowExpand_0(frow, dst, x_out_max, wrk);
  }
}
263*b2055c35SXin Li
264*b2055c35SXin Li #if 0 // disabled for now. TODO(skal): make match the C-code
265*b2055c35SXin Li static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
266*b2055c35SXin Li uint8_t* dst, int length,
267*b2055c35SXin Li const uint32_t yscale,
268*b2055c35SXin Li WebPRescaler* const wrk) {
269*b2055c35SXin Li const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
270*b2055c35SXin Li const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
271*b2055c35SXin Li const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
272*b2055c35SXin Li const v4i32 zero = { 0 };
273*b2055c35SXin Li
274*b2055c35SXin Li while (length >= 16) {
275*b2055c35SXin Li v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
276*b2055c35SXin Li v16u8 out;
277*b2055c35SXin Li LD_UW4(frow, 4, src0, src1, src2, src3);
278*b2055c35SXin Li CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
279*b2055c35SXin Li frac0, frac1, frac2, frac3);
280*b2055c35SXin Li LD_UW4(irow, 4, src0, src1, src2, src3);
281*b2055c35SXin Li SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
282*b2055c35SXin Li src0, src1, src2, src3);
283*b2055c35SXin Li CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
284*b2055c35SXin Li ST_UB(out, dst);
285*b2055c35SXin Li ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
286*b2055c35SXin Li frow += 16;
287*b2055c35SXin Li irow += 16;
288*b2055c35SXin Li dst += 16;
289*b2055c35SXin Li length -= 16;
290*b2055c35SXin Li }
291*b2055c35SXin Li if (length > 0) {
292*b2055c35SXin Li int x_out;
293*b2055c35SXin Li if (length >= 12) {
294*b2055c35SXin Li uint32_t val0_m, val1_m, val2_m;
295*b2055c35SXin Li v4u32 src0, src1, src2, frac0, frac1, frac2;
296*b2055c35SXin Li LD_UW3(frow, 4, src0, src1, src2);
297*b2055c35SXin Li CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
298*b2055c35SXin Li CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
299*b2055c35SXin Li CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
300*b2055c35SXin Li LD_UW3(irow, 4, src0, src1, src2);
301*b2055c35SXin Li SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
302*b2055c35SXin Li CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
303*b2055c35SXin Li CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
304*b2055c35SXin Li CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
305*b2055c35SXin Li SW3(val0_m, val1_m, val2_m, dst, 4);
306*b2055c35SXin Li ST_UW3(frac0, frac1, frac2, irow, 4);
307*b2055c35SXin Li frow += 12;
308*b2055c35SXin Li irow += 12;
309*b2055c35SXin Li dst += 12;
310*b2055c35SXin Li length -= 12;
311*b2055c35SXin Li } else if (length >= 8) {
312*b2055c35SXin Li uint32_t val0_m, val1_m;
313*b2055c35SXin Li v4u32 src0, src1, frac0, frac1;
314*b2055c35SXin Li LD_UW2(frow, 4, src0, src1);
315*b2055c35SXin Li CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
316*b2055c35SXin Li CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
317*b2055c35SXin Li LD_UW2(irow, 4, src0, src1);
318*b2055c35SXin Li SUB2(src0, frac0, src1, frac1, src0, src1);
319*b2055c35SXin Li CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
320*b2055c35SXin Li CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
321*b2055c35SXin Li SW2(val0_m, val1_m, dst, 4);
322*b2055c35SXin Li ST_UW2(frac0, frac1, irow, 4);
323*b2055c35SXin Li frow += 8;
324*b2055c35SXin Li irow += 8;
325*b2055c35SXin Li dst += 8;
326*b2055c35SXin Li length -= 8;
327*b2055c35SXin Li } else if (length >= 4) {
328*b2055c35SXin Li uint32_t val0_m;
329*b2055c35SXin Li v4u32 frac0;
330*b2055c35SXin Li v4u32 src0 = LD_UW(frow);
331*b2055c35SXin Li CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
332*b2055c35SXin Li src0 = LD_UW(irow);
333*b2055c35SXin Li src0 = src0 - frac0;
334*b2055c35SXin Li CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
335*b2055c35SXin Li SW(val0_m, dst);
336*b2055c35SXin Li ST_UW(frac0, irow);
337*b2055c35SXin Li frow += 4;
338*b2055c35SXin Li irow += 4;
339*b2055c35SXin Li dst += 4;
340*b2055c35SXin Li length -= 4;
341*b2055c35SXin Li }
342*b2055c35SXin Li for (x_out = 0; x_out < length; ++x_out) {
343*b2055c35SXin Li const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
344*b2055c35SXin Li const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
345*b2055c35SXin Li dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
346*b2055c35SXin Li irow[x_out] = frac;
347*b2055c35SXin Li }
348*b2055c35SXin Li }
349*b2055c35SXin Li }
350*b2055c35SXin Li
351*b2055c35SXin Li static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
352*b2055c35SXin Li int length,
353*b2055c35SXin Li WebPRescaler* const wrk) {
354*b2055c35SXin Li const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
355*b2055c35SXin Li const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
356*b2055c35SXin Li const v4i32 zero = { 0 };
357*b2055c35SXin Li
358*b2055c35SXin Li while (length >= 16) {
359*b2055c35SXin Li v4u32 src0, src1, src2, src3;
360*b2055c35SXin Li v16u8 dst0;
361*b2055c35SXin Li LD_UW4(irow, 4, src0, src1, src2, src3);
362*b2055c35SXin Li CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
363*b2055c35SXin Li ST_UB(dst0, dst);
364*b2055c35SXin Li ST_SW4(zero, zero, zero, zero, irow, 4);
365*b2055c35SXin Li length -= 16;
366*b2055c35SXin Li irow += 16;
367*b2055c35SXin Li dst += 16;
368*b2055c35SXin Li }
369*b2055c35SXin Li if (length > 0) {
370*b2055c35SXin Li int x_out;
371*b2055c35SXin Li if (length >= 12) {
372*b2055c35SXin Li uint32_t val0_m, val1_m, val2_m;
373*b2055c35SXin Li v4u32 src0, src1, src2;
374*b2055c35SXin Li LD_UW3(irow, 4, src0, src1, src2);
375*b2055c35SXin Li CALC_MULT_FIX_4(src0, scale, shift, val0_m);
376*b2055c35SXin Li CALC_MULT_FIX_4(src1, scale, shift, val1_m);
377*b2055c35SXin Li CALC_MULT_FIX_4(src2, scale, shift, val2_m);
378*b2055c35SXin Li SW3(val0_m, val1_m, val2_m, dst, 4);
379*b2055c35SXin Li ST_SW3(zero, zero, zero, irow, 4);
380*b2055c35SXin Li length -= 12;
381*b2055c35SXin Li irow += 12;
382*b2055c35SXin Li dst += 12;
383*b2055c35SXin Li } else if (length >= 8) {
384*b2055c35SXin Li uint32_t val0_m, val1_m;
385*b2055c35SXin Li v4u32 src0, src1;
386*b2055c35SXin Li LD_UW2(irow, 4, src0, src1);
387*b2055c35SXin Li CALC_MULT_FIX_4(src0, scale, shift, val0_m);
388*b2055c35SXin Li CALC_MULT_FIX_4(src1, scale, shift, val1_m);
389*b2055c35SXin Li SW2(val0_m, val1_m, dst, 4);
390*b2055c35SXin Li ST_SW2(zero, zero, irow, 4);
391*b2055c35SXin Li length -= 8;
392*b2055c35SXin Li irow += 8;
393*b2055c35SXin Li dst += 8;
394*b2055c35SXin Li } else if (length >= 4) {
395*b2055c35SXin Li uint32_t val0_m;
396*b2055c35SXin Li const v4u32 src0 = LD_UW(irow + 0);
397*b2055c35SXin Li CALC_MULT_FIX_4(src0, scale, shift, val0_m);
398*b2055c35SXin Li SW(val0_m, dst);
399*b2055c35SXin Li ST_SW(zero, irow);
400*b2055c35SXin Li length -= 4;
401*b2055c35SXin Li irow += 4;
402*b2055c35SXin Li dst += 4;
403*b2055c35SXin Li }
404*b2055c35SXin Li for (x_out = 0; x_out < length; ++x_out) {
405*b2055c35SXin Li const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
406*b2055c35SXin Li dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
407*b2055c35SXin Li irow[x_out] = 0;
408*b2055c35SXin Li }
409*b2055c35SXin Li }
410*b2055c35SXin Li }
411*b2055c35SXin Li
412*b2055c35SXin Li static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
413*b2055c35SXin Li uint8_t* dst = wrk->dst;
414*b2055c35SXin Li rescaler_t* irow = wrk->irow;
415*b2055c35SXin Li const int x_out_max = wrk->dst_width * wrk->num_channels;
416*b2055c35SXin Li const rescaler_t* frow = wrk->frow;
417*b2055c35SXin Li const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
418*b2055c35SXin Li assert(!WebPRescalerOutputDone(wrk));
419*b2055c35SXin Li assert(wrk->y_accum <= 0);
420*b2055c35SXin Li assert(!wrk->y_expand);
421*b2055c35SXin Li if (yscale) {
422*b2055c35SXin Li ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
423*b2055c35SXin Li } else {
424*b2055c35SXin Li ExportRowShrink_1(irow, dst, x_out_max, wrk);
425*b2055c35SXin Li }
426*b2055c35SXin Li }
427*b2055c35SXin Li #endif // 0
428*b2055c35SXin Li
429*b2055c35SXin Li //------------------------------------------------------------------------------
430*b2055c35SXin Li // Entry point
431*b2055c35SXin Li
// Prototype for the initializer defined just below.
extern void WebPRescalerDspInitMSA(void);

// Installs the MSA implementations into the rescaler's function pointers.
// Only the expand exporter is hooked up: the shrink exporter is compiled out
// above (#if 0), so its assignment is left commented out.
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
  WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
//  WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
}
438*b2055c35SXin Li
439*b2055c35SXin Li #else // !WEBP_USE_MSA
440*b2055c35SXin Li
// Fallback when MSA is unavailable or WEBP_REDUCE_SIZE is defined.
// NOTE(review): WEBP_DSP_INIT_STUB presumably defines an empty
// WebPRescalerDspInitMSA() so callers still link -- confirm against the
// macro's definition.
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
442*b2055c35SXin Li
443*b2055c35SXin Li #endif // WEBP_USE_MSA
444