// xref: /aosp_15_r20/external/webp/src/dsp/rescaler_msa.c (revision b2055c353e87c8814eb2b6b1b11112a1562253bd)
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA version of rescaling functions
//
// Author: Prashant Patil ([email protected])
13*b2055c35SXin Li 
14*b2055c35SXin Li #include "src/dsp/dsp.h"
15*b2055c35SXin Li 
16*b2055c35SXin Li #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
17*b2055c35SXin Li 
18*b2055c35SXin Li #include <assert.h>
19*b2055c35SXin Li 
20*b2055c35SXin Li #include "src/utils/rescaler_utils.h"
21*b2055c35SXin Li #include "src/dsp/msa_macro.h"
22*b2055c35SXin Li 
// Scalar fixed-point helpers used by the tail loops below.
// WEBP_RESCALER_ONE and WEBP_RESCALER_RFIX are expected to come from the
// included rescaler headers. ROUNDER is half of WEBP_RESCALER_ONE and is
// added before the shift in MULT_FIX; MULT_FIX_FLOOR shifts without it.
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
#define MULT_FIX(x, y) \
  (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
26*b2055c35SXin Li 
// Processes 16 32-bit inputs held in the four vectors in0..in3: each input is
// paired with a lane of 'zero', dot-multiplied with 'scale' into 64-bit
// lanes, shifted with SRAR_D by 'shift', and the results are narrowed with
// successive even-byte packs into the 16 bytes of 'dst'.
// Requires a vector variable named 'zero' to be visible where the macro is
// expanded.
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
  v4u32 wide0, wide1, wide2, wide3;                                   \
  v2u64 prod0, prod1, prod2, prod3;                                   \
  v16u8 pack0, pack1, pack2, pack3, pack4, pack5;                     \
  /* Inputs from in0 and in1. */                                      \
  ILVRL_W2_UW(zero, in0, wide0, wide1);                               \
  ILVRL_W2_UW(zero, in1, wide2, wide3);                               \
  DOTP_UW2_UD(wide0, wide1, scale, scale, prod0, prod1);              \
  DOTP_UW2_UD(wide2, wide3, scale, scale, prod2, prod3);              \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);                      \
  PCKEV_B2_UB(prod1, prod0, prod3, prod2, pack0, pack1);              \
  /* Inputs from in2 and in3. */                                      \
  ILVRL_W2_UW(zero, in2, wide0, wide1);                               \
  ILVRL_W2_UW(zero, in3, wide2, wide3);                               \
  DOTP_UW2_UD(wide0, wide1, scale, scale, prod0, prod1);              \
  DOTP_UW2_UD(wide2, wide3, scale, scale, prod2, prod3);              \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);                      \
  PCKEV_B2_UB(prod1, prod0, prod3, prod2, pack2, pack3);              \
  /* Two further packing stages gather the result bytes into dst. */  \
  PCKEV_B2_UB(pack1, pack0, pack3, pack2, pack4, pack5);              \
  dst = (v16u8)__msa_pckev_b((v16i8)pack5, (v16i8)pack4);             \
} while (0)
46*b2055c35SXin Li 
// Four-input variant of CALC_MULT_FIX_16: multiplies the four 32-bit lanes of
// 'in0' by 'scale', applies the SRAR_D shift by 'shift', packs the results
// down to bytes and copies word 0 of the packed vector into the scalar 'dst'.
// Requires a vector variable named 'zero' to be visible where the macro is
// expanded.
#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {       \
  v4u32 wide0, wide1;                                      \
  v2u64 prod0, prod1;                                      \
  v16i8 pack0, pack1;                                      \
  ILVRL_W2_UW(zero, in0, wide0, wide1);                    \
  DOTP_UW2_UD(wide0, wide1, scale, scale, prod0, prod1);   \
  SRAR_D2_UD(prod0, prod1, shift);                         \
  /* Three even-byte packs: 64-bit lanes down to bytes. */ \
  pack0 = __msa_pckev_b((v16i8)prod1, (v16i8)prod0);       \
  pack1 = __msa_pckev_b(pack0, pack0);                     \
  pack0 = __msa_pckev_b(pack1, pack1);                     \
  dst = __msa_copy_s_w((v4i32)pack0, 0);                   \
} while (0)
59*b2055c35SXin Li 
// Multiplies the 16 32-bit inputs in in0..in3 by 'fyscale', applies the
// SRAR_D shift by 'shift' and keeps the results as 32-bit lanes (even-word
// pack) in dst0..dst3, one output vector per input vector.
// Requires a vector variable named 'zero' to be visible where the macro is
// expanded.
#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
                          dst0, dst1, dst2, dst3) do {         \
  v4u32 wide0, wide1, wide2, wide3;                            \
  v2u64 prod0, prod1, prod2, prod3;                            \
  /* in0 -> dst0, in1 -> dst1 */                               \
  ILVRL_W2_UW(zero, in0, wide0, wide1);                        \
  ILVRL_W2_UW(zero, in1, wide2, wide3);                        \
  DOTP_UW2_UD(wide0, wide1, fyscale, fyscale, prod0, prod1);   \
  DOTP_UW2_UD(wide2, wide3, fyscale, fyscale, prod2, prod3);   \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);               \
  PCKEV_W2_UW(prod1, prod0, prod3, prod2, dst0, dst1);         \
  /* in2 -> dst2, in3 -> dst3 */                               \
  ILVRL_W2_UW(zero, in2, wide0, wide1);                        \
  ILVRL_W2_UW(zero, in3, wide2, wide3);                        \
  DOTP_UW2_UD(wide0, wide1, fyscale, fyscale, prod0, prod1);   \
  DOTP_UW2_UD(wide2, wide3, fyscale, fyscale, prod2, prod3);   \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);               \
  PCKEV_W2_UW(prod1, prod0, prod3, prod2, dst2, dst3);         \
} while (0)
77*b2055c35SXin Li 
// Four-input variant of CALC_MULT_FIX1_16: multiplies the lanes of 'in0' by
// 'scale', applies the SRAR_D shift by 'shift' and stores the four results as
// 32-bit lanes in 'dst'.
// Requires a vector variable named 'zero' to be visible where the macro is
// expanded.
#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {        \
  v4u32 wide0, wide1;                                        \
  v2u64 prod0, prod1;                                        \
  ILVRL_W2_UW(zero, in0, wide0, wide1);                      \
  DOTP_UW2_UD(wide0, wide1, scale, scale, prod0, prod1);     \
  SRAR_D2_UD(prod0, prod1, shift);                           \
  dst = (v4u32)__msa_pckev_w((v4i32)prod1, (v4i32)prod0);    \
} while (0)
86*b2055c35SXin Li 
// Two-stage multiply for eight input pairs. in0 is interleaved with in2 and
// in1 with in3; each interleaved pair is dot-multiplied with 'mult' and
// shifted (SRAR_D by 'shift'), then the intermediate is dot-multiplied with
// 'scale' and shifted again. The results are packed (even bytes) into dst0
// and dst1; the caller performs the remaining narrowing.
#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
                          dst0, dst1) do {                         \
  v4u32 wide0, wide1, wide2, wide3;                                \
  v2u64 prod0, prod1, prod2, prod3;                                \
  ILVRL_W2_UW(in0, in2, wide0, wide1);                             \
  ILVRL_W2_UW(in1, in3, wide2, wide3);                             \
  /* Stage 1: weighted combination of the paired inputs. */        \
  DOTP_UW2_UD(wide0, wide1, mult, mult, prod0, prod1);             \
  DOTP_UW2_UD(wide2, wide3, mult, mult, prod2, prod3);             \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);                   \
  /* Stage 2: multiply the intermediate by 'scale'. */             \
  DOTP_UW2_UD(prod0, prod1, scale, scale, prod0, prod1);           \
  DOTP_UW2_UD(prod2, prod3, scale, scale, prod2, prod3);           \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);                   \
  PCKEV_B2_UB(prod1, prod0, prod3, prod2, dst0, dst1);             \
} while (0)
101*b2055c35SXin Li 
// Four-pair variant of CALC_MULT_FIX2_16: interleaves in0 with in1,
// dot-multiplies with 'mult', shifts (SRAR_D by 'shift'), multiplies the
// intermediate by 'scale', shifts again, then packs the four results down to
// bytes and copies word 0 of the packed vector into the scalar 'dst'.
#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
  v4u32 wide0, wide1;                                             \
  v2u64 prod0, prod1;                                             \
  v16i8 pack0, pack1;                                             \
  ILVRL_W2_UW(in0, in1, wide0, wide1);                            \
  /* Stage 1: weighted combination of the paired inputs. */       \
  DOTP_UW2_UD(wide0, wide1, mult, mult, prod0, prod1);            \
  SRAR_D2_UD(prod0, prod1, shift);                                \
  /* Stage 2: multiply the intermediate by 'scale'. */            \
  DOTP_UW2_UD(prod0, prod1, scale, scale, prod0, prod1);          \
  SRAR_D2_UD(prod0, prod1, shift);                                \
  pack0 = __msa_pckev_b((v16i8)prod1, (v16i8)prod0);              \
  pack1 = __msa_pckev_b(pack0, pack0);                            \
  pack0 = __msa_pckev_b(pack1, pack1);                            \
  dst = __msa_copy_s_w((v4i32)pack0, 0);                          \
} while (0)
116*b2055c35SXin Li 
ExportRowExpand_0(const uint32_t * frow,uint8_t * dst,int length,WebPRescaler * const wrk)117*b2055c35SXin Li static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
118*b2055c35SXin Li                                           int length,
119*b2055c35SXin Li                                           WebPRescaler* const wrk) {
120*b2055c35SXin Li   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
121*b2055c35SXin Li   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
122*b2055c35SXin Li   const v4i32 zero = { 0 };
123*b2055c35SXin Li 
124*b2055c35SXin Li   while (length >= 16) {
125*b2055c35SXin Li     v4u32 src0, src1, src2, src3;
126*b2055c35SXin Li     v16u8 out;
127*b2055c35SXin Li     LD_UW4(frow, 4, src0, src1, src2, src3);
128*b2055c35SXin Li     CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
129*b2055c35SXin Li     ST_UB(out, dst);
130*b2055c35SXin Li     length -= 16;
131*b2055c35SXin Li     frow   += 16;
132*b2055c35SXin Li     dst    += 16;
133*b2055c35SXin Li   }
134*b2055c35SXin Li   if (length > 0) {
135*b2055c35SXin Li     int x_out;
136*b2055c35SXin Li     if (length >= 12) {
137*b2055c35SXin Li       uint32_t val0_m, val1_m, val2_m;
138*b2055c35SXin Li       v4u32 src0, src1, src2;
139*b2055c35SXin Li       LD_UW3(frow, 4, src0, src1, src2);
140*b2055c35SXin Li       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
141*b2055c35SXin Li       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
142*b2055c35SXin Li       CALC_MULT_FIX_4(src2, scale, shift, val2_m);
143*b2055c35SXin Li       SW3(val0_m, val1_m, val2_m, dst, 4);
144*b2055c35SXin Li       length -= 12;
145*b2055c35SXin Li       frow   += 12;
146*b2055c35SXin Li       dst    += 12;
147*b2055c35SXin Li     } else if (length >= 8) {
148*b2055c35SXin Li       uint32_t val0_m, val1_m;
149*b2055c35SXin Li       v4u32 src0, src1;
150*b2055c35SXin Li       LD_UW2(frow, 4, src0, src1);
151*b2055c35SXin Li       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
152*b2055c35SXin Li       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
153*b2055c35SXin Li       SW2(val0_m, val1_m, dst, 4);
154*b2055c35SXin Li       length -= 8;
155*b2055c35SXin Li       frow   += 8;
156*b2055c35SXin Li       dst    += 8;
157*b2055c35SXin Li     } else if (length >= 4) {
158*b2055c35SXin Li       uint32_t val0_m;
159*b2055c35SXin Li       const v4u32 src0 = LD_UW(frow);
160*b2055c35SXin Li       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
161*b2055c35SXin Li       SW(val0_m, dst);
162*b2055c35SXin Li       length -= 4;
163*b2055c35SXin Li       frow   += 4;
164*b2055c35SXin Li       dst    += 4;
165*b2055c35SXin Li     }
166*b2055c35SXin Li     for (x_out = 0; x_out < length; ++x_out) {
167*b2055c35SXin Li       const uint32_t J = frow[x_out];
168*b2055c35SXin Li       const int v = (int)MULT_FIX(J, wrk->fy_scale);
169*b2055c35SXin Li       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
170*b2055c35SXin Li     }
171*b2055c35SXin Li   }
172*b2055c35SXin Li }
173*b2055c35SXin Li 
ExportRowExpand_1(const uint32_t * frow,uint32_t * irow,uint8_t * dst,int length,WebPRescaler * const wrk)174*b2055c35SXin Li static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
175*b2055c35SXin Li                                           uint8_t* dst, int length,
176*b2055c35SXin Li                                           WebPRescaler* const wrk) {
177*b2055c35SXin Li   const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
178*b2055c35SXin Li   const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
179*b2055c35SXin Li   const v4i32 B1 = __msa_fill_w(B);
180*b2055c35SXin Li   const v4i32 A1 = __msa_fill_w(A);
181*b2055c35SXin Li   const v4i32 AB = __msa_ilvr_w(A1, B1);
182*b2055c35SXin Li   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
183*b2055c35SXin Li   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
184*b2055c35SXin Li 
185*b2055c35SXin Li   while (length >= 16) {
186*b2055c35SXin Li     v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
187*b2055c35SXin Li     v16u8 t0, t1, t2, t3, t4, t5;
188*b2055c35SXin Li     LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
189*b2055c35SXin Li     LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
190*b2055c35SXin Li     CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
191*b2055c35SXin Li     CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
192*b2055c35SXin Li     PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
193*b2055c35SXin Li     t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
194*b2055c35SXin Li     ST_UB(t0, dst);
195*b2055c35SXin Li     frow   += 16;
196*b2055c35SXin Li     irow   += 16;
197*b2055c35SXin Li     dst    += 16;
198*b2055c35SXin Li     length -= 16;
199*b2055c35SXin Li   }
200*b2055c35SXin Li   if (length > 0) {
201*b2055c35SXin Li     int x_out;
202*b2055c35SXin Li     if (length >= 12) {
203*b2055c35SXin Li       uint32_t val0_m, val1_m, val2_m;
204*b2055c35SXin Li       v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
205*b2055c35SXin Li       LD_UW3(frow, 4, frow0, frow1, frow2);
206*b2055c35SXin Li       LD_UW3(irow, 4, irow0, irow1, irow2);
207*b2055c35SXin Li       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
208*b2055c35SXin Li       CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
209*b2055c35SXin Li       CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
210*b2055c35SXin Li       SW3(val0_m, val1_m, val2_m, dst, 4);
211*b2055c35SXin Li       frow   += 12;
212*b2055c35SXin Li       irow   += 12;
213*b2055c35SXin Li       dst    += 12;
214*b2055c35SXin Li       length -= 12;
215*b2055c35SXin Li     } else if (length >= 8) {
216*b2055c35SXin Li       uint32_t val0_m, val1_m;
217*b2055c35SXin Li       v4u32 frow0, frow1, irow0, irow1;
218*b2055c35SXin Li       LD_UW2(frow, 4, frow0, frow1);
219*b2055c35SXin Li       LD_UW2(irow, 4, irow0, irow1);
220*b2055c35SXin Li       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
221*b2055c35SXin Li       CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
222*b2055c35SXin Li       SW2(val0_m, val1_m, dst, 4);
223*b2055c35SXin Li       frow   += 4;
224*b2055c35SXin Li       irow   += 4;
225*b2055c35SXin Li       dst    += 4;
226*b2055c35SXin Li       length -= 4;
227*b2055c35SXin Li     } else if (length >= 4) {
228*b2055c35SXin Li       uint32_t val0_m;
229*b2055c35SXin Li       const v4u32 frow0 = LD_UW(frow + 0);
230*b2055c35SXin Li       const v4u32 irow0 = LD_UW(irow + 0);
231*b2055c35SXin Li       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
232*b2055c35SXin Li       SW(val0_m, dst);
233*b2055c35SXin Li       frow   += 4;
234*b2055c35SXin Li       irow   += 4;
235*b2055c35SXin Li       dst    += 4;
236*b2055c35SXin Li       length -= 4;
237*b2055c35SXin Li     }
238*b2055c35SXin Li     for (x_out = 0; x_out < length; ++x_out) {
239*b2055c35SXin Li       const uint64_t I = (uint64_t)A * frow[x_out]
240*b2055c35SXin Li                        + (uint64_t)B * irow[x_out];
241*b2055c35SXin Li       const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
242*b2055c35SXin Li       const int v = (int)MULT_FIX(J, wrk->fy_scale);
243*b2055c35SXin Li       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
244*b2055c35SXin Li     }
245*b2055c35SXin Li   }
246*b2055c35SXin Li }
247*b2055c35SXin Li 
// Row-export entry point for the vertical-expand case. Emits
// wrk->dst_width * wrk->num_channels bytes into wrk->dst, using the plain
// scaling path when wrk->y_accum is zero and the frow/irow blending path
// otherwise.
// NOTE(review): the function name carries a "MIPSdspR2" suffix although this
// file is the MSA implementation; looks like a leftover name -- confirm.
static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const rescaler_t* const frow = wrk->frow;
  rescaler_t* const irow = wrk->irow;
  uint8_t* const dst = wrk->dst;
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(wrk->y_expand);
  assert(wrk->y_sub != 0);
  if (wrk->y_accum != 0) {
    ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
  } else {
    ExportRowExpand_0(frow, dst, x_out_max, wrk);
  }
}
263*b2055c35SXin Li 
264*b2055c35SXin Li #if 0  // disabled for now. TODO(skal): make match the C-code
265*b2055c35SXin Li static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
266*b2055c35SXin Li                                           uint8_t* dst, int length,
267*b2055c35SXin Li                                           const uint32_t yscale,
268*b2055c35SXin Li                                           WebPRescaler* const wrk) {
269*b2055c35SXin Li   const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
270*b2055c35SXin Li   const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
271*b2055c35SXin Li   const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
272*b2055c35SXin Li   const v4i32 zero = { 0 };
273*b2055c35SXin Li 
274*b2055c35SXin Li   while (length >= 16) {
275*b2055c35SXin Li     v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
276*b2055c35SXin Li     v16u8 out;
277*b2055c35SXin Li     LD_UW4(frow, 4, src0, src1, src2, src3);
278*b2055c35SXin Li     CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
279*b2055c35SXin Li                       frac0, frac1, frac2, frac3);
280*b2055c35SXin Li     LD_UW4(irow, 4, src0, src1, src2, src3);
281*b2055c35SXin Li     SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
282*b2055c35SXin Li          src0, src1, src2, src3);
283*b2055c35SXin Li     CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
284*b2055c35SXin Li     ST_UB(out, dst);
285*b2055c35SXin Li     ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
286*b2055c35SXin Li     frow   += 16;
287*b2055c35SXin Li     irow   += 16;
288*b2055c35SXin Li     dst    += 16;
289*b2055c35SXin Li     length -= 16;
290*b2055c35SXin Li   }
291*b2055c35SXin Li   if (length > 0) {
292*b2055c35SXin Li     int x_out;
293*b2055c35SXin Li     if (length >= 12) {
294*b2055c35SXin Li       uint32_t val0_m, val1_m, val2_m;
295*b2055c35SXin Li       v4u32 src0, src1, src2, frac0, frac1, frac2;
296*b2055c35SXin Li       LD_UW3(frow, 4, src0, src1, src2);
297*b2055c35SXin Li       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
298*b2055c35SXin Li       CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
299*b2055c35SXin Li       CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
300*b2055c35SXin Li       LD_UW3(irow, 4, src0, src1, src2);
301*b2055c35SXin Li       SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
302*b2055c35SXin Li       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
303*b2055c35SXin Li       CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
304*b2055c35SXin Li       CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
305*b2055c35SXin Li       SW3(val0_m, val1_m, val2_m, dst, 4);
306*b2055c35SXin Li       ST_UW3(frac0, frac1, frac2, irow, 4);
307*b2055c35SXin Li       frow   += 12;
308*b2055c35SXin Li       irow   += 12;
309*b2055c35SXin Li       dst    += 12;
310*b2055c35SXin Li       length -= 12;
311*b2055c35SXin Li     } else if (length >= 8) {
312*b2055c35SXin Li       uint32_t val0_m, val1_m;
313*b2055c35SXin Li       v4u32 src0, src1, frac0, frac1;
314*b2055c35SXin Li       LD_UW2(frow, 4, src0, src1);
315*b2055c35SXin Li       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
316*b2055c35SXin Li       CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
317*b2055c35SXin Li       LD_UW2(irow, 4, src0, src1);
318*b2055c35SXin Li       SUB2(src0, frac0, src1, frac1, src0, src1);
319*b2055c35SXin Li       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
320*b2055c35SXin Li       CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
321*b2055c35SXin Li       SW2(val0_m, val1_m, dst, 4);
322*b2055c35SXin Li       ST_UW2(frac0, frac1, irow, 4);
323*b2055c35SXin Li       frow   += 8;
324*b2055c35SXin Li       irow   += 8;
325*b2055c35SXin Li       dst    += 8;
326*b2055c35SXin Li       length -= 8;
327*b2055c35SXin Li     } else if (length >= 4) {
328*b2055c35SXin Li       uint32_t val0_m;
329*b2055c35SXin Li       v4u32 frac0;
330*b2055c35SXin Li       v4u32 src0 = LD_UW(frow);
331*b2055c35SXin Li       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
332*b2055c35SXin Li       src0 = LD_UW(irow);
333*b2055c35SXin Li       src0 = src0 - frac0;
334*b2055c35SXin Li       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
335*b2055c35SXin Li       SW(val0_m, dst);
336*b2055c35SXin Li       ST_UW(frac0, irow);
337*b2055c35SXin Li       frow   += 4;
338*b2055c35SXin Li       irow   += 4;
339*b2055c35SXin Li       dst    += 4;
340*b2055c35SXin Li       length -= 4;
341*b2055c35SXin Li     }
342*b2055c35SXin Li     for (x_out = 0; x_out < length; ++x_out) {
343*b2055c35SXin Li       const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
344*b2055c35SXin Li       const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
345*b2055c35SXin Li       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
346*b2055c35SXin Li       irow[x_out] = frac;
347*b2055c35SXin Li     }
348*b2055c35SXin Li   }
349*b2055c35SXin Li }
350*b2055c35SXin Li 
351*b2055c35SXin Li static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
352*b2055c35SXin Li                                           int length,
353*b2055c35SXin Li                                           WebPRescaler* const wrk) {
354*b2055c35SXin Li   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
355*b2055c35SXin Li   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
356*b2055c35SXin Li   const v4i32 zero = { 0 };
357*b2055c35SXin Li 
358*b2055c35SXin Li   while (length >= 16) {
359*b2055c35SXin Li     v4u32 src0, src1, src2, src3;
360*b2055c35SXin Li     v16u8 dst0;
361*b2055c35SXin Li     LD_UW4(irow, 4, src0, src1, src2, src3);
362*b2055c35SXin Li     CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
363*b2055c35SXin Li     ST_UB(dst0, dst);
364*b2055c35SXin Li     ST_SW4(zero, zero, zero, zero, irow, 4);
365*b2055c35SXin Li     length -= 16;
366*b2055c35SXin Li     irow   += 16;
367*b2055c35SXin Li     dst    += 16;
368*b2055c35SXin Li   }
369*b2055c35SXin Li   if (length > 0) {
370*b2055c35SXin Li     int x_out;
371*b2055c35SXin Li     if (length >= 12) {
372*b2055c35SXin Li       uint32_t val0_m, val1_m, val2_m;
373*b2055c35SXin Li       v4u32 src0, src1, src2;
374*b2055c35SXin Li       LD_UW3(irow, 4, src0, src1, src2);
375*b2055c35SXin Li       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
376*b2055c35SXin Li       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
377*b2055c35SXin Li       CALC_MULT_FIX_4(src2, scale, shift, val2_m);
378*b2055c35SXin Li       SW3(val0_m, val1_m, val2_m, dst, 4);
379*b2055c35SXin Li       ST_SW3(zero, zero, zero, irow, 4);
380*b2055c35SXin Li       length -= 12;
381*b2055c35SXin Li       irow   += 12;
382*b2055c35SXin Li       dst    += 12;
383*b2055c35SXin Li     } else if (length >= 8) {
384*b2055c35SXin Li       uint32_t val0_m, val1_m;
385*b2055c35SXin Li       v4u32 src0, src1;
386*b2055c35SXin Li       LD_UW2(irow, 4, src0, src1);
387*b2055c35SXin Li       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
388*b2055c35SXin Li       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
389*b2055c35SXin Li       SW2(val0_m, val1_m, dst, 4);
390*b2055c35SXin Li       ST_SW2(zero, zero, irow, 4);
391*b2055c35SXin Li       length -= 8;
392*b2055c35SXin Li       irow   += 8;
393*b2055c35SXin Li       dst    += 8;
394*b2055c35SXin Li     } else if (length >= 4) {
395*b2055c35SXin Li       uint32_t val0_m;
396*b2055c35SXin Li       const v4u32 src0 = LD_UW(irow + 0);
397*b2055c35SXin Li       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
398*b2055c35SXin Li       SW(val0_m, dst);
399*b2055c35SXin Li       ST_SW(zero, irow);
400*b2055c35SXin Li       length -= 4;
401*b2055c35SXin Li       irow   += 4;
402*b2055c35SXin Li       dst    += 4;
403*b2055c35SXin Li     }
404*b2055c35SXin Li     for (x_out = 0; x_out < length; ++x_out) {
405*b2055c35SXin Li       const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
406*b2055c35SXin Li       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
407*b2055c35SXin Li       irow[x_out] = 0;
408*b2055c35SXin Li     }
409*b2055c35SXin Li   }
410*b2055c35SXin Li }
411*b2055c35SXin Li 
412*b2055c35SXin Li static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
413*b2055c35SXin Li   uint8_t* dst = wrk->dst;
414*b2055c35SXin Li   rescaler_t* irow = wrk->irow;
415*b2055c35SXin Li   const int x_out_max = wrk->dst_width * wrk->num_channels;
416*b2055c35SXin Li   const rescaler_t* frow = wrk->frow;
417*b2055c35SXin Li   const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
418*b2055c35SXin Li   assert(!WebPRescalerOutputDone(wrk));
419*b2055c35SXin Li   assert(wrk->y_accum <= 0);
420*b2055c35SXin Li   assert(!wrk->y_expand);
421*b2055c35SXin Li   if (yscale) {
422*b2055c35SXin Li     ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
423*b2055c35SXin Li   } else {
424*b2055c35SXin Li     ExportRowShrink_1(irow, dst, x_out_max, wrk);
425*b2055c35SXin Li   }
426*b2055c35SXin Li }
427*b2055c35SXin Li #endif  // 0
428*b2055c35SXin Li 
429*b2055c35SXin Li //------------------------------------------------------------------------------
430*b2055c35SXin Li // Entry point
431*b2055c35SXin Li 
432*b2055c35SXin Li extern void WebPRescalerDspInitMSA(void);
433*b2055c35SXin Li 
WebPRescalerDspInitMSA(void)434*b2055c35SXin Li WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
435*b2055c35SXin Li   WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
436*b2055c35SXin Li //  WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
437*b2055c35SXin Li }
438*b2055c35SXin Li 
439*b2055c35SXin Li #else     // !WEBP_USE_MSA
440*b2055c35SXin Li 
441*b2055c35SXin Li WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
442*b2055c35SXin Li 
443*b2055c35SXin Li #endif    // WEBP_USE_MSA
444