// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS common macros
//
// Every macro below (except the OUTPUT_EARLY_CLOBBER_REGS_* helpers) expands
// to a sequence of adjacent string literals meant to be pasted into the
// template of an extended-asm statement. Macro arguments are the *names* of
// asm operands (they are stringified into "%[name]"), not C expressions.

#ifndef WEBP_DSP_MIPS_MACRO_H_
#define WEBP_DSP_MIPS_MACRO_H_

// LOCAL_GCC_VERSION is not defined in this header; it is expected to be
// provided by the including file. The explicit defined() test keeps the
// condition quiet under -Wundef (an undefined identifier evaluates to 0 in
// #if, so the outcome is unchanged).
#if defined(__GNUC__) && defined(__ANDROID__) && \
    defined(LOCAL_GCC_VERSION) && LOCAL_GCC_VERSION == 0x409
#define WORK_AROUND_GCC
#endif

// STR stringifies its argument as written; XSTR macro-expands the argument
// first and then stringifies the result.
#define STR(s) #s
#define XSTR(s) STR(s)

// O0[31..16 | 15..0] = I0[31..16 | 15..0] + I1[31..16 | 15..0]
// O1[31..16 | 15..0] = I0[31..16 | 15..0] - I1[31..16 | 15..0]
// O - output
// I - input (macro doesn't change it)
#define ADD_SUB_HALVES(O0, O1,                                                 \
                       I0, I1)                                                 \
  "addq.ph %[" #O0 "], %[" #I0 "], %[" #I1 "] \n\t"                            \
  "subq.ph %[" #O1 "], %[" #I0 "], %[" #I1 "] \n\t"

// Loads two halfwords from the asm operand named 'in'.
// O - output
// I[0/1] - offset in bytes (relative to %[in])
#define LOAD_IN_X2(O0, O1,                                                     \
                   I0, I1)                                                     \
  "lh %[" #O0 "], " #I0 "(%[in]) \n\t"                                         \
  "lh %[" #O1 "], " #I1 "(%[in]) \n\t"

// Four unaligned word loads: O<k> = *(I0 + I<k+1> + I9 * I<k+5>), k = 0..3
// O      - output
// I0     - location (base address operand)
// I1..I4 - offsets in bytes
// I5..I8 - multipliers applied to I9
// I9     - stride in bytes; it is macro-expanded before being stringified,
//          so it may be a #define'd constant
#define LOAD_WITH_OFFSET_X4(O0, O1, O2, O3,                                    \
                            I0, I1, I2, I3, I4, I5, I6, I7, I8, I9)            \
  "ulw %[" #O0 "], " #I1 "+" XSTR(I9) "*" #I5 "(%[" #I0 "]) \n\t"              \
  "ulw %[" #O1 "], " #I2 "+" XSTR(I9) "*" #I6 "(%[" #I0 "]) \n\t"              \
  "ulw %[" #O2 "], " #I3 "+" XSTR(I9) "*" #I7 "(%[" #I0 "]) \n\t"              \
  "ulw %[" #O3 "], " #I4 "+" XSTR(I9) "*" #I8 "(%[" #I0 "]) \n\t"

// MUL_SHIFT_C1: O = ((I * kC1) >> 16) + I
// MUL_SHIFT_C2: O = (I * kC2) >> 16
// The asm statement must provide operands named 'kC1' / 'kC2'.
// O - output
// I - input (macro doesn't change it so it should be different from O;
//     MUL_SHIFT_C1 reads I again after O has been written)
#define MUL_SHIFT_C1(O, I)                                                     \
  "mul %[" #O "], %[" #I "], %[kC1] \n\t"                                      \
  "sra %[" #O "], %[" #O "], 16 \n\t"                                          \
  "addu %[" #O "], %[" #O "], %[" #I "] \n\t"
#define MUL_SHIFT_C2(O, I)                                                     \
  "mul %[" #O "], %[" #I "], %[kC2] \n\t"                                      \
  "sra %[" #O "], %[" #O "], 16 \n\t"

// Same as #define MUL_SHIFT_C1 but I and O are the same. It stores the
// intermediary result in TMP.
#define MUL_SHIFT_C1_IO(IO, TMP)                                               \
  "mul %[" #TMP "], %[" #IO "], %[kC1] \n\t"                                   \
  "sra %[" #TMP "], %[" #TMP "], 16 \n\t"                                      \
  "addu %[" #IO "], %[" #TMP "], %[" #IO "] \n\t"

// O0 = MUL_SHIFT_C2(I0), O1 = MUL_SHIFT_C1(I0)
// O2 = MUL_SHIFT_C2(I1), O3 = MUL_SHIFT_C1(I1)
// O4 = MUL_SHIFT_C2(I2), O5 = MUL_SHIFT_C1(I2)
// O6 = MUL_SHIFT_C2(I3), O7 = MUL_SHIFT_C1(I3)
// IO0 += I4, IO1 += I5, IO2 -= I6, IO3 -= I7
// O - output
// IO - input/output
// I - input (macro doesn't change it)
#define MUL_SHIFT_SUM(O0, O1, O2, O3, O4, O5, O6, O7,                          \
                      IO0, IO1, IO2, IO3,                                      \
                      I0, I1, I2, I3, I4, I5, I6, I7)                          \
  MUL_SHIFT_C2(O0, I0)                                                         \
  MUL_SHIFT_C1(O1, I0)                                                         \
  MUL_SHIFT_C2(O2, I1)                                                         \
  MUL_SHIFT_C1(O3, I1)                                                         \
  MUL_SHIFT_C2(O4, I2)                                                         \
  MUL_SHIFT_C1(O5, I2)                                                         \
  MUL_SHIFT_C2(O6, I3)                                                         \
  MUL_SHIFT_C1(O7, I3)                                                         \
  "addu %[" #IO0 "], %[" #IO0 "], %[" #I4 "] \n\t"                             \
  "addu %[" #IO1 "], %[" #IO1 "], %[" #I5 "] \n\t"                             \
  "subu %[" #IO2 "], %[" #IO2 "], %[" #I6 "] \n\t"                             \
  "subu %[" #IO3 "], %[" #IO3 "], %[" #I7 "] \n\t"

// O<k>[31..16] = I<k>[15..0]; O<k>[15..0] is left untouched, so each O is
// effectively an input/output operand.
// O - output
// I - input (macro doesn't change it)
#define INSERT_HALF_X2(O0, O1,                                                 \
                       I0, I1)                                                 \
  "ins %[" #O0 "], %[" #I0 "], 16, 16 \n\t"                                    \
  "ins %[" #O1 "], %[" #I1 "], 16, 16 \n\t"

// O<k> = I<k> >> 16 (arithmetic shift)
// O - output
// I - input (macro doesn't change it)
#define SRA_16(O0, O1, O2, O3,                                                 \
               I0, I1, I2, I3)                                                 \
  "sra %[" #O0 "], %[" #I0 "], 16 \n\t"                                        \
  "sra %[" #O1 "], %[" #I1 "], 16 \n\t"                                        \
  "sra %[" #O2 "], %[" #I2 "], 16 \n\t"                                        \
  "sra %[" #O3 "], %[" #I3 "], 16 \n\t"

// temp0[31..16 | 15..0] = temp8[31..16 | 15..0] + temp12[31..16 | 15..0]
// temp1[31..16 | 15..0] = temp8[31..16 | 15..0] - temp12[31..16 | 15..0]
// temp0[31..16 | 15..0] = temp0[31..16 >> 3 | 15..0 >> 3]
// temp1[31..16 | 15..0] = temp1[31..16 >> 3 | 15..0 >> 3]
// O - output
// I - input (macro doesn't change it)
#define SHIFT_R_SUM_X2(O0, O1, O2, O3, O4, O5, O6, O7,                         \
                       I0, I1, I2, I3, I4, I5, I6, I7)                         \
  "addq.ph %[" #O0 "], %[" #I0 "], %[" #I4 "] \n\t"                            \
  "subq.ph %[" #O1 "], %[" #I0 "], %[" #I4 "] \n\t"                            \
  "addq.ph %[" #O2 "], %[" #I1 "], %[" #I5 "] \n\t"                            \
  "subq.ph %[" #O3 "], %[" #I1 "], %[" #I5 "] \n\t"                            \
  "addq.ph %[" #O4 "], %[" #I2 "], %[" #I6 "] \n\t"                            \
  "subq.ph %[" #O5 "], %[" #I2 "], %[" #I6 "] \n\t"                            \
  "addq.ph %[" #O6 "], %[" #I3 "], %[" #I7 "] \n\t"                            \
  "subq.ph %[" #O7 "], %[" #I3 "], %[" #I7 "] \n\t"                            \
  "shra.ph %[" #O0 "], %[" #O0 "], 3 \n\t"                                     \
  "shra.ph %[" #O1 "], %[" #O1 "], 3 \n\t"                                     \
  "shra.ph %[" #O2 "], %[" #O2 "], 3 \n\t"                                     \
  "shra.ph %[" #O3 "], %[" #O3 "], 3 \n\t"                                     \
  "shra.ph %[" #O4 "], %[" #O4 "], 3 \n\t"                                     \
  "shra.ph %[" #O5 "], %[" #O5 "], 3 \n\t"                                     \
  "shra.ph %[" #O6 "], %[" #O6 "], 3 \n\t"                                     \
  "shra.ph %[" #O7 "], %[" #O7 "], 3 \n\t"

// precrq.ph.w temp0, temp8, temp2
//   temp0 = temp8[31..16] | temp2[31..16]
// ins temp2, temp8, 16, 16
//   temp2 = temp8[15..0] | temp2[15..0]
//   (ins copies the low 16 bits of its source into bits 31..16 of the
//   destination)
// O - output
// IO - input/output
// I - input (macro doesn't change it)
#define PACK_2_HALVES_TO_WORD(O0, O1, O2, O3,                                  \
                              IO0, IO1, IO2, IO3,                              \
                              I0, I1, I2, I3)                                  \
  "precrq.ph.w %[" #O0 "], %[" #I0 "], %[" #IO0 "] \n\t"                       \
  "precrq.ph.w %[" #O1 "], %[" #I1 "], %[" #IO1 "] \n\t"                       \
  "ins %[" #IO0 "], %[" #I0 "], 16, 16 \n\t"                                   \
  "ins %[" #IO1 "], %[" #I1 "], 16, 16 \n\t"                                   \
  "precrq.ph.w %[" #O2 "], %[" #I2 "], %[" #IO2 "] \n\t"                       \
  "precrq.ph.w %[" #O3 "], %[" #I3 "], %[" #IO3 "] \n\t"                       \
  "ins %[" #IO2 "], %[" #I2 "], 16, 16 \n\t"                                   \
  "ins %[" #IO3 "], %[" #I3 "], 16, 16 \n\t"

// preceu.ph.qbr temp0, temp8
//   temp0 = 0 | temp8[15..8] | 0 | temp8[7..0]
// preceu.ph.qbl temp1, temp8
//   temp1 = 0 | temp8[31..24] | 0 | temp8[23..16]
// (each selected byte is zero-extended into its own 16-bit half)
// O - output
// I - input (macro doesn't change it)
#define CONVERT_2_BYTES_TO_HALF(O0, O1, O2, O3, O4, O5, O6, O7,                \
                                I0, I1, I2, I3)                                \
  "preceu.ph.qbr %[" #O0 "], %[" #I0 "] \n\t"                                  \
  "preceu.ph.qbl %[" #O1 "], %[" #I0 "] \n\t"                                  \
  "preceu.ph.qbr %[" #O2 "], %[" #I1 "] \n\t"                                  \
  "preceu.ph.qbl %[" #O3 "], %[" #I1 "] \n\t"                                  \
  "preceu.ph.qbr %[" #O4 "], %[" #I2 "] \n\t"                                  \
  "preceu.ph.qbl %[" #O5 "], %[" #I2 "] \n\t"                                  \
  "preceu.ph.qbr %[" #O6 "], %[" #I3 "] \n\t"                                  \
  "preceu.ph.qbl %[" #O7 "], %[" #I3 "] \n\t"

// temp0[31..16 | 15..0] = temp0[31..16 | 15..0] + temp8[31..16 | 15..0]
// temp0[31..16 | 15..0] = temp0[31..16 <<(s) 7 | 15..0 <<(s) 7]
// temp1..temp7 same as temp0
// precrqu_s.qb.ph temp0, temp1, temp0:
//   temp0 = temp1[31..24] | temp1[15..8] | temp0[31..24] | temp0[15..8]
// store temp0 to dst
// IO - input/output (IO0, IO2, IO4 and IO6 end up holding the stored words)
// I0..I7   - input (macro doesn't change it), added to IO0..IO7
// I8       - location (base address operand of the stores)
// I9..I12  - multipliers applied to I13, one per stored word
// I13      - stride in bytes; it is macro-expanded before being stringified
#define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7,               \
                         I0, I1, I2, I3, I4, I5, I6, I7,                       \
                         I8, I9, I10, I11, I12, I13)                           \
  "addq.ph %[" #IO0 "], %[" #IO0 "], %[" #I0 "] \n\t"                          \
  "addq.ph %[" #IO1 "], %[" #IO1 "], %[" #I1 "] \n\t"                          \
  "addq.ph %[" #IO2 "], %[" #IO2 "], %[" #I2 "] \n\t"                          \
  "addq.ph %[" #IO3 "], %[" #IO3 "], %[" #I3 "] \n\t"                          \
  "addq.ph %[" #IO4 "], %[" #IO4 "], %[" #I4 "] \n\t"                          \
  "addq.ph %[" #IO5 "], %[" #IO5 "], %[" #I5 "] \n\t"                          \
  "addq.ph %[" #IO6 "], %[" #IO6 "], %[" #I6 "] \n\t"                          \
  "addq.ph %[" #IO7 "], %[" #IO7 "], %[" #I7 "] \n\t"                          \
  "shll_s.ph %[" #IO0 "], %[" #IO0 "], 7 \n\t"                                 \
  "shll_s.ph %[" #IO1 "], %[" #IO1 "], 7 \n\t"                                 \
  "shll_s.ph %[" #IO2 "], %[" #IO2 "], 7 \n\t"                                 \
  "shll_s.ph %[" #IO3 "], %[" #IO3 "], 7 \n\t"                                 \
  "shll_s.ph %[" #IO4 "], %[" #IO4 "], 7 \n\t"                                 \
  "shll_s.ph %[" #IO5 "], %[" #IO5 "], 7 \n\t"                                 \
  "shll_s.ph %[" #IO6 "], %[" #IO6 "], 7 \n\t"                                 \
  "shll_s.ph %[" #IO7 "], %[" #IO7 "], 7 \n\t"                                 \
  "precrqu_s.qb.ph %[" #IO0 "], %[" #IO1 "], %[" #IO0 "] \n\t"                 \
  "precrqu_s.qb.ph %[" #IO2 "], %[" #IO3 "], %[" #IO2 "] \n\t"                 \
  "precrqu_s.qb.ph %[" #IO4 "], %[" #IO5 "], %[" #IO4 "] \n\t"                 \
  "precrqu_s.qb.ph %[" #IO6 "], %[" #IO7 "], %[" #IO6 "] \n\t"                 \
  "usw %[" #IO0 "], " XSTR(I13) "*" #I9 "(%[" #I8 "]) \n\t"                    \
  "usw %[" #IO2 "], " XSTR(I13) "*" #I10 "(%[" #I8 "]) \n\t"                   \
  "usw %[" #IO4 "], " XSTR(I13) "*" #I11 "(%[" #I8 "]) \n\t"                   \
  "usw %[" #IO6 "], " XSTR(I13) "*" #I12 "(%[" #I8 "]) \n\t"

// Output-operand list for an extended-asm statement, including the leading
// ':' separator. Declares temp1..temp10 as early-clobber ("=&r") register
// outputs; the enclosing function must have variables with exactly these
// names in scope.
#define OUTPUT_EARLY_CLOBBER_REGS_10()                                         \
  : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),             \
    [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),             \
    [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9),             \
    [temp10]"=&r"(temp10)

// Same as OUTPUT_EARLY_CLOBBER_REGS_10() extended with temp11..temp18.
#define OUTPUT_EARLY_CLOBBER_REGS_18()                                         \
  OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
  [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
  [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
  [temp17]"=&r"(temp17), [temp18]"=&r"(temp18)

#endif  // WEBP_DSP_MIPS_MACRO_H_