/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/**
 * Factor an unsigned 32-bit integer.
 *
 * Attempts to factor \c x into two values that are at most 0xFFFF.  If no
 * such factorization is possible, either because the value is too large or is
 * prime, both \c result_a and \c result_b will be zero.
 */
static void
factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b)
{
   /* This is necessary to prevent various opportunities for division by zero
    * below.
    */
   assert(x > 0xffff);

   /* This represents the actual expected constraints on the input.  Namely,
    * both the upper and lower words should be > 1.
    */
   assert(x >= 0x00020002);

   *result_a = 0;
   *result_b = 0;

   /* The value is too large to factor with the constraints. */
   if (x > (0xffffu * 0xffffu))
      return;

   /* A non-prime number will have the form p*q*d where p is some prime
    * number, q > 1, and 1 <= d <= q.  To meet the constraints of this
    * function, (p*d) < 0x10000.  This implies d <= floor(0xffff / p).
    * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)).  Finally,
    * floor(x / (0xffff * p)) <= d <= floor(0xffff / p).
    *
    * The observation is that finding the largest possible value of p reduces
    * the possible range of d.  After selecting p, all values of d in this
    * range are tested until a factorization is found.  The size of the range
    * of possible values of d sets an upper bound on the run time of the
    * function.
    */
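   /* Illustrative trace of the search, using the 0x063b0c83 value mentioned
    * in the max_d comment below: x = 0x063b0c83 = 1627 * 1367 * 47.  The
    * largest prime in the table that divides x is p = 1367 (1627 is not in
    * the table), so x_div_p = 1627 * 47 = 76469 >= 0x10000 and there is no
    * early exit.  Then max_d = 0xffff / 1367 = 47, d starts at
    * DIV_ROUND_UP(76469, 0xffff) = 2, and the first divisor found is d = 47,
    * giving q = 1627.  The results are a = 1627 and b = 1367 * 47 = 0xfaf9,
    * both of which fit in 16 bits.
    */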
   static const uint16_t primes[256] = {
         2,    3,    5,    7,   11,   13,   17,   19,
        23,   29,   31,   37,   41,   43,   47,   53,
        59,   61,   67,   71,   73,   79,   83,   89,
        97,  101,  103,  107,  109,  113,  127,  131,  /*  32 */
       137,  139,  149,  151,  157,  163,  167,  173,
       179,  181,  191,  193,  197,  199,  211,  223,
       227,  229,  233,  239,  241,  251,  257,  263,
       269,  271,  277,  281,  283,  293,  307,  311,  /*  64 */
       313,  317,  331,  337,  347,  349,  353,  359,
       367,  373,  379,  383,  389,  397,  401,  409,
       419,  421,  431,  433,  439,  443,  449,  457,
       461,  463,  467,  479,  487,  491,  499,  503,  /*  96 */
       509,  521,  523,  541,  547,  557,  563,  569,
       571,  577,  587,  593,  599,  601,  607,  613,
       617,  619,  631,  641,  643,  647,  653,  659,
       661,  673,  677,  683,  691,  701,  709,  719,   /* 128 */
       727,  733,  739,  743,  751,  757,  761,  769,
       773,  787,  797,  809,  811,  821,  823,  827,
       829,  839,  853,  857,  859,  863,  877,  881,
       883,  887,  907,  911,  919,  929,  937,  941,  /* 160 */
       947,  953,  967,  971,  977,  983,  991,  997,
      1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
      1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
      1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,  /* 192 */
      1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
      1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
      1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
      1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423,  /* 224 */
      1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
      1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
      1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
      1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619,  /* 256 */
   };

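   /* Find the largest prime in the table that evenly divides x. */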
   unsigned p;
   unsigned x_div_p;

   for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) {
      p = primes[i];
      x_div_p = x / p;

      if ((x_div_p * p) == x)
         break;
   }

   /* No prime from the table evenly divides x. */
   if (x_div_p * p != x)
      return;

   /* Terminate early if d=1 is a solution. */
   if (x_div_p < 0x10000) {
      *result_a = x_div_p;
      *result_b = p;
      return;
   }

   /* Pick the maximum possible value for 'd'.  It's important that the loop
    * below execute while d <= max_d because max_d is a valid value.  Having
    * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be
    * incorrectly reported as not being factorable.  The problem would occur
    * with any value that is a product of two primes in the table and one
    * prime not in the table.
    */
   const unsigned max_d = 0xffff / p;

   /* Pick an initial value of 'd' that (combined with rejecting too large
    * values above) guarantees that 'q' will always be small enough.
    * DIV_ROUND_UP is used to prevent 'd' from being zero.
    */
   for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) {
      unsigned q = x_div_p / d;

      if ((q * d) == x_div_p) {
         assert(p * d * q == x);
         assert((p * d) < 0x10000);

         *result_a = q;
         *result_b = p * d;
         break;
      }

      /* Since every value of 'd' is tried, as soon as 'd' is larger
       * than 'q', we're just re-testing combinations that have
       * already been tested.
       */
      if (d > q)
         break;
   }
}

static void
brw_fs_lower_mul_dword_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
{
   const intel_device_info *devinfo = s.devinfo;
   const fs_builder ibld(&s, block, inst);

   /* It is correct to use inst->src[1].d at both ends of the comparison.
    * Using .ud in the UINT16_MAX comparison would cause any negative value to
    * fail the check.
    */
   if (inst->src[1].file == IMM &&
       (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
      /* The MUL instruction isn't commutative. On Gen >= 7 only
       * the low 16-bits of src1 are used.
       *
       * If multiplying by an immediate value that fits in 16-bits, do a
       * single MUL instruction with that value in the proper location.
       */
      const bool ud = (inst->src[1].d >= 0);
      ibld.MUL(inst->dst, inst->src[0],
               ud ? brw_imm_uw(inst->src[1].ud)
                  : brw_imm_w(inst->src[1].d));
   } else {
      /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
       * do 32-bit integer multiplication in one instruction, but instead
       * must do a sequence (which actually calculates a 64-bit result):
       *
       *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
       *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
       *    mov(8)  g2<1>D     acc0<8,8,1>D
       *
       * But on Gen > 6, the ability to use the second accumulator register
       * (acc1) for non-float data types was removed, preventing a simple
       * implementation in SIMD16. A 16-channel result can be calculated by
       * executing the three instructions twice in SIMD8, once with quarter
       * control of 1Q for the first eight channels and again with 2Q for
       * the second eight channels.
       *
       * Which accumulator register is implicitly accessed (by AccWrEnable
       * for instance) is determined by the quarter control. Unfortunately
       * Ivybridge (and presumably Baytrail) has a hardware bug in which an
       * implicit accumulator access by an instruction with 2Q will access
       * acc1 regardless of whether the data type is usable in acc1.
       *
       * Specifically, the 2Q mach(8) writes acc1 which does not exist for
       * integer data types.
       *
       * Since we only want the low 32-bits of the result, we can do two
       * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
       * adjust the high result and add them (like the mach is doing):
       *
       *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
       *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
       *    shl(8)  g9<1>D     g8<8,8,1>D      16D
       *    add(8)  g2<1>D     g7<8,8,1>D      g9<8,8,1>D
       *
       * We avoid the shl instruction by realizing that we only want to add
       * the low 16-bits of the "high" result to the high 16-bits of the
       * "low" result and using proper regioning on the add:
       *
       *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
       *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
       *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
       *
       * Since it does not use the (single) accumulator register, we can
       * schedule multi-component multiplications much better.
       */

      bool needs_mov = false;
      brw_reg orig_dst = inst->dst;

      /* Get a new VGRF for the "low" 32x16-bit multiplication result if
       * reusing the original destination is impossible due to hardware
       * restrictions, source/destination overlap, or it being the null
       * register.
       */
      brw_reg low = inst->dst;
      if (orig_dst.is_null() ||
          regions_overlap(inst->dst, inst->size_written,
                          inst->src[0], inst->size_read(0)) ||
          regions_overlap(inst->dst, inst->size_written,
                          inst->src[1], inst->size_read(1)) ||
          inst->dst.stride >= 4) {
         needs_mov = true;
         low = brw_vgrf(s.alloc.allocate(regs_written(inst)),
                        inst->dst.type);
      }

      /* Get a new VGRF but keep the same stride as inst->dst */
      brw_reg high = brw_vgrf(s.alloc.allocate(regs_written(inst)), inst->dst.type);
      high.stride = inst->dst.stride;
      high.offset = inst->dst.offset % REG_SIZE;

      bool do_addition = true;
      {
         /* From Wa_1604601757:
          *
          * "When multiplying a DW and any lower precision integer, source modifier
          *  is not supported."
          *
          * An unsupported negate modifier on src[1] would ordinarily be
          * lowered by the subsequent lower_regioning pass.  In this case that
          * pass would spawn another dword multiply.  Instead, lower the
          * modifier first.
          */
         const bool source_mods_unsupported = (devinfo->ver >= 12);

         if (inst->src[1].abs || (inst->src[1].negate &&
                                  source_mods_unsupported))
            lower_src_modifiers(&s, block, inst, 1);

         if (inst->src[1].file == IMM) {
            unsigned a;
            unsigned b;

            /* If the immediate value can be factored into two values, A and
             * B, that each fit in 16-bits, the multiplication result can
             * instead be calculated as (src0 * (A * B)) = ((src0 * A) * B).
             * This saves an operation (the addition) and a temporary register
             * (high).
             *
             * Skip the optimization if either the high word or the low word
             * is 0 or 1.  In these conditions, at least one of the
             * multiplications generated by the straightforward method will be
             * eliminated anyway.
             */
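            /* For example, a hypothetical immediate of 0x00040008 (262152 =
             * 331 * 792) would be lowered to a MUL by 792 followed by a MUL
             * by 331, instead of two MULs plus an ADD.
             */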
            if (inst->src[1].ud > 0x0001ffff &&
                (inst->src[1].ud & 0xffff) > 1) {
               factor_uint32(inst->src[1].ud, &a, &b);

               if (a != 0) {
                  ibld.MUL(low, inst->src[0], brw_imm_uw(a));
                  ibld.MUL(low, low, brw_imm_uw(b));
                  do_addition = false;
               }
            }

            if (do_addition) {
               ibld.MUL(low, inst->src[0],
                        brw_imm_uw(inst->src[1].ud & 0xffff));
               ibld.MUL(high, inst->src[0],
                        brw_imm_uw(inst->src[1].ud >> 16));
            }
         } else {
            ibld.MUL(low, inst->src[0],
                     subscript(inst->src[1], BRW_TYPE_UW, 0));
            ibld.MUL(high, inst->src[0],
                     subscript(inst->src[1], BRW_TYPE_UW, 1));
         }
      }

      if (do_addition) {
         ibld.ADD(subscript(low, BRW_TYPE_UW, 1),
                  subscript(low, BRW_TYPE_UW, 1),
                  subscript(high, BRW_TYPE_UW, 0));
      }

      if (needs_mov || inst->conditional_mod)
         set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
   }
}

static void
brw_fs_lower_mul_qword_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
{
   const intel_device_info *devinfo = s.devinfo;
   const fs_builder ibld(&s, block, inst);

   /* Considering two 64-bit integers ab and cd where each letter        ab
    * corresponds to 32 bits, we get a 128-bit result WXYZ. We         * cd
    * only need to provide the YZ part of the result.               -------
    *                                                                    BD
    *  Only BD needs to be 64 bits. For AD and BC we only care       +  AD
    *  about the lower 32 bits (since they are part of the upper     +  BC
    *  32 bits of our result). AC is not needed since it starts      + AC
    *  on the 65th bit of the result.                               -------
    *                                                                  WXYZ
    */
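   /* Equivalently, with a and b the high and low dwords of src0 and c and d
    * the high and low dwords of src1:
    *
    *    (a * 2^32 + b) * (c * 2^32 + d) mod 2^64
    *       = (b * d + ((a * d + b * c) * 2^32)) mod 2^64
    */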
   unsigned int q_regs = regs_written(inst);
   unsigned int d_regs = (q_regs + 1) / 2;

   brw_reg bd = brw_vgrf(s.alloc.allocate(q_regs), BRW_TYPE_UQ);
   brw_reg ad = brw_vgrf(s.alloc.allocate(d_regs), BRW_TYPE_UD);
   brw_reg bc = brw_vgrf(s.alloc.allocate(d_regs), BRW_TYPE_UD);

   /* Here we need the full 64 bit result for 32b * 32b. */
   if (devinfo->has_integer_dword_mul) {
      ibld.MUL(bd, subscript(inst->src[0], BRW_TYPE_UD, 0),
               subscript(inst->src[1], BRW_TYPE_UD, 0));
   } else {
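      /* Without a native 32x32-bit DW multiply, build the full 64-bit B*D
       * product with the MUL/MACH idiom described in
       * brw_fs_lower_mul_dword_inst() above: after the MUL and MACH, the
       * accumulator holds the low 32 bits of the product and the MACH
       * destination holds the high 32 bits.
       */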
      brw_reg bd_high = brw_vgrf(s.alloc.allocate(d_regs), BRW_TYPE_UD);
      brw_reg bd_low  = brw_vgrf(s.alloc.allocate(d_regs), BRW_TYPE_UD);
      const unsigned acc_width = reg_unit(devinfo) * 8;
      brw_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_TYPE_UD),
                              inst->group % acc_width);

      fs_inst *mul = ibld.MUL(acc,
                              subscript(inst->src[0], BRW_TYPE_UD, 0),
                              subscript(inst->src[1], BRW_TYPE_UW, 0));
      mul->writes_accumulator = true;

      ibld.MACH(bd_high, subscript(inst->src[0], BRW_TYPE_UD, 0),
                subscript(inst->src[1], BRW_TYPE_UD, 0));
      ibld.MOV(bd_low, acc);

      ibld.UNDEF(bd);
      ibld.MOV(subscript(bd, BRW_TYPE_UD, 0), bd_low);
      ibld.MOV(subscript(bd, BRW_TYPE_UD, 1), bd_high);
   }

   ibld.MUL(ad, subscript(inst->src[0], BRW_TYPE_UD, 1),
            subscript(inst->src[1], BRW_TYPE_UD, 0));
   ibld.MUL(bc, subscript(inst->src[0], BRW_TYPE_UD, 0),
            subscript(inst->src[1], BRW_TYPE_UD, 1));

   ibld.ADD(ad, ad, bc);
   ibld.ADD(subscript(bd, BRW_TYPE_UD, 1),
            subscript(bd, BRW_TYPE_UD, 1), ad);

   if (devinfo->has_64bit_int) {
      ibld.MOV(inst->dst, bd);
   } else {
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);
      ibld.MOV(subscript(inst->dst, BRW_TYPE_UD, 0),
               subscript(bd, BRW_TYPE_UD, 0));
      ibld.MOV(subscript(inst->dst, BRW_TYPE_UD, 1),
               subscript(bd, BRW_TYPE_UD, 1));
   }
}

static void
brw_fs_lower_mulh_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
{
   const intel_device_info *devinfo = s.devinfo;
   const fs_builder ibld(&s, block, inst);

   /* According to the BDW+ BSpec page for the "Multiply Accumulate
    * High" instruction:
    *
    *  "An added preliminary mov is required for source modification on
    *   src1:
    *      mov (8) r3.0<1>:d -r3<8;8,1>:d
    *      mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
    *      mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
    */
   if (inst->src[1].negate || inst->src[1].abs)
      lower_src_modifiers(&s, block, inst, 1);

   /* Should have been lowered to 8-wide. */
   assert(inst->exec_size <= brw_fs_get_lowered_simd_width(&s, inst));
   const unsigned acc_width = reg_unit(devinfo) * 8;
   const brw_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), inst->dst.type),
                                 inst->group % acc_width);
   fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
   ibld.MACH(inst->dst, inst->src[0], inst->src[1]);

   /* Until Gfx8, integer multiplies read 32-bits from one source and 16-bits
    * from the other, relying on the MACH instruction to generate the high
    * bits of the result.
    *
    * On Gfx8, the multiply instruction does a full 32x32-bit multiply, but in
    * order to do a 64-bit multiply we can simulate the previous behavior and
    * then use a MACH instruction.
    */
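   /* Reinterpret src1 of the MUL as UW with twice the stride so that only the
    * low 16 bits of each doubleword element are read.
    */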
   assert(mul->src[1].type == BRW_TYPE_D ||
          mul->src[1].type == BRW_TYPE_UD);
   mul->src[1].type = BRW_TYPE_UW;
   mul->src[1].stride *= 2;

   if (mul->src[1].file == IMM) {
      mul->src[1] = brw_imm_uw(mul->src[1].ud);
   }
}

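/**
 * Lower integer MUL and MULH instructions into sequences of instructions
 * that the hardware can execute for the given operand types and device
 * generation.
 */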
bool
brw_fs_lower_integer_multiplication(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_MUL) {
         /* If the instruction is already in a form that does not need
          * lowering, skip it.
          */
         if (brw_type_size_bytes(inst->src[1].type) < 4 && brw_type_size_bytes(inst->src[0].type) <= 4)
            continue;

         if ((inst->dst.type == BRW_TYPE_Q ||
              inst->dst.type == BRW_TYPE_UQ) &&
             (inst->src[0].type == BRW_TYPE_Q ||
              inst->src[0].type == BRW_TYPE_UQ) &&
             (inst->src[1].type == BRW_TYPE_Q ||
              inst->src[1].type == BRW_TYPE_UQ)) {
            brw_fs_lower_mul_qword_inst(s, inst, block);
            inst->remove(block);
            progress = true;
         } else if (!inst->dst.is_accumulator() &&
                    (inst->dst.type == BRW_TYPE_D ||
                     inst->dst.type == BRW_TYPE_UD) &&
                    (!devinfo->has_integer_dword_mul ||
                     devinfo->verx10 >= 125)) {
            brw_fs_lower_mul_dword_inst(s, inst, block);
            inst->remove(block);
            progress = true;
         }
      } else if (inst->opcode == SHADER_OPCODE_MULH) {
         brw_fs_lower_mulh_inst(s, inst, block);
         inst->remove(block);
         progress = true;
      }

   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}