1 /*
2 * Copyright © 2010 Intel Corporation
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "brw_fs.h"
7 #include "brw_fs_builder.h"
8
9 using namespace brw;
10
11 /**
12 * Factor an unsigned 32-bit integer.
13 *
14 * Attempts to factor \c x into two values that are at most 0xFFFF. If no
15 * such factorization is possible, either because the value is too large or is
16 * prime, both \c result_a and \c result_b will be zero.
17 */
18 static void
factor_uint32(uint32_t x,unsigned * result_a,unsigned * result_b)19 factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b)
20 {
21 /* This is necessary to prevent various opportunities for division by zero
22 * below.
23 */
24 assert(x > 0xffff);
25
26 /* This represents the actual expected constraints on the input. Namely,
27 * both the upper and lower words should be > 1.
28 */
29 assert(x >= 0x00020002);
30
31 *result_a = 0;
32 *result_b = 0;
33
34 /* The value is too large to factor with the constraints. */
35 if (x > (0xffffu * 0xffffu))
36 return;
37
38 /* A non-prime number will have the form p*q*d where p is some prime
39 * number, q > 1, and 1 <= d <= q. To meet the constraints of this
40 * function, (p*d) < 0x10000. This implies d <= floor(0xffff / p).
41 * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally,
42 * floor(x / (0xffff * p)) <= d <= floor(0xffff / p).
43 *
44 * The observation is finding the largest possible value of p reduces the
45 * possible range of d. After selecting p, all values of d in this range
46 * are tested until a factorization is found. The size of the range of
47 * possible values of d sets an upper bound on the run time of the
48 * function.
49 */
50 static const uint16_t primes[256] = {
51 2, 3, 5, 7, 11, 13, 17, 19,
52 23, 29, 31, 37, 41, 43, 47, 53,
53 59, 61, 67, 71, 73, 79, 83, 89,
54 97, 101, 103, 107, 109, 113, 127, 131, /* 32 */
55 137, 139, 149, 151, 157, 163, 167, 173,
56 179, 181, 191, 193, 197, 199, 211, 223,
57 227, 229, 233, 239, 241, 251, 257, 263,
58 269, 271, 277, 281, 283, 293, 307, 311, /* 64 */
59 313, 317, 331, 337, 347, 349, 353, 359,
60 367, 373, 379, 383, 389, 397, 401, 409,
61 419, 421, 431, 433, 439, 443, 449, 457,
62 461, 463, 467, 479, 487, 491, 499, 503, /* 96 */
63 509, 521, 523, 541, 547, 557, 563, 569,
64 571, 577, 587, 593, 599, 601, 607, 613,
65 617, 619, 631, 641, 643, 647, 653, 659,
66 661, 673, 677, 683, 691, 701, 709, 719, /* 128 */
67 727, 733, 739, 743, 751, 757, 761, 769,
68 773, 787, 797, 809, 811, 821, 823, 827,
69 829, 839, 853, 857, 859, 863, 877, 881,
70 883, 887, 907, 911, 919, 929, 937, 941, /* 160 */
71 947, 953, 967, 971, 977, 983, 991, 997,
72 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
73 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
74 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */
75 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
76 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
77 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
78 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */
79 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
80 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
81 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
82 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */
83 };
84
85 unsigned p;
86 unsigned x_div_p;
87
88 for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) {
89 p = primes[i];
90 x_div_p = x / p;
91
92 if ((x_div_p * p) == x)
93 break;
94 }
95
96 /* A prime factor was not found. */
97 if (x_div_p * p != x)
98 return;
99
100 /* Terminate early if d=1 is a solution. */
101 if (x_div_p < 0x10000) {
102 *result_a = x_div_p;
103 *result_b = p;
104 return;
105 }
106
107 /* Pick the maximum possible value for 'd'. It's important that the loop
108 * below execute while d <= max_d because max_d is a valid value. Having
109 * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be
110 * incorrectly reported as not being factorable. The problem would occur
111 * with any value that is a factor of two primes in the table and one prime
112 * not in the table.
113 */
114 const unsigned max_d = 0xffff / p;
115
116 /* Pick an initial value of 'd' that (combined with rejecting too large
117 * values above) guarantees that 'q' will always be small enough.
118 * DIV_ROUND_UP is used to prevent 'd' from being zero.
119 */
120 for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) {
121 unsigned q = x_div_p / d;
122
123 if ((q * d) == x_div_p) {
124 assert(p * d * q == x);
125 assert((p * d) < 0x10000);
126
127 *result_a = q;
128 *result_b = p * d;
129 break;
130 }
131
132 /* Since every value of 'd' is tried, as soon as 'd' is larger
133 * than 'q', we're just re-testing combinations that have
134 * already been tested.
135 */
136 if (d > q)
137 break;
138 }
139 }
140
141 static void
brw_fs_lower_mul_dword_inst(fs_visitor & s,fs_inst * inst,bblock_t * block)142 brw_fs_lower_mul_dword_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
143 {
144 const intel_device_info *devinfo = s.devinfo;
145 const fs_builder ibld(&s, block, inst);
146
147 /* It is correct to use inst->src[1].d in both end of the comparison.
148 * Using .ud in the UINT16_MAX comparison would cause any negative value to
149 * fail the check.
150 */
151 if (inst->src[1].file == IMM &&
152 (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
153 /* The MUL instruction isn't commutative. On Gen >= 7 only
154 * the low 16-bits of src1 are used.
155 *
156 * If multiplying by an immediate value that fits in 16-bits, do a
157 * single MUL instruction with that value in the proper location.
158 */
159 const bool ud = (inst->src[1].d >= 0);
160 ibld.MUL(inst->dst, inst->src[0],
161 ud ? brw_imm_uw(inst->src[1].ud)
162 : brw_imm_w(inst->src[1].d));
163 } else {
164 /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
165 * do 32-bit integer multiplication in one instruction, but instead
166 * must do a sequence (which actually calculates a 64-bit result):
167 *
168 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
169 * mach(8) null g3<8,8,1>D g4<8,8,1>D
170 * mov(8) g2<1>D acc0<8,8,1>D
171 *
172 * But on Gen > 6, the ability to use second accumulator register
173 * (acc1) for non-float data types was removed, preventing a simple
174 * implementation in SIMD16. A 16-channel result can be calculated by
175 * executing the three instructions twice in SIMD8, once with quarter
176 * control of 1Q for the first eight channels and again with 2Q for
177 * the second eight channels.
178 *
179 * Which accumulator register is implicitly accessed (by AccWrEnable
180 * for instance) is determined by the quarter control. Unfortunately
181 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
182 * implicit accumulator access by an instruction with 2Q will access
183 * acc1 regardless of whether the data type is usable in acc1.
184 *
185 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
186 * integer data types.
187 *
188 * Since we only want the low 32-bits of the result, we can do two
189 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
190 * adjust the high result and add them (like the mach is doing):
191 *
192 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
193 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
194 * shl(8) g9<1>D g8<8,8,1>D 16D
195 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
196 *
197 * We avoid the shl instruction by realizing that we only want to add
198 * the low 16-bits of the "high" result to the high 16-bits of the
199 * "low" result and using proper regioning on the add:
200 *
201 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
202 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
203 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
204 *
205 * Since it does not use the (single) accumulator register, we can
206 * schedule multi-component multiplications much better.
207 */
208
209 bool needs_mov = false;
210 brw_reg orig_dst = inst->dst;
211
212 /* Get a new VGRF for the "low" 32x16-bit multiplication result if
213 * reusing the original destination is impossible due to hardware
214 * restrictions, source/destination overlap, or it being the null
215 * register.
216 */
217 brw_reg low = inst->dst;
218 if (orig_dst.is_null() ||
219 regions_overlap(inst->dst, inst->size_written,
220 inst->src[0], inst->size_read(0)) ||
221 regions_overlap(inst->dst, inst->size_written,
222 inst->src[1], inst->size_read(1)) ||
223 inst->dst.stride >= 4) {
224 needs_mov = true;
225 low = brw_vgrf(s.alloc.allocate(regs_written(inst)),
226 inst->dst.type);
227 }
228
229 /* Get a new VGRF but keep the same stride as inst->dst */
230 brw_reg high = brw_vgrf(s.alloc.allocate(regs_written(inst)), inst->dst.type);
231 high.stride = inst->dst.stride;
232 high.offset = inst->dst.offset % REG_SIZE;
233
234 bool do_addition = true;
235 {
236 /* From Wa_1604601757:
237 *
238 * "When multiplying a DW and any lower precision integer, source modifier
239 * is not supported."
240 *
241 * An unsupported negate modifier on src[1] would ordinarily be
242 * lowered by the subsequent lower_regioning pass. In this case that
243 * pass would spawn another dword multiply. Instead, lower the
244 * modifier first.
245 */
246 const bool source_mods_unsupported = (devinfo->ver >= 12);
247
248 if (inst->src[1].abs || (inst->src[1].negate &&
249 source_mods_unsupported))
250 lower_src_modifiers(&s, block, inst, 1);
251
252 if (inst->src[1].file == IMM) {
253 unsigned a;
254 unsigned b;
255
256 /* If the immeditate value can be factored into two values, A and
257 * B, that each fit in 16-bits, the multiplication result can
258 * instead be calculated as (src1 * (A * B)) = ((src1 * A) * B).
259 * This saves an operation (the addition) and a temporary register
260 * (high).
261 *
262 * Skip the optimization if either the high word or the low word
263 * is 0 or 1. In these conditions, at least one of the
264 * multiplications generated by the straightforward method will be
265 * eliminated anyway.
266 */
267 if (inst->src[1].ud > 0x0001ffff &&
268 (inst->src[1].ud & 0xffff) > 1) {
269 factor_uint32(inst->src[1].ud, &a, &b);
270
271 if (a != 0) {
272 ibld.MUL(low, inst->src[0], brw_imm_uw(a));
273 ibld.MUL(low, low, brw_imm_uw(b));
274 do_addition = false;
275 }
276 }
277
278 if (do_addition) {
279 ibld.MUL(low, inst->src[0],
280 brw_imm_uw(inst->src[1].ud & 0xffff));
281 ibld.MUL(high, inst->src[0],
282 brw_imm_uw(inst->src[1].ud >> 16));
283 }
284 } else {
285 ibld.MUL(low, inst->src[0],
286 subscript(inst->src[1], BRW_TYPE_UW, 0));
287 ibld.MUL(high, inst->src[0],
288 subscript(inst->src[1], BRW_TYPE_UW, 1));
289 }
290 }
291
292 if (do_addition) {
293 ibld.ADD(subscript(low, BRW_TYPE_UW, 1),
294 subscript(low, BRW_TYPE_UW, 1),
295 subscript(high, BRW_TYPE_UW, 0));
296 }
297
298 if (needs_mov || inst->conditional_mod)
299 set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
300 }
301 }
302
303 static void
brw_fs_lower_mul_qword_inst(fs_visitor & s,fs_inst * inst,bblock_t * block)304 brw_fs_lower_mul_qword_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
305 {
306 const intel_device_info *devinfo = s.devinfo;
307 const fs_builder ibld(&s, block, inst);
308
309 /* Considering two 64-bit integers ab and cd where each letter ab
310 * corresponds to 32 bits, we get a 128-bit result WXYZ. We * cd
311 * only need to provide the YZ part of the result. -------
312 * BD
313 * Only BD needs to be 64 bits. For AD and BC we only care + AD
314 * about the lower 32 bits (since they are part of the upper + BC
315 * 32 bits of our result). AC is not needed since it starts + AC
316 * on the 65th bit of the result. -------
317 * WXYZ
318 */
319 unsigned int q_regs = regs_written(inst);
320 unsigned int d_regs = (q_regs + 1) / 2;
321
322 brw_reg bd = brw_vgrf(s.alloc.allocate(q_regs), BRW_TYPE_UQ);
323 brw_reg ad = brw_vgrf(s.alloc.allocate(d_regs), BRW_TYPE_UD);
324 brw_reg bc = brw_vgrf(s.alloc.allocate(d_regs), BRW_TYPE_UD);
325
326 /* Here we need the full 64 bit result for 32b * 32b. */
327 if (devinfo->has_integer_dword_mul) {
328 ibld.MUL(bd, subscript(inst->src[0], BRW_TYPE_UD, 0),
329 subscript(inst->src[1], BRW_TYPE_UD, 0));
330 } else {
331 brw_reg bd_high = brw_vgrf(s.alloc.allocate(d_regs), BRW_TYPE_UD);
332 brw_reg bd_low = brw_vgrf(s.alloc.allocate(d_regs), BRW_TYPE_UD);
333 const unsigned acc_width = reg_unit(devinfo) * 8;
334 brw_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_TYPE_UD),
335 inst->group % acc_width);
336
337 fs_inst *mul = ibld.MUL(acc,
338 subscript(inst->src[0], BRW_TYPE_UD, 0),
339 subscript(inst->src[1], BRW_TYPE_UW, 0));
340 mul->writes_accumulator = true;
341
342 ibld.MACH(bd_high, subscript(inst->src[0], BRW_TYPE_UD, 0),
343 subscript(inst->src[1], BRW_TYPE_UD, 0));
344 ibld.MOV(bd_low, acc);
345
346 ibld.UNDEF(bd);
347 ibld.MOV(subscript(bd, BRW_TYPE_UD, 0), bd_low);
348 ibld.MOV(subscript(bd, BRW_TYPE_UD, 1), bd_high);
349 }
350
351 ibld.MUL(ad, subscript(inst->src[0], BRW_TYPE_UD, 1),
352 subscript(inst->src[1], BRW_TYPE_UD, 0));
353 ibld.MUL(bc, subscript(inst->src[0], BRW_TYPE_UD, 0),
354 subscript(inst->src[1], BRW_TYPE_UD, 1));
355
356 ibld.ADD(ad, ad, bc);
357 ibld.ADD(subscript(bd, BRW_TYPE_UD, 1),
358 subscript(bd, BRW_TYPE_UD, 1), ad);
359
360 if (devinfo->has_64bit_int) {
361 ibld.MOV(inst->dst, bd);
362 } else {
363 if (!inst->is_partial_write())
364 ibld.emit_undef_for_dst(inst);
365 ibld.MOV(subscript(inst->dst, BRW_TYPE_UD, 0),
366 subscript(bd, BRW_TYPE_UD, 0));
367 ibld.MOV(subscript(inst->dst, BRW_TYPE_UD, 1),
368 subscript(bd, BRW_TYPE_UD, 1));
369 }
370 }
371
372 static void
brw_fs_lower_mulh_inst(fs_visitor & s,fs_inst * inst,bblock_t * block)373 brw_fs_lower_mulh_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
374 {
375 const intel_device_info *devinfo = s.devinfo;
376 const fs_builder ibld(&s, block, inst);
377
378 /* According to the BDW+ BSpec page for the "Multiply Accumulate
379 * High" instruction:
380 *
381 * "An added preliminary mov is required for source modification on
382 * src1:
383 * mov (8) r3.0<1>:d -r3<8;8,1>:d
384 * mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
385 * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
386 */
387 if (inst->src[1].negate || inst->src[1].abs)
388 lower_src_modifiers(&s, block, inst, 1);
389
390 /* Should have been lowered to 8-wide. */
391 assert(inst->exec_size <= brw_fs_get_lowered_simd_width(&s, inst));
392 const unsigned acc_width = reg_unit(devinfo) * 8;
393 const brw_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), inst->dst.type),
394 inst->group % acc_width);
395 fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
396 ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
397
398 /* Until Gfx8, integer multiplies read 32-bits from one source,
399 * and 16-bits from the other, and relying on the MACH instruction
400 * to generate the high bits of the result.
401 *
402 * On Gfx8, the multiply instruction does a full 32x32-bit
403 * multiply, but in order to do a 64-bit multiply we can simulate
404 * the previous behavior and then use a MACH instruction.
405 */
406 assert(mul->src[1].type == BRW_TYPE_D ||
407 mul->src[1].type == BRW_TYPE_UD);
408 mul->src[1].type = BRW_TYPE_UW;
409 mul->src[1].stride *= 2;
410
411 if (mul->src[1].file == IMM) {
412 mul->src[1] = brw_imm_uw(mul->src[1].ud);
413 }
414 }
415
416 bool
brw_fs_lower_integer_multiplication(fs_visitor & s)417 brw_fs_lower_integer_multiplication(fs_visitor &s)
418 {
419 const intel_device_info *devinfo = s.devinfo;
420 bool progress = false;
421
422 foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
423 if (inst->opcode == BRW_OPCODE_MUL) {
424 /* If the instruction is already in a form that does not need lowering,
425 * return early.
426 */
427 if (brw_type_size_bytes(inst->src[1].type) < 4 && brw_type_size_bytes(inst->src[0].type) <= 4)
428 continue;
429
430 if ((inst->dst.type == BRW_TYPE_Q ||
431 inst->dst.type == BRW_TYPE_UQ) &&
432 (inst->src[0].type == BRW_TYPE_Q ||
433 inst->src[0].type == BRW_TYPE_UQ) &&
434 (inst->src[1].type == BRW_TYPE_Q ||
435 inst->src[1].type == BRW_TYPE_UQ)) {
436 brw_fs_lower_mul_qword_inst(s, inst, block);
437 inst->remove(block);
438 progress = true;
439 } else if (!inst->dst.is_accumulator() &&
440 (inst->dst.type == BRW_TYPE_D ||
441 inst->dst.type == BRW_TYPE_UD) &&
442 (!devinfo->has_integer_dword_mul ||
443 devinfo->verx10 >= 125)) {
444 brw_fs_lower_mul_dword_inst(s, inst, block);
445 inst->remove(block);
446 progress = true;
447 }
448 } else if (inst->opcode == SHADER_OPCODE_MULH) {
449 brw_fs_lower_mulh_inst(s, inst, block);
450 inst->remove(block);
451 progress = true;
452 }
453
454 }
455
456 if (progress)
457 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
458
459 return progress;
460 }
461
462
463