/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
bool
brw_fs_lower_constant_loads(fs_visitor &s)
{
   unsigned index, pull_index;
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      /* Set up the annotation tracking for newly generated instructions. */
      const fs_builder ibld(&s, block, inst);

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* We'll handle this case later */
         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
            continue;

         if (!s.get_pull_locs(inst->src[i], &index, &pull_index))
            continue;

         assert(inst->src[i].stride == 0);

         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
         const brw_reg dst = ubld.vgrf(BRW_TYPE_UD);
         const unsigned base = pull_index * 4;

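         /* Round the pull offset down to the start of the 64-byte block we
          * fetch; the remainder within the block is added back to the
          * register offset of the rewritten source below.
          */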
         brw_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
         srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
         srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
         srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);

         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
                   srcs, PULL_UNIFORM_CONSTANT_SRCS);

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = VGRF;
         inst->src[i].nr = dst.nr;
         inst->src[i].offset = (base & (block_sz - 1)) +
                               inst->src[i].offset % 4;

         progress = true;
      }

      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
          inst->src[0].file == UNIFORM) {
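         /* Indirectly addressed uniforms are loaded with a per-channel
          * VARYING_PULL_CONSTANT_LOAD using the indirect byte offset in
          * src[1], replacing the MOV_INDIRECT entirely.
          */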
         if (!s.get_pull_locs(inst->src[0], &index, &pull_index))
            continue;

         ibld.VARYING_PULL_CONSTANT_LOAD(inst->dst,
                                         brw_imm_ud(index),
                                         brw_reg() /* surface_handle */,
                                         inst->src[1],
                                         pull_index * 4, 4, 1);
         inst->remove(block);

         progress = true;
      }
   }
   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_load_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == VGRF);
      assert(inst->saturate == false);
      brw_reg dst = inst->dst;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all();

      for (uint8_t i = 0; i < inst->header_size;) {
         /* Number of header GRFs to initialize at once with a single MOV
          * instruction.
          */
         const unsigned n =
            (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
             inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
            2 : 1;

         if (inst->src[i].file != BAD_FILE)
            ubld.group(8 * n, 0).MOV(retype(dst, BRW_TYPE_UD),
                                     retype(inst->src[i], BRW_TYPE_UD));

         dst = byte_offset(dst, n * REG_SIZE);
         i += n;
      }

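      /* Copy the remaining non-header sources one logical component at a
       * time using the instruction's original execution size and channel
       * group.
       */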
      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         dst.type = inst->src[i].type;
         if (inst->src[i].file != BAD_FILE) {
            ibld.MOV(dst, inst->src[i]);
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Lower CSEL with unsupported types to CMP+SEL.
 *
 * Or, for unsigned ==/!= comparisons, simply change the types.
 */
bool
brw_fs_lower_csel(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != BRW_OPCODE_CSEL)
         continue;

      bool supported = false;
      enum brw_reg_type orig_type = inst->src[2].type;
      enum brw_reg_type new_type = orig_type;

      switch (orig_type) {
      case BRW_TYPE_F:
         /* Gfx9 CSEL can only do F */
         supported = true;
         break;
      case BRW_TYPE_HF:
      case BRW_TYPE_W:
      case BRW_TYPE_D:
         /* Gfx11+ CSEL can do HF, W, and D.  Note that we can't simply
          * retype integer ==/!= comparisons as float on earlier hardware
          * because it breaks for 0x80000000 and 0 (-0.0 == 0.0).
          */
         supported = devinfo->ver >= 11;
         break;
      case BRW_TYPE_UW:
      case BRW_TYPE_UD:
         /* CSEL doesn't support UW/UD but we can simply retype to use the
          * signed types when comparing with == or !=.
          */
         supported = devinfo->ver >= 11 &&
                     (inst->conditional_mod == BRW_CONDITIONAL_EQ ||
                      inst->conditional_mod == BRW_CONDITIONAL_NEQ);

         /* Per Bspec 47408, Gfx12.5+ CSEL supports both signed and unsigned
          * integer types, so no retyping is needed there.
          */
         if (devinfo->verx10 < 125) {
            new_type = inst->src[2].type == BRW_TYPE_UD ?
                       BRW_TYPE_D : BRW_TYPE_W;
         }
         break;
      default:
         break;
      }

      if (!supported) {
         const fs_builder ibld(&s, block, inst);

         /* CSEL: dst = src2 <op> 0 ? src0 : src1 */
         brw_reg zero = brw_imm_reg(orig_type);
         ibld.CMP(retype(brw_null_reg(), orig_type),
                  inst->src[2], zero, inst->conditional_mod);

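         /* Turn the CSEL into a SEL predicated on the flag written by the
          * CMP above; src[2] is no longer needed once the comparison lives
          * in the flag register.
          */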
         inst->opcode = BRW_OPCODE_SEL;
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->conditional_mod = BRW_CONDITIONAL_NONE;
         inst->resize_sources(2);
         progress = true;
      } else if (new_type != orig_type) {
         inst->src[0].type = new_type;
         inst->src[1].type = new_type;
         inst->src[2].type = new_type;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_sub_sat(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
         /* The fundamental problem is the hardware performs source negation
          * at the bit width of the source.  If the source is 0x80000000D, the
          * negation is 0x80000000D.  As a result, subtractSaturate(0,
          * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
          * are at least three ways to resolve this:
          *
          * 1. Use the accumulator for the negated source.  The accumulator is
          *    33 bits, so our source 0x80000000 is sign-extended to
          *    0x180000000.  The negation of which is 0x080000000.  This
          *    doesn't help for 64-bit integers (which are already bigger than
          *    33 bits).  There are also only 8 accumulators, so SIMD16 or
          *    SIMD32 instructions would have to be split into multiple SIMD8
          *    instructions.
          *
          * 2. Use slightly different math.  For any n-bit value x, we know (x
          *    >> 1) != -(x >> 1).  We can use this fact to only do
          *    subtractions involving (x >> 1).  subtractSaturate(a, b) ==
          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
          *
          * 3. For unsigned sources, it is sufficient to replace the
          *    subtractSaturate with (a > b) ? a - b : 0.
          *
          * It may also be possible to use the SUBB instruction.  This
          * implicitly writes the accumulator, so it could only be used in the
          * same situations as #1 above.  It is further limited by only
          * allowing UD sources.
          */
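         /* Method #1 from the comment above: negate src1 through the 33-bit
          * accumulator.  Only possible for SIMD8, non-64-bit instructions.
          */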
         if (inst->exec_size == 8 && inst->src[0].type != BRW_TYPE_Q &&
             inst->src[0].type != BRW_TYPE_UQ) {
            brw_reg acc = retype(brw_acc_reg(inst->exec_size),
                                 inst->src[1].type);

            ibld.MOV(acc, inst->src[1]);
            fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
            add->saturate = true;
            add->src[0].negate = true;
         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
            /* tmp = src1 >> 1;
             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
             */
            fs_inst *add;

            brw_reg tmp = ibld.vgrf(inst->src[0].type);
            ibld.SHR(tmp, inst->src[1], brw_imm_d(1));

            brw_reg s1_sub_t = ibld.ADD(inst->src[1], negate(tmp));
            brw_reg sat_s0_sub_t = ibld.ADD(inst->src[0], negate(tmp), &add);
            add->saturate = true;

            add = ibld.ADD(inst->dst, sat_s0_sub_t, negate(s1_sub_t));
            add->saturate = true;
         } else {
            /* a > b ? a - b : 0 */
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     BRW_CONDITIONAL_G);

            fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
            add->src[1].negate = !add->src[1].negate;

            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
               ->predicate = BRW_PREDICATE_NORMAL;
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- This is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
brw_fs_lower_barycentrics(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;

   if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
      return false;

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case BRW_OPCODE_PLN: {
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->src[1].type, 2);
         brw_reg srcs[4];

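         /* Gather X[0-7], Y[0-7], X[8-15], Y[8-15] from the standard vector
          * layout into the interleaved order the PLN payload expects.
          */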
         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[1], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[1] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->dst.type, 2);

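         /* The PI shared function returns the interleaved layout.  Emit MOVs
          * after the instruction to shuffle its result back into the standard
          * vector layout the rest of the shader expects.
          */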
         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower a derivative instruction as the floating-point difference of two
 * swizzles of the source, specified as \p swz0 and \p swz1.
 */
static bool
lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
                 unsigned swz0, unsigned swz1)
{
   const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
   const brw_reg tmp0 = ubld.vgrf(inst->src[0].type);
   const brw_reg tmp1 = ubld.vgrf(inst->src[0].type);

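   /* Broadcast the two quad swizzles of the source into temporaries and
    * rewrite the instruction as tmp1 - tmp0 by negating the first ADD
    * source.
    */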
   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));

   inst->resize_sources(2);
   inst->src[0] = negate(tmp0);
   inst->src[1] = tmp1;
   inst->opcode = BRW_OPCODE_ADD;

   return true;
}

/**
 * Lower derivative instructions on platforms where codegen cannot implement
 * them efficiently (i.e. XeHP and later).
 */
bool
brw_fs_lower_derivatives(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->verx10 < 125)
      return false;

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == FS_OPCODE_DDX_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);

      else if (inst->opcode == FS_OPCODE_DDX_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);

      else if (inst->opcode == FS_OPCODE_DDY_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);

      else if (inst->opcode == FS_OPCODE_DDY_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_fs_lower_find_live_channel(fs_visitor &s)
{
   bool progress = false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.prog_data);
   bool vmask =
      s.stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(s.prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_LOAD_LIVE_CHANNELS)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */

      const fs_builder ibld(&s, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);

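      /* Read the hardware execution mask into a scalar temporary so it can
       * be combined with the dispatch mask below.
       */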
      brw_reg exec_mask = ubld.vgrf(BRW_TYPE_UD);
      ubld.UNDEF(exec_mask);
      ubld.emit(SHADER_OPCODE_READ_ARCH_REG, exec_mask,
                retype(brw_mask_reg(0),
                       BRW_TYPE_UD));

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         brw_reg mask = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_ARCH_REG, mask,
                   retype(brw_sr0_reg(vmask ? 3 : 2),
                          BRW_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      switch (inst->opcode) {
      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         ubld.FBL(inst->dst, exec_mask);
         break;

      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
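         /* LZD counts leading zeros, so the index of the last live channel
          * is 31 - LZD(exec_mask), computed here as -LZD(exec_mask) + 31.
          */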
         brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
         break;
      }

      case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
         ubld.MOV(inst->dst, exec_mask);
         break;

      default:
         unreachable("Impossible.");
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
brw_fs_lower_sends_overlapping_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         const unsigned arg = inst->mlen < inst->ex_mlen ? 2 : 3;
         const unsigned len = MIN2(inst->mlen, inst->ex_mlen);

         brw_reg tmp = brw_vgrf(s.alloc.allocate(len),
                                BRW_TYPE_UD);

         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0);
         brw_reg copy_src = retype(inst->src[arg], BRW_TYPE_UD);
         brw_reg copy_dst = tmp;
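         /* Copy the payload two registers at a time with a SIMD16 MOV of
          * dwords; if an odd register remains, finish with a single SIMD8
          * MOV.
          */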
         for (unsigned i = 0; i < len; i += 2) {
            if (len == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         inst->src[arg] = tmp;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Three-source instructions must have a GRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
bool
brw_fs_lower_3src_null_dest(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
         inst->dst = brw_vgrf(s.alloc.allocate(s.dispatch_width / 8),
                              inst->dst.type);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_VARIABLES);

   return progress;
}

static bool
unsupported_64bit_type(const intel_device_info *devinfo,
                       enum brw_reg_type type)
{
   return (!devinfo->has_64bit_float && type == BRW_TYPE_DF) ||
          (!devinfo->has_64bit_int && (type == BRW_TYPE_UQ ||
                                       type == BRW_TYPE_Q));
}

/**
 * Perform lowering to legalize the IR for various ALU restrictions.
 *
 * For example:
 *   - Splitting 64-bit MOV/SEL into 2x32-bit where needed
 */
bool
brw_fs_lower_alu_restrictions(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs);
            assert(!inst->src[0].negate);
            const brw::fs_builder ibld(&s, block, inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

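            /* Split the 64-bit MOV into two 32-bit MOVs, one for the high
             * dwords and one for the low dwords of each channel.
             */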
            ibld.MOV(subscript(inst->dst, type, 1),
                     subscript(inst->src[0], type, 1));
            ibld.MOV(subscript(inst->dst, type, 0),
                     subscript(inst->src[0], type, 0));

            inst->remove(block);
            progress = true;
         }
         break;

      case BRW_OPCODE_SEL:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs && !inst->src[0].negate);
            assert(!inst->src[1].abs && !inst->src[1].negate);
            assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
            const brw::fs_builder ibld(&s, block, inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

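            /* Likewise split the 64-bit SEL into two predicated 32-bit SELs
             * operating on the low and high halves.
             */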
            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 0),
                                   subscript(inst->src[0], type, 0),
                                   subscript(inst->src[1], type, 0)));
            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 1),
                                   subscript(inst->src[0], type, 1),
                                   subscript(inst->src[1], type, 1)));

            inst->remove(block);
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                            DEPENDENCY_INSTRUCTION_DETAIL);
   }

   return progress;
}

static void
brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst *inst,
                               brw_reg *reg, bool compressed)
{
   if (reg->file != VGRF)
      return;

   struct brw_reg new_reg;

   if (reg->stride == 0) {
      new_reg = brw_vec1_grf(reg->nr, 0);
   } else if (reg->stride > 4) {
      assert(reg != &inst->dst);
      assert(reg->stride * brw_type_size_bytes(reg->type) <= REG_SIZE);
      new_reg = brw_vecn_grf(1, reg->nr, 0);
      new_reg = stride(new_reg, reg->stride, 1, 0);
   } else {
      /* From the Haswell PRM:
       *
       *  "VertStride must be used to cross GRF register boundaries. This
       *   rule implies that elements within a 'Width' cannot cross GRF
       *   boundaries."
       *
       * The maximum width value that could satisfy this restriction is:
       */
      const unsigned reg_width =
         REG_SIZE / (reg->stride * brw_type_size_bytes(reg->type));

      /* Because the hardware can only split source regions at a whole
       * multiple of width during decompression (i.e. vertically), clamp
       * the value obtained above to the physical execution size of a
       * single decompressed chunk of the instruction:
       */
      const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                  inst->exec_size;

      /* XXX - The equation above is strictly speaking not correct on
       *       hardware that supports unbalanced GRF writes -- On Gfx9+
       *       each decompressed chunk of the instruction may have a
       *       different execution size when the number of components
       *       written to each destination GRF is not the same.
       */

      const unsigned max_hw_width = 16;

      const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
      new_reg = brw_vecn_grf(width, reg->nr, 0);
      new_reg = stride(new_reg, width * reg->stride, width, reg->stride);
   }

   new_reg = retype(new_reg, reg->type);
   new_reg = byte_offset(new_reg, reg->offset);
   new_reg.abs = reg->abs;
   new_reg.negate = reg->negate;

   *reg = new_reg;
}

void
brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
{
   assert(s.grf_used || !"Must be called after register allocation");

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed.  For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */

      const bool compressed =
         inst->dst.component_size(inst->exec_size) > REG_SIZE;

      brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->dst, compressed);
      for (int i = 0; i < inst->sources; i++) {
         brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->src[i], compressed);
      }
   }

   s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                         DEPENDENCY_VARIABLES);
}

bool
brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION)
         continue;

      const fs_builder abld =
         fs_builder(&s, block, inst).annotate("SubgroupInvocation");
      const fs_builder ubld8 = abld.group(8, 0).exec_all();
      ubld8.UNDEF(inst->dst);

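      /* brw_imm_v(0x76543210) is a packed 4-bit vector immediate that
       * expands to 0, 1, ..., 7 across the first eight channels, giving each
       * channel its invocation index; wider dispatches extend it with the
       * ADDs below.
       */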
      if (inst->exec_size == 8) {
         assert(inst->dst.type == BRW_TYPE_UD);
         brw_reg uw = retype(inst->dst, BRW_TYPE_UW);
         ubld8.MOV(uw, brw_imm_v(0x76543210));
         ubld8.MOV(inst->dst, uw);
      } else {
         assert(inst->dst.type == BRW_TYPE_UW);
         ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
         ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
         if (inst->exec_size > 16) {
            const fs_builder ubld16 = abld.group(16, 0).exec_all();
            ubld16.ADD(byte_offset(inst->dst, 32), inst->dst, brw_imm_uw(16u));
         }
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_fs_lower_indirect_mov(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->ver < 20)
      return progress;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT) {
         if (brw_type_size_bytes(inst->src[0].type) > 1 &&
             brw_type_size_bytes(inst->dst.type) > 1) {
            continue;
         }

         assert(brw_type_size_bytes(inst->src[0].type) ==
                brw_type_size_bytes(inst->dst.type));

         const fs_builder ibld(&s, block, inst);

         /* Extract the unaligned part of the source offset. */
         uint16_t extra_offset = inst->src[0].offset & 0x1;
         brw_reg offset = ibld.ADD(inst->src[1], brw_imm_uw(extra_offset));

         /* Check whether the offset is odd or even so that we can choose
          * either the high or the low byte from the result.
          */
         brw_reg is_odd = ibld.AND(offset, brw_imm_ud(1));

         /* Make sure the offset is word (2-byte) aligned. */
         offset = ibld.AND(offset, brw_imm_uw(~1));

         /* Indirect addressing (Vx1 and VxH) is not supported with a UB/B
          * datatype for Src0, so change the data type for src0 and dst to UW.
          */
         brw_reg dst = ibld.vgrf(BRW_TYPE_UW);

         /* Subtract the unaligned offset from the src0 offset since we
          * already accounted for the unaligned part in the indirect byte
          * offset.
          */
         brw_reg start = retype(inst->src[0], BRW_TYPE_UW);
         start.offset &= ~extra_offset;

         /* Adjust the length to account for the extra offset. */
         assert(inst->src[2].file == IMM);
         brw_reg length = brw_imm_ud(inst->src[2].ud + extra_offset);

         ibld.emit(SHADER_OPCODE_MOV_INDIRECT, dst, start, offset, length);

         /* Select the high byte if the offset is odd, otherwise select the
          * low byte.
          */
         brw_reg lo = ibld.AND(dst, brw_imm_uw(0xff));
         brw_reg hi = ibld.SHR(dst, brw_imm_uw(8));
         brw_reg result = ibld.vgrf(BRW_TYPE_UW);
         ibld.CSEL(result, hi, lo, is_odd, BRW_CONDITIONAL_NZ);

         /* An extra MOV is needed here to convert back to the corresponding
          * B type.
          */
         ibld.MOV(inst->dst, result);

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}