/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
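 *
 * For example (illustrative IR, not exact disassembly), an access to a
 * uniform that had to be demoted to a pull constant:
 *
 *    add(16)  vgrf1, vgrf0, u4
 *
 * becomes roughly:
 *
 *    uniform_pull_constant_load(16) vgrf2, surface, offset, size
 *    add(16)  vgrf1, vgrf0, vgrf2+off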
 */
bool
brw_fs_lower_constant_loads(fs_visitor &s)
{
   unsigned index, pull_index;
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      const fs_builder ibld(&s, block, inst);

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* This case is handled by the MOV_INDIRECT path below. */
         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
            continue;

         if (!s.get_pull_locs(inst->src[i], &index, &pull_index))
            continue;

         assert(inst->src[i].stride == 0);

         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
         const brw_reg dst = ubld.vgrf(BRW_TYPE_UD);
         const unsigned base = pull_index * 4;
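
         /* Worked example (illustrative): pull_index 17 gives base 68; the
          * message below then fetches the 64-byte block at offset 64, and
          * the residual 4 bytes are folded into the VGRF offset afterwards.
          */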
         brw_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
         srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
         srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET]  = brw_imm_ud(base & ~(block_sz - 1));
         srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE]    = brw_imm_ud(block_sz);

         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
                   srcs, PULL_UNIFORM_CONSTANT_SRCS);

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = VGRF;
         inst->src[i].nr = dst.nr;
         inst->src[i].offset = (base & (block_sz - 1)) +
                               inst->src[i].offset % 4;

         progress = true;
      }

      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
          inst->src[0].file == UNIFORM) {

         if (!s.get_pull_locs(inst->src[0], &index, &pull_index))
            continue;

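         /* The indirect offset in src[1] can vary per channel, so emit a
          * varying pull constant load instead of the uniform block load
          * used above.
          */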
         ibld.VARYING_PULL_CONSTANT_LOAD(inst->dst,
                                         brw_imm_ud(index),
                                         brw_reg() /* surface_handle */,
                                         inst->src[1],
                                         pull_index * 4, 4, 1);
         inst->remove(block);

         progress = true;
      }
   }
   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_load_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == VGRF);
      assert(inst->saturate == false);
      brw_reg dst = inst->dst;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all();

      for (uint8_t i = 0; i < inst->header_size;) {
         /* Number of header GRFs to initialize at once with a single MOV
          * instruction.
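          * Two consecutive header sources can be coalesced into a single
          * SIMD16 MOV when the second source is exactly one GRF past the
          * first in the register file.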
          */
         const unsigned n =
            (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
             inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
            2 : 1;

         if (inst->src[i].file != BAD_FILE)
            ubld.group(8 * n, 0).MOV(retype(dst, BRW_TYPE_UD),
                                     retype(inst->src[i], BRW_TYPE_UD));

         dst = byte_offset(dst, n * REG_SIZE);
         i += n;
      }

      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         dst.type = inst->src[i].type;
         if (inst->src[i].file != BAD_FILE) {
            ibld.MOV(dst, inst->src[i]);
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Lower CSEL with unsupported types to CMP+SEL.
 *
 * Or, for unsigned ==/!= comparisons, simply change the types.
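 *
 * E.g. (illustrative IR), a CSEL with an unsupported type
 *
 *    csel.ge(8) dst, src0, src1, src2
 *
 * becomes a null compare followed by a predicated select:
 *
 *    cmp.ge.f0(8) null, src2, 0
 *    (+f0) sel(8) dst, src0, src1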
 */
bool
brw_fs_lower_csel(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != BRW_OPCODE_CSEL)
         continue;

      bool supported = false;
      enum brw_reg_type orig_type = inst->src[2].type;
      enum brw_reg_type new_type = orig_type;

      switch (orig_type) {
      case BRW_TYPE_F:
         /* Gfx9 CSEL can only do F */
         supported = true;
         break;
      case BRW_TYPE_HF:
      case BRW_TYPE_W:
      case BRW_TYPE_D:
         /* Gfx11+ CSEL can do HF, W, and D.  Note that we can't simply
          * retype integer ==/!= comparisons as float on earlier hardware
          * because it breaks for 0x80000000 and 0 (-0.0 == 0.0).
          */
         supported = devinfo->ver >= 11;
         break;
      case BRW_TYPE_UW:
      case BRW_TYPE_UD:
         /* CSEL doesn't support UW/UD but we can simply retype to use the
          * signed types when comparing with == or !=.
          */
         supported = devinfo->ver >= 11 &&
                     (inst->conditional_mod == BRW_CONDITIONAL_EQ ||
                      inst->conditional_mod == BRW_CONDITIONAL_NEQ);

         /* Per Bspec 47408, Gfx12.5+ CSEL supports both the signed and
          * unsigned integer types directly.
          */
         if (devinfo->verx10 < 125) {
            new_type = inst->src[2].type == BRW_TYPE_UD ?
                       BRW_TYPE_D : BRW_TYPE_W;
         }
         break;
      default:
         break;
      }

      if (!supported) {
         const fs_builder ibld(&s, block, inst);

         /* CSEL: dst = src2 <op> 0 ? src0 : src1 */
         brw_reg zero = brw_imm_reg(orig_type);
         ibld.CMP(retype(brw_null_reg(), orig_type),
                  inst->src[2], zero, inst->conditional_mod);

         inst->opcode = BRW_OPCODE_SEL;
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->conditional_mod = BRW_CONDITIONAL_NONE;
         inst->resize_sources(2);
         progress = true;
      } else if (new_type != orig_type) {
         inst->src[0].type = new_type;
         inst->src[1].type = new_type;
         inst->src[2].type = new_type;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_sub_sat(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
         /* The fundamental problem is the hardware performs source negation
          * at the bit width of the source.  If the source is 0x80000000,
          * its negation is also 0x80000000.  As a result, subtractSaturate(0,
          * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
          * are at least three ways to resolve this:
          *
          * 1. Use the accumulator for the negated source.  The accumulator is
          *    33 bits, so our source 0x80000000 is sign-extended to
          *    0x180000000, whose negation is 0x080000000.  This
          *    doesn't help for 64-bit integers (which are already bigger than
          *    33 bits).  There are also only 8 accumulators, so SIMD16 or
          *    SIMD32 instructions would have to be split into multiple SIMD8
          *    instructions.
          *
          * 2. Use slightly different math.  For any n-bit value x, (x >> 1)
          *    can never be the minimum representable value, so negating it is
          *    always safe.  We can use this fact to only do subtractions
          *    involving (x >> 1).  subtractSaturate(a, b) ==
          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
          *
          * 3. For unsigned sources, it is sufficient to replace the
          *    subtractSaturate with (a > b) ? a - b : 0.
          *
          * It may also be possible to use the SUBB instruction.  This
          * implicitly writes the accumulator, so it could only be used in the
          * same situations as #1 above.  It is further limited by only
          * allowing UD sources.
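          *
          * As a worked example of #2 (illustrative), take a = 0 and
          * b = 0x80000000 (INT32_MIN): tmp = b >> 1 = -2^30 and
          * b - tmp = -2^30, so the result is add.sat(add.sat(0, 2^30), 2^30),
          * which saturates to 0x7fffffff as desired -- no intermediate
          * negation ever sees INT32_MIN.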
          */
         if (inst->exec_size == 8 && inst->src[0].type != BRW_TYPE_Q &&
             inst->src[0].type != BRW_TYPE_UQ) {
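            /* Option #1: the MOV sign-extends src1 into the 33-bit
             * accumulator, so negating the ADD's accumulator source below
             * cannot overflow.
             */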
            brw_reg acc = retype(brw_acc_reg(inst->exec_size),
                                 inst->src[1].type);

            ibld.MOV(acc, inst->src[1]);
            fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
            add->saturate = true;
            add->src[0].negate = true;
         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
            /* tmp = src1 >> 1;
             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
             */
            fs_inst *add;

            brw_reg tmp = ibld.vgrf(inst->src[0].type);
            ibld.SHR(tmp, inst->src[1], brw_imm_d(1));

            brw_reg s1_sub_t = ibld.ADD(inst->src[1], negate(tmp));
            brw_reg sat_s0_sub_t = ibld.ADD(inst->src[0], negate(tmp), &add);
            add->saturate = true;

            add = ibld.ADD(inst->dst, sat_s0_sub_t, negate(s1_sub_t));
            add->saturate = true;
         } else {
            /* a > b ? a - b : 0 */
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     BRW_CONDITIONAL_G);

            fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
            add->src[1].negate = !add->src[1].negate;

            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
               ->predicate = BRW_PREDICATE_NORMAL;
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
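 *
 * as opposed to the standard component layout used by the rest of the
 * compiler, which keeps each component's channels contiguous:
 *
 *    rN+0: X[0-7]
 *    rN+1: X[8-15]
 *    rN+2: Y[0-7]
 *    rN+3: Y[8-15]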
 *
 * There is no need to handle SIMD32 here -- this is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
brw_fs_lower_barycentrics(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;

   if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
      return false;

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case BRW_OPCODE_PLN: {
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->src[1].type, 2);
         brw_reg srcs[4];

         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[1], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[1] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->dst.type, 2);

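         /* Let the send write the interleaved reply into a temporary, then
          * deinterleave it into the original destination with predicated
          * MOVs placed after the instruction.
          */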
         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower a derivative instruction as the floating-point difference of two
 * swizzles of the source, specified as \p swz0 and \p swz1.
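 *
 * The result is dst = swz1(src) - swz0(src); e.g. for DDX_COARSE the
 * XXXX/YYYY pair computes channel 1 minus channel 0 of each quad,
 * replicated across the quad.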
 */
static bool
lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
                 unsigned swz0, unsigned swz1)
{
   const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
   const brw_reg tmp0 = ubld.vgrf(inst->src[0].type);
   const brw_reg tmp1 = ubld.vgrf(inst->src[0].type);

   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));

   inst->resize_sources(2);
   inst->src[0] = negate(tmp0);
   inst->src[1] = tmp1;
   inst->opcode = BRW_OPCODE_ADD;

   return true;
}

/**
 * Lower derivative instructions on platforms where codegen cannot implement
 * them efficiently (i.e. XeHP).
 */
bool
brw_fs_lower_derivatives(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->verx10 < 125)
      return false;

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == FS_OPCODE_DDX_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);

      else if (inst->opcode == FS_OPCODE_DDX_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);

      else if (inst->opcode == FS_OPCODE_DDY_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);

      else if (inst->opcode == FS_OPCODE_DDY_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_fs_lower_find_live_channel(fs_visitor &s)
{
   bool progress = false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.prog_data);
   bool vmask =
      s.stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(s.prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_LOAD_LIVE_CHANNELS)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */

      const fs_builder ibld(&s, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);

      brw_reg exec_mask = ubld.vgrf(BRW_TYPE_UD);
      ubld.UNDEF(exec_mask);
      ubld.emit(SHADER_OPCODE_READ_ARCH_REG, exec_mask,
                                             retype(brw_mask_reg(0),
                                                    BRW_TYPE_UD));

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         brw_reg mask = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_ARCH_REG, mask,
                                                retype(brw_sr0_reg(vmask ? 3 : 2),
                                                       BRW_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      switch (inst->opcode) {
      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         ubld.FBL(inst->dst, exec_mask);
         break;

      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
         brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(tmp);
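         /* LZD counts leading zeros, so the index of the most significant
          * set bit -- the last live channel -- is 31 - LZD(mask).
          */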
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
         break;
      }

      case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
         ubld.MOV(inst->dst, exec_mask);
         break;

      default:
         unreachable("Impossible.");
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
brw_fs_lower_sends_overlapping_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         const unsigned arg = inst->mlen < inst->ex_mlen ? 2 : 3;
         const unsigned len = MIN2(inst->mlen, inst->ex_mlen);

         brw_reg tmp = brw_vgrf(s.alloc.allocate(len),
                                BRW_TYPE_UD);

         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0);
         brw_reg copy_src = retype(inst->src[arg], BRW_TYPE_UD);
         brw_reg copy_dst = tmp;
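         /* Each SIMD16 UD MOV copies two GRFs; a trailing odd register is
          * copied with a SIMD8 MOV instead.
          */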
         for (unsigned i = 0; i < len; i += 2) {
            if (len == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         inst->src[arg] = tmp;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Three-source instructions must have a GRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
bool
brw_fs_lower_3src_null_dest(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
         inst->dst = brw_vgrf(s.alloc.allocate(s.dispatch_width / 8),
                              inst->dst.type);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_VARIABLES);

   return progress;
}

static bool
unsupported_64bit_type(const intel_device_info *devinfo,
                       enum brw_reg_type type)
{
   return (!devinfo->has_64bit_float && type == BRW_TYPE_DF) ||
          (!devinfo->has_64bit_int && (type == BRW_TYPE_UQ ||
                                       type == BRW_TYPE_Q));
}

/**
 * Perform lowering to legalize the IR for various ALU restrictions.
 *
 * For example:
 * - Splitting 64-bit MOV/SEL into 2x32-bit where needed
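 *
 * E.g. (illustrative) on hardware without 64-bit integer support, a
 * Q-typed MOV is rewritten as two UD MOVs of the low and high dword
 * halves via subscript().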
 */
bool
brw_fs_lower_alu_restrictions(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs);
            assert(!inst->src[0].negate);
            const brw::fs_builder ibld(&s, block, inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            ibld.MOV(subscript(inst->dst, type, 1),
                     subscript(inst->src[0], type, 1));
            ibld.MOV(subscript(inst->dst, type, 0),
                     subscript(inst->src[0], type, 0));

            inst->remove(block);
            progress = true;
         }
         break;

      case BRW_OPCODE_SEL:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs && !inst->src[0].negate);
            assert(!inst->src[1].abs && !inst->src[1].negate);
            assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
            const brw::fs_builder ibld(&s, block, inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 0),
                                   subscript(inst->src[0], type, 0),
                                   subscript(inst->src[1], type, 0)));
            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 1),
                                   subscript(inst->src[0], type, 1),
                                   subscript(inst->src[1], type, 1)));

            inst->remove(block);
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                            DEPENDENCY_INSTRUCTION_DETAIL);
   }

   return progress;
}

static void
brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst *inst,
                               brw_reg *reg, bool compressed)
{
   if (reg->file != VGRF)
      return;

   struct brw_reg new_reg;

   if (reg->stride == 0) {
      new_reg = brw_vec1_grf(reg->nr, 0);
   } else if (reg->stride > 4) {
      assert(reg != &inst->dst);
      assert(reg->stride * brw_type_size_bytes(reg->type) <= REG_SIZE);
      new_reg = brw_vecn_grf(1, reg->nr, 0);
      new_reg = stride(new_reg, reg->stride, 1, 0);
   } else {
      /* From the Haswell PRM:
       *
       *  "VertStride must be used to cross GRF register boundaries. This
       *   rule implies that elements within a 'Width' cannot cross GRF
       *   boundaries."
       *
       * The maximum width value that could satisfy this restriction is:
       */
      const unsigned reg_width =
         REG_SIZE / (reg->stride * brw_type_size_bytes(reg->type));
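
      /* E.g. (illustrative, assuming 32-byte GRFs): a dword region with
       * stride 2 gives reg_width = 32 / (2 * 4) = 4 elements per GRF.
       */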

      /* Because the hardware can only split source regions at a whole
       * multiple of width during decompression (i.e. vertically), clamp
       * the value obtained above to the physical execution size of a
       * single decompressed chunk of the instruction:
       */
      const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                  inst->exec_size;

      /* XXX - The equation above is strictly speaking not correct on
       *       hardware that supports unbalanced GRF writes -- On Gfx9+
       *       each decompressed chunk of the instruction may have a
       *       different execution size when the number of components
       *       written to each destination GRF is not the same.
       */

      const unsigned max_hw_width = 16;

      const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
      new_reg = brw_vecn_grf(width, reg->nr, 0);
      new_reg = stride(new_reg, width * reg->stride, width, reg->stride);
   }

   new_reg = retype(new_reg, reg->type);
   new_reg = byte_offset(new_reg, reg->offset);
   new_reg.abs = reg->abs;
   new_reg.negate = reg->negate;

   *reg = new_reg;
}

void
brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
{
   assert(s.grf_used || !"Must be called after register allocation");

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed.  For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */

      const bool compressed =
         inst->dst.component_size(inst->exec_size) > REG_SIZE;

      brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->dst, compressed);
      for (int i = 0; i < inst->sources; i++) {
         brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->src[i], compressed);
      }
   }

   s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                         DEPENDENCY_VARIABLES);
}

bool
brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION)
         continue;

      const fs_builder abld =
         fs_builder(&s, block, inst).annotate("SubgroupInvocation");
      const fs_builder ubld8 = abld.group(8, 0).exec_all();
      ubld8.UNDEF(inst->dst);
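
      /* brw_imm_v(0x76543210) is a packed vector immediate holding the
       * eight 4-bit values 7..0, so a single SIMD8 MOV gives each of the
       * first eight channels its own invocation index.
       */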
      if (inst->exec_size == 8) {
         assert(inst->dst.type == BRW_TYPE_UD);
         brw_reg uw = retype(inst->dst, BRW_TYPE_UW);
         ubld8.MOV(uw, brw_imm_v(0x76543210));
         ubld8.MOV(inst->dst, uw);
      } else {
         assert(inst->dst.type == BRW_TYPE_UW);
         ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
         ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
         if (inst->exec_size > 16) {
            const fs_builder ubld16 = abld.group(16, 0).exec_all();
            ubld16.ADD(byte_offset(inst->dst, 32), inst->dst, brw_imm_uw(16u));
         }
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_fs_lower_indirect_mov(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->ver < 20)
      return progress;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT) {
         if (brw_type_size_bytes(inst->src[0].type) > 1 &&
             brw_type_size_bytes(inst->dst.type) > 1) {
            continue;
         }

         assert(brw_type_size_bytes(inst->src[0].type) ==
                brw_type_size_bytes(inst->dst.type));

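         /* Byte-sized indirect addressing isn't supported, so emulate it:
          * perform the MOV_INDIRECT on word-aligned UW data, then pick the
          * high or low byte of each word based on the original offset's
          * parity.
          */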
         const fs_builder ibld(&s, block, inst);

         /* Extract unaligned part */
         uint16_t extra_offset = inst->src[0].offset & 0x1;
         brw_reg offset = ibld.ADD(inst->src[1], brw_imm_uw(extra_offset));

         /* Check if offset is odd or even so that we can choose either high or
          * low byte from the result.
          */
         brw_reg is_odd = ibld.AND(offset, brw_imm_ud(1));

         /* Make sure offset is word (2-byte) aligned */
         offset = ibld.AND(offset, brw_imm_uw(~1));

         /* Indirect addressing (Vx1 and VxH) is not supported with UB/B
          * datatypes for Src0, so change the data type for src0 and dst
          * to UW.
          */
         brw_reg dst = ibld.vgrf(BRW_TYPE_UW);

         /* Subtract the unaligned part from the src0 offset since we
          * already accounted for it in the indirect byte offset.
          */
         brw_reg start = retype(inst->src[0], BRW_TYPE_UW);
         start.offset &= ~extra_offset;

         /* Adjust the length to account for the extra offset. */
         assert(inst->src[2].file == IMM);
         brw_reg length = brw_imm_ud(inst->src[2].ud + extra_offset);

         ibld.emit(SHADER_OPCODE_MOV_INDIRECT, dst, start, offset, length);

         /* Select the high byte if the offset is odd, otherwise the low byte. */
         brw_reg lo = ibld.AND(dst, brw_imm_uw(0xff));
         brw_reg hi = ibld.SHR(dst, brw_imm_uw(8));
         brw_reg result = ibld.vgrf(BRW_TYPE_UW);
         ibld.CSEL(result, hi, lo, is_odd, BRW_CONDITIONAL_NZ);

         /* An extra MOV is needed here to convert back to the corresponding B type. */
         ibld.MOV(inst->dst, result);

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
896