/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

void
brw_fs_optimize(fs_visitor &s)
{
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   brw_fs_validate(s);

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

#define OPT(pass, ...) ({                                               \
      pass_num++;                                                       \
      bool this_progress = pass(s, ##__VA_ARGS__);                      \
                                                                        \
      if (this_progress)                                                \
         s.debug_optimizer(nir, #pass, iteration, pass_num);            \
                                                                        \
      brw_fs_validate(s);                                               \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })
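
/* Each OPT() invocation runs one pass, dumps the IR for debugging when the
 * pass reports progress, re-validates the shader, and accumulates progress
 * so the main loop below knows whether to keep iterating.  The expression
 * also evaluates to the individual pass's progress, so it can be used
 * directly in conditions, e.g.:
 *
 *    if (OPT(brw_fs_lower_pack)) { ... }
 */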

   s.assign_constant_locations();
   OPT(brw_fs_lower_constant_loads);

   if (s.compiler->lower_dpas)
      OPT(brw_fs_lower_dpas);

   OPT(brw_fs_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice: once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_fs_opt_dead_code_eliminate);

   OPT(brw_fs_opt_remove_extra_rounding_modes);

   OPT(brw_fs_opt_eliminate_find_live_channel);

   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_cse_defs);
      if (!OPT(brw_fs_opt_copy_propagation_defs))
         OPT(brw_fs_opt_copy_propagation);
      OPT(brw_fs_opt_cmod_propagation);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_saturate_propagation);
      OPT(brw_fs_opt_register_coalesce);

      OPT(brw_fs_opt_compact_virtual_grfs);
   } while (progress);

   progress = false;
   pass_num = 0;

   if (OPT(brw_fs_lower_pack)) {
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_lower_csel);
   OPT(brw_fs_lower_simd_width);
   OPT(brw_fs_lower_barycentrics);
   OPT(brw_fs_lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(brw_fs_opt_copy_propagation_defs) || OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   /* Identify trailing zeros in LOAD_PAYLOADs of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_fs_opt_zero_samples) && (OPT(brw_fs_opt_copy_propagation_defs) || OPT(brw_fs_opt_copy_propagation)))
      OPT(brw_fs_opt_algebraic);

   OPT(brw_fs_opt_split_sends);
   OPT(brw_fs_workaround_nomask_control_flow);

   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation_defs) || OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_fs_opt_cse_defs);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_opt_remove_redundant_halts);

   if (OPT(brw_fs_lower_load_payload)) {
      OPT(brw_fs_opt_split_virtual_grfs);

      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_lower_simd_width);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_lower_alu_restrictions);

   OPT(brw_fs_opt_combine_constants);
   if (OPT(brw_fs_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_fs_lower_integer_multiplication);
   }
   OPT(brw_fs_lower_sub_sat);

   progress = false;
   OPT(brw_fs_lower_derivatives);
   OPT(brw_fs_lower_regioning);
   if (progress) {
      /* Try both copy propagation passes.  The defs one will likely not be
       * able to handle everything at this point.
       */
      const bool cp1 = OPT(brw_fs_opt_copy_propagation_defs);
      const bool cp2 = OPT(brw_fs_opt_copy_propagation);
      if (cp1 || cp2) {
         OPT(brw_fs_opt_algebraic);
         OPT(brw_fs_opt_combine_constants);
      }
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_lower_simd_width);
   }

   OPT(brw_fs_lower_sends_overlapping_payload);

   OPT(brw_fs_lower_uniform_pull_constant_loads);

   OPT(brw_fs_lower_indirect_mov);

   OPT(brw_fs_lower_find_live_channel);

   OPT(brw_fs_lower_load_subgroup_invocation);
}

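/* Return the number of LOAD_PAYLOAD sources covered by the first size_read
 * bytes of the payload.  The header occupies header_size full registers, and
 * each subsequent source contributes exec_size * brw_type_size_bytes(type)
 * bytes.  As a rough illustration (assuming REG_SIZE == 32): for a SIMD8
 * payload with one header register and 32-bit sources, size_read == 96
 * covers the header (32 bytes) plus two sources (2 * 8 * 4 bytes), so this
 * returns 3.
 */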
static unsigned
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
{
   assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
   assert(size_read >= lp->header_size * REG_SIZE);

   unsigned i;
   unsigned size = lp->header_size * REG_SIZE;
   for (i = lp->header_size; size < size_read && i < lp->sources; i++)
      size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);

   /* The size read must end exactly on a source boundary. */
   assert(size == size_read);
   return i;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters.  We can just reduce the message length for these instructions
 * instead of reserving registers for the zero values.  Trailing parameters
 * that aren't sent default to zero anyway.  This will cause the dead code
 * eliminator to remove the MOV instructions that would otherwise be emitted
 * to set up the zero values.
 */

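/* Schematically (not actual IR syntax; a hypothetical SIMD8 sampler message
 * whose last parameter is zero):
 *
 *    mov(8)           vgrf7, 0f
 *    load_payload(8)  vgrf8, <header>, <x>, <y>, vgrf7
 *    send(8)          ..., mlen: 4
 *
 * The trailing zero parameter need not be sent, so mlen becomes 3 and the
 * MOV feeding it is left for dead code elimination.
 */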
bool
brw_fs_opt_zero_samples(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload is actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message, which
       *      has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
      }

      /* Round down so that only full registers are considered. */
      const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
      if (zero_len > 0) {
         /* Note mlen is in REG_SIZE units. */
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try to split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains an x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
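/* Schematically (not actual IR syntax), a payload whose coordinates already
 * live in one VGRF but whose LOD comes from another:
 *
 *    load_payload(8)  vgrf9, vgrf4+0, vgrf4+1, vgrf4+2, vgrf7
 *    send(8)          ..., src[2]: vgrf9, mlen: 4
 *
 * could hypothetically be split at the VGRF boundary into
 *
 *    load_payload(8)  vgrf10, vgrf4+0, vgrf4+1, vgrf4+2
 *    load_payload(8)  vgrf11, vgrf7
 *    send(8)          ..., src[2]: vgrf10, src[3]: vgrf11, mlen: 3, ex_mlen: 1
 */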
bool
brw_fs_opt_split_sends(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
          send->src[2].file != VGRF)
         continue;

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
       * find out how many sources from the payload it really needs.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      const fs_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      lp1->dst = brw_vgrf(s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = brw_vgrf(s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 * halt        (redundant with the next halt)
 * halt        (useless; jumps to the next instruction)
 * halt-target
 */
bool
brw_fs_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
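/* Schematically (not actual IR syntax): outside control flow, channel 0 is
 * known to be live under packed dispatch, so a pair like
 *
 *    find_live_channel(8)  vgrf3
 *    broadcast(8)          vgrf4, vgrf2, vgrf3
 *
 * can hypothetically be rewritten as two force_writemask_all MOVs:
 *
 *    mov(8)  vgrf3, 0u
 *    mov(8)  vgrf4, vgrf2<component 0>
 */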
bool
brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
{
   bool progress = false;
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;

            /* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
             * with a BROADCAST.  Save some work for opt_copy_propagation
             * and opt_algebraic by trivially cleaning up both together.
             */
            assert(!inst->next->is_tail_sentinel());
            fs_inst *bcast = (fs_inst *) inst->next;

            /* Ignore stride when comparing */
            if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
                inst->dst.file == VGRF &&
                inst->dst.file == bcast->src[1].file &&
                inst->dst.nr == bcast->src[1].nr &&
                inst->dst.offset == bcast->src[1].offset) {
               bcast->opcode = BRW_OPCODE_MOV;
               if (!is_uniform(bcast->src[0]))
                  bcast->src[0] = component(bcast->src[0], 0);
               bcast->sources = 1;
               bcast->force_writemask_all = true;
            }
         }
         break;

      default:
         break;
      }
   }

out:
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Rounding modes for conversion instructions are specified per conversion,
 * but the hardware treats the rounding mode as state, so once it is set we
 * don't need to set it again for subsequent conversions.
 *
 * This is useful for vector/matrix conversions, as setting the mode once
 * is enough for the full vector/matrix.
 */
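/* Schematically (not actual IR syntax), converting a two-component vector
 * might emit
 *
 *    rnd_mode  RTNE
 *    mov(8)    vgrf5, vgrf2     <- f32 -> f16 of component x
 *    rnd_mode  RTNE             <- redundant, removed by this pass
 *    mov(8)    vgrf6, vgrf3     <- f32 -> f16 of component y
 */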
bool
brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s)
{
   bool progress = false;
   unsigned execution_mode = s.nir->info.float_controls_execution_mode;

   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTNE;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTZ;

   foreach_block (block, s.cfg) {
      brw_rnd_mode prev_mode = base_mode;

      foreach_inst_in_block_safe (fs_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == IMM);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}