/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

void
brw_fs_optimize(fs_visitor &s)
{
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   brw_fs_validate(s);

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

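   /* Helper around each optimization pass: run the pass, dump the optimizer
    * debug output when the pass made progress, re-validate the shader, and
    * accumulate the overall progress flag. Evaluates to whether this
    * particular pass made progress.
    */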
#define OPT(pass, ...) ({                                            \
      pass_num++;                                                    \
      bool this_progress = pass(s, ##__VA_ARGS__);                   \
                                                                     \
      if (this_progress)                                             \
         s.debug_optimizer(nir, #pass, iteration, pass_num);         \
                                                                     \
      brw_fs_validate(s);                                            \
                                                                     \
      progress = progress || this_progress;                          \
      this_progress;                                                 \
   })

   s.assign_constant_locations();
   OPT(brw_fs_lower_constant_loads);

   if (s.compiler->lower_dpas)
      OPT(brw_fs_lower_dpas);

   OPT(brw_fs_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code. The results of some NIR
    * instructions may effectively be calculated twice: once when the
    * instruction is encountered, and again when the user of that result is
    * encountered. Wipe those duplicates away before algebraic optimizations
    * and especially copy propagation can mix things up.
    */
   OPT(brw_fs_opt_dead_code_eliminate);

   OPT(brw_fs_opt_remove_extra_rounding_modes);

   OPT(brw_fs_opt_eliminate_find_live_channel);

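   /* Core scalar optimization loop: keep re-running these passes until none
    * of them makes any further progress.
    */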
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_cse_defs);
      if (!OPT(brw_fs_opt_copy_propagation_defs))
         OPT(brw_fs_opt_copy_propagation);
      OPT(brw_fs_opt_cmod_propagation);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_saturate_propagation);
      OPT(brw_fs_opt_register_coalesce);

      OPT(brw_fs_opt_compact_virtual_grfs);
   } while (progress);

   progress = false;
   pass_num = 0;

   if (OPT(brw_fs_lower_pack)) {
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_lower_csel);
   OPT(brw_fs_lower_simd_width);
   OPT(brw_fs_lower_barycentrics);
   OPT(brw_fs_lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(brw_fs_opt_copy_propagation_defs) || OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   /* Identify trailing zeros in the LOAD_PAYLOADs of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_fs_opt_zero_samples) && (OPT(brw_fs_opt_copy_propagation_defs) || OPT(brw_fs_opt_copy_propagation)))
      OPT(brw_fs_opt_algebraic);

   OPT(brw_fs_opt_split_sends);
   OPT(brw_fs_workaround_nomask_control_flow);

   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation_defs) || OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_fs_opt_cse_defs);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_opt_remove_redundant_halts);

   if (OPT(brw_fs_lower_load_payload)) {
      OPT(brw_fs_opt_split_virtual_grfs);

      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_lower_simd_width);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_lower_alu_restrictions);

   OPT(brw_fs_opt_combine_constants);
   if (OPT(brw_fs_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_fs_lower_integer_multiplication);
   }
   OPT(brw_fs_lower_sub_sat);

   progress = false;
   OPT(brw_fs_lower_derivatives);
   OPT(brw_fs_lower_regioning);
   if (progress) {
      /* Try both copy propagation passes. The defs one will likely not be
       * able to handle everything at this point.
       */
      const bool cp1 = OPT(brw_fs_opt_copy_propagation_defs);
      const bool cp2 = OPT(brw_fs_opt_copy_propagation);
      if (cp1 || cp2) {
         OPT(brw_fs_opt_algebraic);
         OPT(brw_fs_opt_combine_constants);
      }
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_lower_simd_width);
   }

   OPT(brw_fs_lower_sends_overlapping_payload);

   OPT(brw_fs_lower_uniform_pull_constant_loads);

   OPT(brw_fs_lower_indirect_mov);

   OPT(brw_fs_lower_find_live_channel);

   OPT(brw_fs_lower_load_subgroup_invocation);
}

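/**
 * Return the number of LOAD_PAYLOAD sources (including the header sources)
 * that together account for exactly size_read bytes of the payload.
 */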
static unsigned
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
{
   assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
   assert(size_read >= lp->header_size * REG_SIZE);

   unsigned i;
   unsigned size = lp->header_size * REG_SIZE;
   for (i = lp->header_size; size < size_read && i < lp->sources; i++)
      size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);

   /* size_read must end exactly on a source boundary. */
   assert(size == size_read);
   return i;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these instructions
 * instead of reserving registers for them. Trailing parameters that aren't
 * sent default to zero anyway. This will cause the dead code eliminator to
 * remove the MOV instructions that would otherwise be emitted to set up the
 * zero values.
 */

bool
brw_fs_opt_zero_samples(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

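      /* The instruction immediately preceding the SEND is expected to be the
       * LOAD_PAYLOAD that builds its payload; bail if it isn't.
       */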
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload is actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message, which
       *      has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
      }

      /* Round down to ensure to only consider full registers. */
      const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
      if (zero_len > 0) {
         /* Note mlen is in REG_SIZE units. */
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated. If we find a SEND message with a single payload,
 * we can split that payload in two. This results in smaller contiguous
 * register blocks for us to allocate. But it can help beyond that, too.
 *
 * We try to split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains an x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
 * or array index, which comes from elsewhere. In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF. So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
bool
brw_fs_opt_split_sends(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
          send->src[2].file != VGRF)
         continue;

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
       * find out how many sources from the payload it really needs.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      const fs_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      lp1->dst = brw_vgrf(s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = brw_vgrf(s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

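      /* Rewire the SEND to consume the two new payloads: src[2] becomes the
       * primary payload and src[3] the extended payload, with mlen/ex_mlen
       * split accordingly.
       */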
      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 *    halt        (redundant with the next halt)
 *    halt        (useless; jumps to the next instruction)
 *    halt-target
 */
bool
brw_fs_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
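   /* Find the HALT target and count the HALT instructions that appear
    * before it.
    */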
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow. We could probably do better here with some form of divergence
 * analysis.
 */
bool
brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
{
   bool progress = false;
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

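   /* Track control-flow nesting depth as we walk the shader; a
    * FIND_LIVE_CHANNEL can only be replaced with channel 0 when it is not
    * nested inside any control flow.
    */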
   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;

            /* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
             * with a BROADCAST. Save some work for opt_copy_propagation
             * and opt_algebraic by trivially cleaning up both together.
             */
            assert(!inst->next->is_tail_sentinel());
            fs_inst *bcast = (fs_inst *) inst->next;

            /* Ignore stride when comparing */
            if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
                inst->dst.file == VGRF &&
                inst->dst.file == bcast->src[1].file &&
                inst->dst.nr == bcast->src[1].nr &&
                inst->dst.offset == bcast->src[1].offset) {
               bcast->opcode = BRW_OPCODE_MOV;
               if (!is_uniform(bcast->src[0]))
                  bcast->src[0] = component(bcast->src[0], 0);
               bcast->sources = 1;
               bcast->force_writemask_all = true;
            }
         }
         break;

      default:
         break;
      }
   }

out:
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Rounding modes are emitted for each conversion instruction, but the
 * hardware rounding mode is really a piece of state: once it has been set,
 * there is no need to set it again for subsequent conversions that use the
 * same mode.
 *
 * This is useful for vector/matrix conversions, as setting the mode once is
 * enough for the whole vector/matrix.
 */
bool
brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s)
{
   bool progress = false;
   unsigned execution_mode = s.nir->info.float_controls_execution_mode;

   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTNE;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTZ;

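   /* Within each block, assume the rounding mode starts out as base_mode and
    * drop any RND_MODE instruction that would set the mode we are already in.
    */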
   foreach_block (block, s.cfg) {
      brw_rnd_mode prev_mode = base_mode;

      foreach_inst_in_block_safe (fs_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == IMM);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}