1 /*
2 * Copyright © 2014 Rob Clark <[email protected]>
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Rob Clark <[email protected]>
7 */
8
9 #include "util/ralloc.h"
10 #include "util/u_math.h"
11
12 #include "ir3.h"
13 #include "ir3_shader.h"
14
15 /*
16 * Legalize:
17 *
18 * The legalize pass ensures there are sufficient nops and sync flags for
19 * correct execution.
20 *
21 * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed,
22 * based on state flowing out of predecessor blocks until there is
23 * no further change. In some cases this requires inserting nops.
24 * 2) Mark (ei) on last varying input
25 * 3) Final nop scheduling for instruction latency
26 * 4) Resolve jumps and schedule blocks, marking potential convergence
27 * points with (jp)
28 */
29
30 struct ir3_legalize_ctx {
31 struct ir3_compiler *compiler;
32 struct ir3_shader_variant *so;
33 gl_shader_stage type;
34 int max_bary;
35 bool early_input_release;
36 bool has_inputs;
37 bool has_tex_prefetch;
38 };
39
40 struct ir3_nop_state {
41 unsigned full_ready[GPR_REG_SIZE];
42 unsigned half_ready[GPR_REG_SIZE];
43 };
44
45 struct ir3_legalize_state {
46 regmask_t needs_ss;
47 regmask_t needs_ss_scalar_full; /* half scalar ALU producer -> full scalar ALU consumer */
48 regmask_t needs_ss_scalar_half; /* full scalar ALU producer -> half scalar ALU consumer */
49 regmask_t needs_ss_war; /* write after read */
50 regmask_t needs_ss_or_sy_war; /* WAR for sy-producer sources */
51 regmask_t needs_ss_scalar_war; /* WAR for scalar ALU sources */
52 regmask_t needs_ss_or_sy_scalar_war;
53 regmask_t needs_sy;
54 bool needs_ss_for_const;
55
56 /* Each of these arrays contains the cycle when the corresponding register
57 * becomes "ready" i.e. does not require any more nops. There is a special
58 * mechanism to let ALU instructions read compatible (i.e. same halfness)
59 * destinations of another ALU instruction with less delay, so this can
60 * depend on what type the consuming instruction is, which is why there are
61 * multiple arrays. The cycle is counted relative to the start of the block.
62 */
63
64 /* When ALU instructions reading the given full/half register will be ready.
65 */
66 struct ir3_nop_state alu_nop;
67
68 /* When non-ALU (e.g. cat5) instructions reading the given full/half register
69 * will be ready.
70 */
71 struct ir3_nop_state non_alu_nop;
72
73 /* When p0.x-w, a0.x, and a1.x are ready. */
74 unsigned pred_ready[4];
75 unsigned addr_ready[2];
76 };
77
78 struct ir3_legalize_block_data {
79 bool valid;
80 struct ir3_legalize_state begin_state;
81 struct ir3_legalize_state state;
82 };
83
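/* Check whether writing "dst" still has an outstanding WAR hazard that needs
 * a sync: either a plain (ss) hazard or one that can also be resolved by
 * (sy). The scalar-ALU WAR masks are only checked for non-scalar writers,
 * since scalar ALU instructions execute in-order with respect to each other.
 */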
84 static inline bool
85 needs_ss_war(struct ir3_legalize_state *state, struct ir3_register *dst,
86 bool is_scalar_alu)
87 {
88 if (regmask_get(&state->needs_ss_war, dst))
89 return true;
90 if (regmask_get(&state->needs_ss_or_sy_war, dst))
91 return true;
92
93 if (!is_scalar_alu) {
94 if (regmask_get(&state->needs_ss_scalar_war, dst))
95 return true;
96 if (regmask_get(&state->needs_ss_or_sy_scalar_war, dst))
97 return true;
98 }
99
100 return false;
101 }
102
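/* Set (ss) on "instr" and reset all of the tracking state that an (ss)
 * resolves: pending (ss)-producer results, WAR hazards, and outstanding
 * const-file writes.
 */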
103 static inline void
104 apply_ss(struct ir3_instruction *instr,
105 struct ir3_legalize_state *state,
106 bool mergedregs)
107 {
108 instr->flags |= IR3_INSTR_SS;
109 regmask_init(&state->needs_ss_war, mergedregs);
110 regmask_init(&state->needs_ss_or_sy_war, mergedregs);
111 regmask_init(&state->needs_ss, mergedregs);
112 regmask_init(&state->needs_ss_scalar_war, mergedregs);
113 regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs);
114 regmask_init(&state->needs_ss_scalar_full, mergedregs);
115 regmask_init(&state->needs_ss_scalar_half, mergedregs);
116 state->needs_ss_for_const = false;
117 }
118
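/* Set (sy) on "instr" and reset the tracking state that a (sy) resolves:
 * pending sy-producer results and the WAR hazards that either (ss) or (sy)
 * can clear.
 */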
119 static inline void
120 apply_sy(struct ir3_instruction *instr,
121 struct ir3_legalize_state *state,
122 bool mergedregs)
123 {
124 instr->flags |= IR3_INSTR_SY;
125 regmask_init(&state->needs_sy, mergedregs);
126 regmask_init(&state->needs_ss_or_sy_war, mergedregs);
127 regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs);
128 }
129
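/* Whether "n" occupies an execution cycle for the nop/delay accounting done
 * in legalize_block().
 */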
130 static bool
131 count_instruction(struct ir3_instruction *n, struct ir3_compiler *compiler)
132 {
133 /* NOTE: don't count branch/jump since we don't know yet if they will
134 * be eliminated later in resolve_jumps().. really should do that
135 * earlier so we don't have this constraint.
136 */
137 return (is_alu(n) && !is_scalar_alu(n, compiler)) ||
138 (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
139 (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
140 }
141
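/* Return a pointer to the ready-cycle slot tracking register "num" of "reg".
 * Predicate and address registers have dedicated slots; GPRs are tracked
 * separately per consumer type (ALU vs. non-ALU) and per matching vs.
 * mismatched register size. Returns NULL for a full register that is too high
 * to alias any half register.
 */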
142 static unsigned *
143 get_ready_slot(struct ir3_legalize_state *state,
144 struct ir3_register *reg, unsigned num,
145 bool consumer_alu, bool matching_size)
146 {
147 if (reg->flags & IR3_REG_PREDICATE) {
148 assert(num == reg->num);
149 assert(reg_num(reg) == REG_P0);
150 return &state->pred_ready[reg_comp(reg)];
151 }
152 if (reg->num == regid(REG_A0, 0))
153 return &state->addr_ready[0];
154 if (reg->num == regid(REG_A0, 1))
155 return &state->addr_ready[1];
156 struct ir3_nop_state *nop =
157 consumer_alu ? &state->alu_nop : &state->non_alu_nop;
158 assert(!(reg->flags & IR3_REG_SHARED));
159 if (reg->flags & IR3_REG_HALF) {
160 if (matching_size)
161 return &nop->half_ready[num];
162 else
163 return &nop->full_ready[num / 2];
164 } else {
165 if (matching_size)
166 return &nop->full_ready[num];
167 /* If "num" is large enough, then it can't alias a half-reg because only
168 * the first half of the full reg space aliases half regs. Return NULL in
169 * this case.
170 */
171 else if (num * 2 < ARRAY_SIZE(nop->half_ready))
172 return &nop->half_ready[num * 2];
173 else
174 return NULL;
175 }
176 }
177
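/* Calculate how many nops are needed before "instr", issued at "cycle", so
 * that all of its sources are ready when they are read.
 */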
178 static unsigned
179 delay_calc(struct ir3_legalize_state *state,
180 struct ir3_instruction *instr,
181 unsigned cycle)
182 {
183 /* As far as we know, shader outputs don't need any delay. */
184 if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
185 return 0;
186
187 unsigned delay = 0;
188 foreach_src_n (src, n, instr) {
189 if (src->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED))
190 continue;
191
192 unsigned elems = post_ra_reg_elems(src);
193 unsigned num = post_ra_reg_num(src);
194 unsigned src_cycle = cycle;
195
196 /* gat and swz have scalar sources and each source is read in a
197 * subsequent cycle.
198 */
199 if (instr->opc == OPC_GAT || instr->opc == OPC_SWZ)
200 src_cycle += n;
201
202 /* cat3 instructions consume their last source two cycles later, so they
203 * only need a delay of 1.
204 */
205 if ((is_mad(instr->opc) || is_madsh(instr->opc)) && n == 2)
206 src_cycle += 2;
207
208 for (unsigned elem = 0; elem < elems; elem++, num++) {
209 unsigned ready_cycle =
210 *get_ready_slot(state, src, num, is_alu(instr), true);
211 delay = MAX2(delay, MAX2(ready_cycle, src_cycle) - src_cycle);
212
213 /* Increment cycle for ALU instructions with (rptN) where sources are
214 * read each subsequent cycle.
215 */
216 if (instr->repeat && !(src->flags & IR3_REG_RELATIV))
217 src_cycle++;
218 }
219 }
220
221 return delay;
222 }
223
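/* Record when the destinations of "instr", issued at "cycle", become ready,
 * updating the ready-cycle slots for every combination of consumer type and
 * register size.
 */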
224 static void
225 delay_update(struct ir3_legalize_state *state,
226 struct ir3_instruction *instr,
227 unsigned cycle,
228 bool mergedregs)
229 {
230 if (writes_addr1(instr) && instr->block->in_early_preamble)
231 return;
232
233 foreach_dst_n (dst, n, instr) {
234 unsigned elems = post_ra_reg_elems(dst);
235 unsigned num = post_ra_reg_num(dst);
236 unsigned dst_cycle = cycle;
237
238 /* sct and swz have scalar destinations and each destination is written in
239 * a subsequent cycle.
240 */
241 if (instr->opc == OPC_SCT || instr->opc == OPC_SWZ)
242 dst_cycle += n;
243
244 /* For relative accesses with (rptN), we have no way of knowing which
245 * component is accessed when, so we have to assume the worst and mark
246 * every array member as being written at the end.
247 */
248 if (dst->flags & IR3_REG_RELATIV)
249 dst_cycle += instr->repeat;
250
251 if (dst->flags & IR3_REG_SHARED)
252 continue;
253
254 for (unsigned elem = 0; elem < elems; elem++, num++) {
255 for (unsigned consumer_alu = 0; consumer_alu < 2; consumer_alu++) {
256 for (unsigned matching_size = 0; matching_size < 2; matching_size++) {
257 unsigned *ready_slot =
258 get_ready_slot(state, dst, num, consumer_alu, matching_size);
259
260 if (!ready_slot)
261 continue;
262
263 bool reset_ready_slot = false;
264 unsigned delay = 0;
265 if (!is_alu(instr)) {
266 /* Apparently writes that require (ss) or (sy) are
267 * synchronized against previous writes, so consumers don't
268 * have to wait for any previous overlapping ALU instructions
269 * to complete.
270 */
271 reset_ready_slot = true;
272 } else if ((dst->flags & IR3_REG_PREDICATE) ||
273 reg_num(dst) == REG_A0) {
274 delay = 6;
275 if (!matching_size)
276 continue;
277 } else {
278 delay = (consumer_alu && matching_size) ? 3 : 6;
279 }
280
281 if (!matching_size) {
282 for (unsigned i = 0; i < reg_elem_size(dst); i++) {
283 ready_slot[i] =
284 reset_ready_slot ? 0 :
285 MAX2(ready_slot[i], dst_cycle + delay);
286 }
287 } else {
288 *ready_slot =
289 reset_ready_slot ? 0 :
290 MAX2(*ready_slot, dst_cycle + delay);
291 }
292 }
293 }
294
295 /* Increment cycle for ALU instructions with (rptN) where destinations
296 * are written each subsequent cycle.
297 */
298 if (instr->repeat && !(dst->flags & IR3_REG_RELATIV))
299 dst_cycle++;
300 }
301 }
302 }
303
304 /* We want to evaluate each block from the position of each of its
305 * predecessor blocks, so that the flags set are the union of
306 * all possible program paths.
307 *
308 * To do this, we need to know the output state (needs_ss/ss_war/sy)
309 * of all predecessor blocks. The tricky thing is loops, which mean
310 * that we can't simply recursively process each predecessor block
311 * before legalizing the current block.
312 *
313 * How we handle that is by looping over all the blocks until the
314 * results converge. If the output state of a given block changes
315 * in a given pass, this means that all successor blocks are not
316 * yet fully legalized.
317 */
318
319 static bool
320 legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
321 {
322 struct ir3_legalize_block_data *bd = block->data;
323
324 if (bd->valid)
325 return false;
326
327 struct ir3_instruction *last_n = NULL;
328 struct list_head instr_list;
329 struct ir3_legalize_state prev_state = bd->state;
330 struct ir3_legalize_state *state = &bd->begin_state;
331 bool last_input_needs_ss = false;
332 bool mergedregs = ctx->so->mergedregs;
333
334 /* Our input state is the OR of all predecessor blocks' state.
335 *
336 * Why don't we just zero the state at the beginning before merging in the
337 * predecessors? Because otherwise updates may not be a "lattice refinement",
338 * i.e. needs_ss may go from true to false for some register due to a (ss) we
339 * inserted the second time around (and the same for (sy)). This means that
340 * there's no solid guarantee the algorithm will converge, and in theory
341 * there may be infinite loops where we fight over the placement of an (ss).
342 */
343 for (unsigned i = 0; i < block->predecessors_count; i++) {
344 struct ir3_block *predecessor = block->predecessors[i];
345 struct ir3_legalize_block_data *pbd = predecessor->data;
346 struct ir3_legalize_state *pstate = &pbd->state;
347
348 /* Our input (ss)/(sy) state is based on OR'ing the output
349 * state of all our predecessor blocks
350 */
351 regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
352 regmask_or(&state->needs_ss_war, &state->needs_ss_war,
353 &pstate->needs_ss_war);
354 regmask_or(&state->needs_ss_or_sy_war, &state->needs_ss_or_sy_war,
355 &pstate->needs_ss_or_sy_war);
356 regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
357 state->needs_ss_for_const |= pstate->needs_ss_for_const;
358
359 /* Our nop state is the max of the predecessor blocks */
360 for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
361 state->pred_ready[i] = MAX2(state->pred_ready[i],
362 pstate->pred_ready[i]);
363 for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
364 state->alu_nop.full_ready[i] = MAX2(state->alu_nop.full_ready[i],
365 pstate->alu_nop.full_ready[i]);
366 state->alu_nop.half_ready[i] = MAX2(state->alu_nop.half_ready[i],
367 pstate->alu_nop.half_ready[i]);
368 state->non_alu_nop.full_ready[i] = MAX2(state->non_alu_nop.full_ready[i],
369 pstate->non_alu_nop.full_ready[i]);
370 state->non_alu_nop.half_ready[i] = MAX2(state->non_alu_nop.half_ready[i],
371 pstate->non_alu_nop.half_ready[i]);
372 }
373 }
374
375 /* We need to take physical-only edges into account when tracking shared
376 * registers.
377 */
378 for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
379 struct ir3_block *predecessor = block->physical_predecessors[i];
380 struct ir3_legalize_block_data *pbd = predecessor->data;
381 struct ir3_legalize_state *pstate = &pbd->state;
382
383 regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
384 regmask_or_shared(&state->needs_ss_scalar_full,
385 &state->needs_ss_scalar_full,
386 &pstate->needs_ss_scalar_full);
387 regmask_or_shared(&state->needs_ss_scalar_half,
388 &state->needs_ss_scalar_half,
389 &pstate->needs_ss_scalar_half);
390 regmask_or_shared(&state->needs_ss_scalar_war, &state->needs_ss_scalar_war,
391 &pstate->needs_ss_scalar_war);
392 regmask_or_shared(&state->needs_ss_or_sy_scalar_war,
393 &state->needs_ss_or_sy_scalar_war,
394 &pstate->needs_ss_or_sy_scalar_war);
395 }
396
397 memcpy(&bd->state, state, sizeof(*state));
398 state = &bd->state;
399
400 unsigned input_count = 0;
401
402 foreach_instr (n, &block->instr_list) {
403 if (is_input(n)) {
404 input_count++;
405 }
406 }
407
408 unsigned inputs_remaining = input_count;
409
410 /* Either inputs are in the first block or we expect inputs to be released
411 * with the end of the program.
412 */
413 assert(input_count == 0 || !ctx->early_input_release ||
414 block == ir3_after_preamble(block->shader));
415
416 /* remove all the instructions from the list, we'll be adding
417 * them back in as we go
418 */
419 list_replace(&block->instr_list, &instr_list);
420 list_inithead(&block->instr_list);
421
422 unsigned cycle = 0;
423
424 foreach_instr_safe (n, &instr_list) {
425 unsigned i;
426
427 n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
428
429 /* _meta::tex_prefetch instructions removed later in
430 * collect_tex_prefetches()
431 */
432 if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
433 continue;
434
435 if (is_input(n)) {
436 struct ir3_register *inloc = n->srcs[0];
437 assert(inloc->flags & IR3_REG_IMMED);
438
439 int last_inloc =
440 inloc->iim_val + ((inloc->flags & IR3_REG_R) ? n->repeat : 0);
441 ctx->max_bary = MAX2(ctx->max_bary, last_inloc);
442 }
443
444 if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
445 apply_ss(n, state, mergedregs);
446 apply_sy(n, state, mergedregs);
447 last_input_needs_ss = false;
448 }
449
450 if (last_n && (last_n->opc == OPC_PREDT)) {
451 apply_ss(n, state, mergedregs);
452 }
453
454 bool n_is_scalar_alu = is_scalar_alu(n, ctx->compiler);
455
456 /* NOTE: consider the dst register too.. it could happen that a
457 * texture sample instruction (for example) writes some
458 * components which are unused. A subsequent instruction
459 * that writes the same register can race w/ the sam instr
460 * resulting in undefined results:
461 */
462 for (i = 0; i < n->dsts_count + n->srcs_count; i++) {
463 struct ir3_register *reg;
464 if (i < n->dsts_count)
465 reg = n->dsts[i];
466 else
467 reg = n->srcs[i - n->dsts_count];
468
469 if (reg_gpr(reg)) {
470
471 /* TODO: we probably only need (ss) for alu
472 * instr consuming sfu result.. need to make
473 * some tests for both this and (sy)..
474 */
475 if (regmask_get(&state->needs_ss, reg)) {
476 apply_ss(n, state, mergedregs);
477 last_input_needs_ss = false;
478 }
479
480 /* There is a fast feedback path for scalar ALU instructions which
481 * only takes 1 cycle of latency, similar to the normal 3 cycle
482 * latency path for ALU instructions. For this fast path the
483 * producer and consumer must use the same register size (i.e. no
484 * writing a full register and then reading half of it or vice
485 * versa). If we don't hit this path, either because of a mismatched
486 * size or a read via the regular ALU, then the write latency is
487 * variable and we must use (ss) to wait for the scalar ALU. This is
488 * different from the fixed 6 cycle latency for mismatched vector
489 * ALU accesses.
490 */
491 if (n_is_scalar_alu) {
492 /* Check if we have a mismatched size RaW dependency */
493 if (regmask_get((reg->flags & IR3_REG_HALF) ?
494 &state->needs_ss_scalar_half :
495 &state->needs_ss_scalar_full, reg)) {
496 apply_ss(n, state, mergedregs);
497 last_input_needs_ss = false;
498 }
499 } else {
500 /* check if we have a scalar -> vector RaW dependency */
501 if (regmask_get(&state->needs_ss_scalar_half, reg) ||
502 regmask_get(&state->needs_ss_scalar_full, reg)) {
503 apply_ss(n, state, mergedregs);
504 last_input_needs_ss = false;
505 }
506 }
507
508 if (regmask_get(&state->needs_sy, reg)) {
509 apply_sy(n, state, mergedregs);
510 }
511 } else if ((reg->flags & IR3_REG_CONST)) {
512 if (state->needs_ss_for_const) {
513 apply_ss(n, state, mergedregs);
514 last_input_needs_ss = false;
515 }
516 } else if (reg_is_addr1(reg) && block->in_early_preamble) {
517 if (regmask_get(&state->needs_ss, reg)) {
518 apply_ss(n, state, mergedregs);
519 last_input_needs_ss = false;
520 }
521 }
522 }
523
524 foreach_dst (reg, n) {
525 if (needs_ss_war(state, reg, n_is_scalar_alu)) {
526 apply_ss(n, state, mergedregs);
527 last_input_needs_ss = false;
528 }
529 }
530
531 /* I'm not exactly sure what this is for, but it seems we need this on every
532 * mova1 in early preambles.
533 */
534 if (writes_addr1(n) && block->in_early_preamble)
535 n->srcs[0]->flags |= IR3_REG_R;
536
537 /* cat5+ does not have an (ss) bit, if needed we need to
538 * insert a nop to carry the sync flag. Would be kinda
539 * clever if we were aware of this during scheduling, but
540 * this should be a pretty rare case:
541 */
542 if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
543 struct ir3_instruction *nop;
544 nop = ir3_NOP(block);
545 nop->flags |= IR3_INSTR_SS;
546 n->flags &= ~IR3_INSTR_SS;
547 last_n = nop;
548 cycle++;
549 }
550
551 unsigned delay = delay_calc(state, n, cycle);
552
553 /* NOTE: I think the nopN encoding works for a5xx and
554 * probably a4xx, but not a3xx. So far only tested on
555 * a6xx.
556 */
557
558 if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n &&
559 !n_is_scalar_alu &&
560 ((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) &&
561 (last_n->repeat == 0)) {
562 /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
563 unsigned transfer = MIN2(delay, 3 - last_n->nop);
564 last_n->nop += transfer;
565 delay -= transfer;
566 cycle += transfer;
567 }
568
569 if ((delay > 0) && last_n && (last_n->opc == OPC_NOP)) {
570 /* the previous nop can encode at most 5 repeats: */
571 unsigned transfer = MIN2(delay, 5 - last_n->repeat);
572 last_n->repeat += transfer;
573 delay -= transfer;
574 cycle += transfer;
575 }
576
577 if (delay > 0) {
578 assert(delay <= 6);
579 ir3_NOP(block)->repeat = delay - 1;
580 cycle += delay;
581 }
582
583 if (ctx->compiler->samgq_workaround &&
584 ctx->type != MESA_SHADER_FRAGMENT &&
585 ctx->type != MESA_SHADER_COMPUTE && n->opc == OPC_SAMGQ) {
586 struct ir3_instruction *samgp;
587
588 list_delinit(&n->node);
589
590 for (i = 0; i < 4; i++) {
591 samgp = ir3_instr_clone(n);
592 samgp->opc = OPC_SAMGP0 + i;
593 if (i > 1)
594 samgp->flags |= IR3_INSTR_SY;
595 }
596 } else {
597 list_delinit(&n->node);
598 list_addtail(&n->node, &block->instr_list);
599 }
600
601 if (is_sfu(n))
602 regmask_set(&state->needs_ss, n->dsts[0]);
603
604 foreach_dst (dst, n) {
605 if (dst->flags & IR3_REG_SHARED) {
606 if (n_is_scalar_alu) {
607 if (dst->flags & IR3_REG_HALF)
608 regmask_set(&state->needs_ss_scalar_full, dst);
609 else
610 regmask_set(&state->needs_ss_scalar_half, dst);
611 } else {
612 regmask_set(&state->needs_ss, dst);
613 }
614 } else if (reg_is_addr1(dst) && block->in_early_preamble) {
615 regmask_set(&state->needs_ss, dst);
616 }
617 }
618
619 if (is_tex_or_prefetch(n) && n->dsts_count > 0) {
620 regmask_set(&state->needs_sy, n->dsts[0]);
621 if (n->opc == OPC_META_TEX_PREFETCH)
622 ctx->has_tex_prefetch = true;
623 } else if (n->opc == OPC_RESINFO && n->dsts_count > 0) {
624 regmask_set(&state->needs_ss, n->dsts[0]);
625 ir3_NOP(block)->flags |= IR3_INSTR_SS;
626 last_input_needs_ss = false;
627 } else if (is_load(n)) {
628 if (is_local_mem_load(n))
629 regmask_set(&state->needs_ss, n->dsts[0]);
630 else
631 regmask_set(&state->needs_sy, n->dsts[0]);
632 } else if (is_atomic(n->opc)) {
633 if (is_bindless_atomic(n->opc)) {
634 regmask_set(&state->needs_sy, n->srcs[2]);
635 } else if (is_global_a3xx_atomic(n->opc) ||
636 is_global_a6xx_atomic(n->opc)) {
637 regmask_set(&state->needs_sy, n->dsts[0]);
638 } else {
639 regmask_set(&state->needs_ss, n->dsts[0]);
640 }
641 } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
642 state->needs_ss_for_const = true;
643 }
644
645 if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) ||
646 is_bindless_atomic(n->opc))
647 ctx->so->has_ssbo = true;
648
649 /* both tex/sfu appear to not always immediately consume
650 * their src register(s):
651 */
652 if (is_war_hazard_producer(n)) {
653 /* These WAR hazards can always be resolved with (ss). However, when
654 * the reader is a sy-producer, they can also be resolved using (sy)
655 * because once we have synced the reader's results using (sy), its
656 * sources have definitely been consumed. We track the two cases
657 * separately so that we don't add an unnecessary (ss) if a (sy) sync
658 * already happened.
659 * For example, this prevents adding the unnecessary (ss) in the
660 * following sequence:
661 * sam rd, rs, ...
662 * (sy)... ; sam synced so consumed its sources
663 * (ss)write rs ; (ss) unnecessary since rs has been consumed already
664 */
665 bool needs_ss = is_ss_producer(n) || is_store(n) || n->opc == OPC_STC;
666
667 if (n_is_scalar_alu) {
668 /* Scalar ALU also does not immediately read its source because it
669 * is not executed right away, but scalar ALU instructions are
670 * executed in-order so subsequent scalar ALU instructions don't
671 * need to wait for previous ones.
672 */
673 regmask_t *mask = needs_ss ? &state->needs_ss_scalar_war
674 : &state->needs_ss_or_sy_scalar_war;
675
676 foreach_src (reg, n) {
677 if ((reg->flags & IR3_REG_SHARED) || is_reg_a0(reg)) {
678 regmask_set(mask, reg);
679 }
680 }
681 } else {
682 regmask_t *mask =
683 needs_ss ? &state->needs_ss_war : &state->needs_ss_or_sy_war;
684
685 foreach_src (reg, n) {
686 if (!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
687 regmask_set(mask, reg);
688 }
689 }
690 }
691 }
692
693 bool count = count_instruction(n, ctx->compiler);
694 if (count)
695 cycle += 1;
696
697 delay_update(state, n, cycle, mergedregs);
698
699 if (count)
700 cycle += n->repeat;
701
702 if (ctx->early_input_release && is_input(n)) {
703 last_input_needs_ss |= (n->opc == OPC_LDLV);
704
705 assert(inputs_remaining > 0);
706 inputs_remaining--;
707 if (inputs_remaining == 0) {
708 /* This is the last input. We add the (ei) flag to release
709 * varying memory after this executes. If it's an ldlv,
710 * however, we need to insert a dummy bary.f on which we can
711 * set the (ei) flag. We may also need to insert an (ss) to
712 * guarantee that all ldlv's have finished fetching their
713 * results before releasing the varying memory.
714 */
715 struct ir3_instruction *last_input = n;
716 if (n->opc == OPC_LDLV) {
717 struct ir3_instruction *baryf;
718
719 /* (ss)bary.f (ei)r63.x, 0, r0.x */
720 baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
721 ir3_dst_create(baryf, regid(63, 0), 0);
722 ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
723 ir3_src_create(baryf, regid(0, 0), 0);
724
725 last_input = baryf;
726 }
727
728 last_input->dsts[0]->flags |= IR3_REG_EI;
729 if (last_input_needs_ss) {
730 apply_ss(last_input, state, mergedregs);
731 }
732 }
733 }
734
735 last_n = n;
736 }
737
738 assert(inputs_remaining == 0 || !ctx->early_input_release);
739
740 if (block == ir3_after_preamble(ctx->so->ir) &&
741 ctx->has_tex_prefetch && !ctx->has_inputs) {
742 /* texture prefetch, but *no* inputs.. we need to insert a
743 * dummy bary.f at the top of the shader to unblock varying
744 * storage:
745 */
746 struct ir3_instruction *baryf;
747
748 /* (ss)bary.f (ei)r63.x, 0, r0.x */
749 baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
750 ir3_dst_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
751 ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
752 ir3_src_create(baryf, regid(0, 0), 0);
753
754 /* insert the dummy bary.f at head: */
755 list_delinit(&baryf->node);
756 list_add(&baryf->node, &block->instr_list);
757 }
758
759 /* Currently our nop state contains the cycle offset from the start of this
760 * block when each register becomes ready. But successor blocks need the
761 * cycle offset from their start, which is this block's end. Translate the
762 * cycle offset.
763 */
764 for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
765 state->pred_ready[i] = MAX2(state->pred_ready[i], cycle) - cycle;
766 for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
767 state->alu_nop.full_ready[i] =
768 MAX2(state->alu_nop.full_ready[i], cycle) - cycle;
769 state->alu_nop.half_ready[i] =
770 MAX2(state->alu_nop.half_ready[i], cycle) - cycle;
771 state->non_alu_nop.full_ready[i] =
772 MAX2(state->non_alu_nop.full_ready[i], cycle) - cycle;
773 state->non_alu_nop.half_ready[i] =
774 MAX2(state->non_alu_nop.half_ready[i], cycle) - cycle;
775 }
776
777 bd->valid = true;
778
779 if (memcmp(&prev_state, state, sizeof(*state))) {
780 /* our output state changed, this invalidates all of our
781 * successors:
782 */
783 for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
784 if (!block->successors[i])
785 break;
786 struct ir3_legalize_block_data *pbd = block->successors[i]->data;
787 pbd->valid = false;
788 }
789 }
790
791 return true;
792 }
793
794 /* Expands dsxpp and dsypp macros to:
795 *
796 * dsxpp.1 dst, src
797 * dsxpp.1.p dst, src
798 *
799 * We apply this after flags syncing, as we don't want to sync in between the
800 * two (which might happen if dst == src).
801 */
802 static bool
803 apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
804 {
805 struct list_head instr_list;
806
807 /* remove all the instructions from the list, we'll be adding
808 * them back in as we go
809 */
810 list_replace(&block->instr_list, &instr_list);
811 list_inithead(&block->instr_list);
812
813 foreach_instr_safe (n, &instr_list) {
814 list_addtail(&n->node, &block->instr_list);
815
816 if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
817 n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;
818
819 struct ir3_instruction *op_p = ir3_instr_clone(n);
820 op_p->flags = IR3_INSTR_P;
821
822 ctx->so->need_full_quad = true;
823 }
824 }
825
826 return true;
827 }
828
829 /* Some instructions can take a dummy destination of r63.x, which we model by
830 * not having a destination in the IR to avoid having special code to handle
831 * this. Insert the dummy destination after everything else is done.
832 */
833 static bool
834 expand_dummy_dests(struct ir3_block *block)
835 {
836 foreach_instr (n, &block->instr_list) {
837 if ((n->opc == OPC_SAM || n->opc == OPC_LDC || n->opc == OPC_RESINFO) &&
838 n->dsts_count == 0) {
839 struct ir3_register *dst = ir3_dst_create(n, INVALID_REG, 0);
840 /* Copy the blob's writemask */
841 if (n->opc == OPC_SAM)
842 dst->wrmask = 0b1111;
843 }
844 }
845 return true;
846 }
847
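/* Expand the push-consts load macro at the start of "block" into an stsc
 * instruction, applying the duplication workaround (an (ss) nop plus a second
 * stsc) on GPUs with the stsc duplication quirk.
 */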
848 static void
849 apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx,
850 struct ir3_block *block)
851 {
852 foreach_instr (n, &block->instr_list) {
853 if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
854 struct ir3_instruction *stsc = ir3_instr_create(block, OPC_STSC, 0, 2);
855 ir3_instr_move_after(stsc, n);
856 ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
857 n->push_consts.dst_base;
858 ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
859 n->push_consts.src_base;
860 stsc->cat6.iim_val = n->push_consts.src_size;
861 stsc->cat6.type = TYPE_U32;
862
863 if (ctx->compiler->stsc_duplication_quirk) {
864 struct ir3_instruction *nop = ir3_NOP(block);
865 ir3_instr_move_after(nop, stsc);
866 nop->flags |= IR3_INSTR_SS;
867 ir3_instr_move_after(ir3_instr_clone(stsc), nop);
868 }
869
870 list_delinit(&n->node);
871 break;
872 } else if (!is_meta(n)) {
873 break;
874 }
875 }
876 }
877
878 /* NOTE: branch instructions are always the last instruction(s)
879 * in the block. We take advantage of this as we resolve the
880 * branches, since "if (foo) break;" constructs turn into
881 * something like:
882 *
883 * block3 {
884 * ...
885 * 0029:021: mov.s32s32 r62.x, r1.y
886 * 0082:022: br !p0.x, target=block5
887 * 0083:023: br p0.x, target=block4
888 * // succs: if _[0029:021: mov.s32s32] block4; else block5;
889 * }
890 * block4 {
891 * 0084:024: jump, target=block6
892 * // succs: block6;
893 * }
894 * block5 {
895 * 0085:025: jump, target=block7
896 * // succs: block7;
897 * }
898 *
899 * i.e. the only instruction in block4/block5 is a jump, so when
900 * resolving branches we can easily detect this by checking
901 * that the first instruction in the target block is itself
902 * a jump, and setup the br directly to the jump's target
903 * (and strip back out the now unreached jump)
904 *
905 * TODO sometimes we end up with things like:
906 *
907 * br !p0.x, #2
908 * br p0.x, #12
909 * add.u r0.y, r0.y, 1
910 *
911 * If we swapped the order of the branches, we could drop one.
912 */
913 static struct ir3_block *
914 resolve_dest_block(struct ir3_block *block)
915 {
916 /* special case for last block: */
917 if (!block->successors[0])
918 return block;
919
920 /* NOTE that we may or may not have inserted the jump
921 * in the target block yet, so conditions to resolve
922 * the dest to the dest block's successor are:
923 *
924 * (1) successor[1] == NULL &&
925 * (2) (block-is-empty || only-instr-is-jump)
926 */
927 if (block->successors[1] == NULL) {
928 if (list_is_empty(&block->instr_list)) {
929 return block->successors[0];
930 } else if (list_length(&block->instr_list) == 1) {
931 struct ir3_instruction *instr =
932 list_first_entry(&block->instr_list, struct ir3_instruction, node);
933 if (instr->opc == OPC_JUMP) {
934 /* If this jump is backwards, then we will probably convert
935 * the jump being resolved to a backwards jump, which will
936 * change a loop-with-continue or loop-with-if into a
937 * doubly-nested loop and change the convergence behavior.
938 * Disallow this here.
939 */
940 if (block->successors[0]->index <= block->index)
941 return block;
942 return block->successors[0];
943 }
944 }
945 }
946 return block;
947 }
948
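/* Unlink "old_target" from the block list and remove it from the predecessor
 * lists of its successors.
 */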
949 static void
950 remove_unused_block(struct ir3_block *old_target)
951 {
952 list_delinit(&old_target->node);
953
954 /* cleanup dangling predecessors: */
955 for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
956 if (old_target->successors[i]) {
957 struct ir3_block *succ = old_target->successors[i];
958 ir3_block_remove_predecessor(succ, old_target);
959 }
960 }
961 }
962
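/* Point "instr" at "new_target", fixing up the predecessor/successor lists.
 * Returns true if the old target became unreachable and was removed, so the
 * caller can bail out before its block iterator is invalidated.
 */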
963 static bool
964 retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
965 {
966 struct ir3_block *old_target = instr->cat0.target;
967 struct ir3_block *cur_block = instr->block;
968
969 /* update the current block's successors to reflect the retargeting: */
970 if (cur_block->successors[0] == old_target) {
971 cur_block->successors[0] = new_target;
972 } else {
973 assert(cur_block->successors[1] == old_target);
974 cur_block->successors[1] = new_target;
975 }
976
977 /* update new target's predecessors: */
978 ir3_block_add_predecessor(new_target, cur_block);
979
980 /* and remove old_target's predecessor: */
981 ir3_block_remove_predecessor(old_target, cur_block);
982
983 instr->cat0.target = new_target;
984
985 if (old_target->predecessors_count == 0) {
986 remove_unused_block(old_target);
987 return true;
988 }
989
990 return false;
991 }
992
993 static bool
994 is_invertible_branch(struct ir3_instruction *instr)
995 {
996 switch (instr->opc) {
997 case OPC_BR:
998 case OPC_BRAA:
999 case OPC_BRAO:
1000 case OPC_BANY:
1001 case OPC_BALL:
1002 return true;
1003 default:
1004 return false;
1005 }
1006 }
1007
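/* Retarget branches whose destination block contains only a jump, and remove
 * branches/jumps that target the immediately following block. Returns true if
 * any progress was made.
 */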
1008 static bool
1009 opt_jump(struct ir3 *ir)
1010 {
1011 bool progress = false;
1012
1013 unsigned index = 0;
1014 foreach_block (block, &ir->block_list)
1015 block->index = index++;
1016
1017 foreach_block (block, &ir->block_list) {
1018 /* This pass destroys the physical CFG so don't keep it around to avoid
1019 * validation errors.
1020 */
1021 block->physical_successors_count = 0;
1022 block->physical_predecessors_count = 0;
1023
1024 foreach_instr (instr, &block->instr_list) {
1025 if (!is_flow(instr) || !instr->cat0.target)
1026 continue;
1027
1028 struct ir3_block *tblock = resolve_dest_block(instr->cat0.target);
1029 if (tblock != instr->cat0.target) {
1030 progress = true;
1031
1032 /* Exit early if we deleted a block to avoid iterator
1033 * weirdness/assert fails
1034 */
1035 if (retarget_jump(instr, tblock))
1036 return true;
1037 }
1038 }
1039
1040 /* Detect the case where the block ends either with:
1041 * - A single unconditional jump to the next block.
1042 * - Two jump instructions with opposite conditions, and one of the
1043 * them jumps to the next block.
1044 * We can remove the one that jumps to the next block in either case.
1045 */
1046 if (list_is_empty(&block->instr_list))
1047 continue;
1048
1049 struct ir3_instruction *jumps[2] = {NULL, NULL};
1050 jumps[0] =
1051 list_last_entry(&block->instr_list, struct ir3_instruction, node);
1052 if (!list_is_singular(&block->instr_list))
1053 jumps[1] =
1054 list_last_entry(&jumps[0]->node, struct ir3_instruction, node);
1055
1056 if (jumps[0]->opc == OPC_JUMP)
1057 jumps[1] = NULL;
1058 else if (!is_invertible_branch(jumps[0]) || !jumps[1] ||
1059 !is_invertible_branch(jumps[1])) {
1060 continue;
1061 }
1062
1063 for (unsigned i = 0; i < 2; i++) {
1064 if (!jumps[i])
1065 continue;
1066 struct ir3_block *tblock = jumps[i]->cat0.target;
1067 if (&tblock->node == block->node.next) {
1068 list_delinit(&jumps[i]->node);
1069 progress = true;
1070 break;
1071 }
1072 }
1073 }
1074
1075 return progress;
1076 }
1077
1078 static void
1079 resolve_jumps(struct ir3 *ir)
1080 {
1081 foreach_block (block, &ir->block_list)
1082 foreach_instr (instr, &block->instr_list)
1083 if (is_flow(instr) && instr->cat0.target) {
1084 struct ir3_instruction *target = list_first_entry(
1085 &instr->cat0.target->instr_list, struct ir3_instruction, node);
1086
1087 instr->cat0.immed = (int)target->ip - (int)instr->ip;
1088 }
1089 }
1090
1091 static void
1092 mark_jp(struct ir3_block *block)
1093 {
1094 /* We only call this on the end block (in kill_sched) or after retargeting
1095 * all jumps to empty blocks (in mark_xvergence_points) so there's no need to
1096 * worry about empty blocks.
1097 */
1098 assert(!list_is_empty(&block->instr_list));
1099
1100 struct ir3_instruction *target =
1101 list_first_entry(&block->instr_list, struct ir3_instruction, node);
1102 target->flags |= IR3_INSTR_JP;
1103 }
1104
1105 /* Mark points where control flow reconverges.
1106 *
1107 * Re-convergence points are where "parked" threads are reconverged with threads
1108 * that took the opposite path last time around. We already calculated them, we
1109 * just need to mark them with (jp).
1110 */
1111 static void
1112 mark_xvergence_points(struct ir3 *ir)
1113 {
1114 foreach_block (block, &ir->block_list) {
1115 if (block->reconvergence_point)
1116 mark_jp(block);
1117 }
1118 }
1119
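/* Invert the condition of "branch" (including the any/all and braa/brao
 * variants) and retarget it at the block's second successor.
 */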
1120 static void
1121 invert_branch(struct ir3_instruction *branch)
1122 {
1123 switch (branch->opc) {
1124 case OPC_BR:
1125 break;
1126 case OPC_BALL:
1127 branch->opc = OPC_BANY;
1128 break;
1129 case OPC_BANY:
1130 branch->opc = OPC_BALL;
1131 break;
1132 case OPC_BRAA:
1133 branch->opc = OPC_BRAO;
1134 break;
1135 case OPC_BRAO:
1136 branch->opc = OPC_BRAA;
1137 break;
1138 default:
1139 unreachable("can't get here");
1140 }
1141
1142 branch->cat0.inv1 = !branch->cat0.inv1;
1143 branch->cat0.inv2 = !branch->cat0.inv2;
1144 branch->cat0.target = branch->block->successors[1];
1145 }
1146
1147 /* Insert the branch/jump instructions for flow control between blocks.
1148 * Initially this is done naively, without considering if the successor
1149 * block immediately follows the current block (ie. so no jump required),
1150 * but that is cleaned up in opt_jump().
1151 */
1152 static void
1153 block_sched(struct ir3 *ir)
1154 {
1155 foreach_block (block, &ir->block_list) {
1156 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1157
1158 if (block->successors[1]) {
1159 /* if/else, conditional branches to "then" or "else": */
1160 struct ir3_instruction *br1, *br2;
1161
1162 assert(terminator);
1163 unsigned opc = terminator->opc;
1164
1165 if (opc == OPC_GETONE || opc == OPC_SHPS || opc == OPC_GETLAST) {
1166 /* getone/shps can't be inverted, and it wouldn't even make sense
1167 * to follow it with an inverted branch, so follow it by an
1168 * unconditional branch.
1169 */
1170 assert(terminator->srcs_count == 0);
1171 br1 = terminator;
1172 br1->cat0.target = block->successors[1];
1173
1174 br2 = ir3_JUMP(block);
1175 br2->cat0.target = block->successors[0];
1176 } else if (opc == OPC_BR || opc == OPC_BRAA || opc == OPC_BRAO ||
1177 opc == OPC_BALL || opc == OPC_BANY) {
1178 /* create "else" branch first (since "then" block should
1179 * frequently/always end up being a fall-thru):
1180 */
1181 br1 = terminator;
1182 br2 = ir3_instr_clone(br1);
1183 invert_branch(br1);
1184 br2->cat0.target = block->successors[0];
1185 } else {
1186 assert(opc == OPC_PREDT || opc == OPC_PREDF);
1187
1188 /* Handled by prede_sched. */
1189 terminator->cat0.target = block->successors[0];
1190 continue;
1191 }
1192
1193 /* Creating br2 caused it to be moved before the terminator br1, move it
1194 * back.
1195 */
1196 ir3_instr_move_after(br2, br1);
1197 } else if (block->successors[0]) {
1198 /* otherwise unconditional jump or predt/predf to next block which
1199 * should already have been inserted.
1200 */
1201 assert(terminator);
1202 assert(terminator->opc == OPC_JUMP || terminator->opc == OPC_PREDT ||
1203 terminator->opc == OPC_PREDF);
1204 terminator->cat0.target = block->successors[0];
1205 }
1206 }
1207 }
1208
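/* Insert prede instructions to close the predicated regions opened by
 * predt/predf, removing the second successor block when it is empty. The
 * expected CFG pattern is described below.
 */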
1209 static void
1210 prede_sched(struct ir3 *ir)
1211 {
1212 unsigned index = 0;
1213 foreach_block (block, &ir->block_list)
1214 block->index = index++;
1215
1216 foreach_block (block, &ir->block_list) {
1217 /* Look for the following pattern generated by NIR lowering. The numbers
1218 * at the top of blocks are their index.
1219 * |--- i ----|
1220 * | ... |
1221 * | pred[tf] |
1222 * |----------|
1223 * succ0 / \ succ1
1224 * |-- i+1 ---| |-- i+2 ---|
1225 * | ... | | ... |
1226 * | pred[ft] | | ... |
1227 * |----------| |----------|
1228 * succ0 \ / succ0
1229 * |--- j ----|
1230 * | ... |
1231 * |----------|
1232 */
1233 struct ir3_block *succ0 = block->successors[0];
1234 struct ir3_block *succ1 = block->successors[1];
1235
1236 if (!succ1)
1237 continue;
1238
1239 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1240 if (!terminator)
1241 continue;
1242 if (terminator->opc != OPC_PREDT && terminator->opc != OPC_PREDF)
1243 continue;
1244
1245 assert(!succ0->successors[1] && !succ1->successors[1]);
1246 assert(succ0->successors[0] == succ1->successors[0]);
1247 assert(succ0->predecessors_count == 1 && succ1->predecessors_count == 1);
1248 assert(succ0->index == (block->index + 1));
1249 assert(succ1->index == (block->index + 2));
1250
1251 struct ir3_instruction *succ0_terminator =
1252 ir3_block_get_terminator(succ0);
1253 assert(succ0_terminator);
1254 assert(succ0_terminator->opc ==
1255 (terminator->opc == OPC_PREDT ? OPC_PREDF : OPC_PREDT));
1256
1257 ASSERTED struct ir3_instruction *succ1_terminator =
1258 ir3_block_get_terminator(succ1);
1259 assert(!succ1_terminator || (succ1_terminator->opc == OPC_JUMP));
1260
1261 /* Simple case: both successors contain instructions. Keep both blocks and
1262 * insert prede before the second successor's terminator:
1263 * |--- i ----|
1264 * | ... |
1265 * | pred[tf] |
1266 * |----------|
1267 * succ0 / \ succ1
1268 * |-- i+1 ---| |-- i+2 ---|
1269 * | ... | | ... |
1270 * | pred[ft] | | prede |
1271 * |----------| |----------|
1272 * succ0 \ / succ0
1273 * |--- j ----|
1274 * | ... |
1275 * |----------|
1276 */
1277 if (!list_is_empty(&succ1->instr_list)) {
1278 ir3_PREDE(succ1);
1279 continue;
1280 }
1281
1282 /* Second successor is empty so we can remove it:
1283 * |--- i ----|
1284 * | ... |
1285 * | pred[tf] |
1286 * |----------|
1287 * succ0 / \ succ1
1288 * |-- i+1 ---| |
1289 * | ... | |
1290 * | prede | |
1291 * |----------| |
1292 * succ0 \ /
1293 * |--- j ----|
1294 * | ... |
1295 * |----------|
1296 */
1297 list_delinit(&succ0_terminator->node);
1298 ir3_PREDE(succ0);
1299 remove_unused_block(succ1);
1300 block->successors[1] = succ0->successors[0];
1301 ir3_block_add_predecessor(succ0->successors[0], block);
1302 }
1303 }
1304
1305 /* Here we workaround the fact that kill doesn't actually kill the thread as
1306 * GL expects. The last instruction always needs to be an end instruction,
1307 * which means that if we're stuck in a loop where kill is the only way out,
1308 * then we may have to jump out to the end. kill may also have the d3d
1309 * semantics of converting the thread to a helper thread, rather than setting
1310 * the exec mask to 0, in which case the helper thread could get stuck in an
1311 * infinite loop.
1312 *
1313 * We do this late, both to give the scheduler the opportunity to reschedule
1314 * kill instructions earlier and to avoid having to create a separate basic
1315 * block.
1316 *
1317 * TODO: Assuming that the wavefront doesn't stop as soon as all threads are
1318 * killed, we might benefit by doing this more aggressively when the remaining
1319 * part of the program after the kill is large, since that would let us
1320 * skip over the instructions when there are no non-killed threads left.
1321 */
1322 static void
1323 kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1324 {
1325 ir3_count_instructions(ir);
1326
1327 /* True if we know that this block will always eventually lead to the end
1328 * block:
1329 */
1330 bool always_ends = true;
1331 bool added = false;
1332 struct ir3_block *last_block =
1333 list_last_entry(&ir->block_list, struct ir3_block, node);
1334
1335 foreach_block_rev (block, &ir->block_list) {
1336 for (unsigned i = 0; i < 2 && block->successors[i]; i++) {
1337 if (block->successors[i]->start_ip <= block->end_ip)
1338 always_ends = false;
1339 }
1340
1341 if (always_ends)
1342 continue;
1343
1344 foreach_instr_safe (instr, &block->instr_list) {
1345 if (instr->opc != OPC_KILL)
1346 continue;
1347
1348 struct ir3_instruction *br = ir3_instr_create(block, OPC_BR, 0, 1);
1349 ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask =
1350 1;
1351 br->cat0.target =
1352 list_last_entry(&ir->block_list, struct ir3_block, node);
1353
1354 list_del(&br->node);
1355 list_add(&br->node, &instr->node);
1356
1357 added = true;
1358 }
1359 }
1360
1361 if (added) {
1362 /* I'm not entirely sure how the branchstack works, but we probably
1363 * need to add at least one entry for the divergence which is resolved
1364 * at the end:
1365 */
1366 so->branchstack++;
1367
1368 /* We don't update predecessors/successors, so we have to do this
1369 * manually:
1370 */
1371 mark_jp(last_block);
1372 }
1373 }
1374
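/* Debug aid (IR3_DBG_FULLSYNC): insert an (ss)(sy) nop after every
 * sync-producing instruction.
 */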
1375 static void
1376 dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1377 {
1378 foreach_block (block, &ir->block_list) {
1379 foreach_instr_safe (instr, &block->instr_list) {
1380 if (is_ss_producer(instr) || is_sy_producer(instr)) {
1381 struct ir3_instruction *nop = ir3_NOP(block);
1382 nop->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
1383 ir3_instr_move_after(nop, instr);
1384 }
1385 }
1386 }
1387 }
1388
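/* Debug aid (IR3_DBG_FULLNOP): insert a (rpt5) nop before every instruction. */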
1389 static void
1390 dbg_nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1391 {
1392 foreach_block (block, &ir->block_list) {
1393 foreach_instr_safe (instr, &block->instr_list) {
1394 struct ir3_instruction *nop = ir3_NOP(block);
1395 nop->repeat = 5;
1396 ir3_instr_move_before(nop, instr);
1397 }
1398 }
1399 }
1400
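/* Debug aid (IR3_DBG_EXPANDRPT): expand each (rptN) instruction into N+1
 * non-repeated copies with incremented register numbers (nop/swz/gat/sct are
 * left alone).
 */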
1401 static void
1402 dbg_expand_rpt(struct ir3 *ir)
1403 {
1404 foreach_block (block, &ir->block_list) {
1405 foreach_instr_safe (instr, &block->instr_list) {
1406 if (instr->repeat == 0 || instr->opc == OPC_NOP ||
1407 instr->opc == OPC_SWZ || instr->opc == OPC_GAT ||
1408 instr->opc == OPC_SCT) {
1409 continue;
1410 }
1411
1412 for (unsigned i = 0; i <= instr->repeat; ++i) {
1413 struct ir3_instruction *rpt = ir3_instr_clone(instr);
1414 ir3_instr_move_before(rpt, instr);
1415 rpt->repeat = 0;
1416
1417 foreach_dst (dst, rpt) {
1418 dst->num += i;
1419 dst->wrmask = 1;
1420 }
1421
1422 foreach_src (src, rpt) {
1423 if (!(src->flags & IR3_REG_R))
1424 continue;
1425
1426 src->num += i;
1427 src->uim_val += i;
1428 src->wrmask = 1;
1429 src->flags &= ~IR3_REG_R;
1430 }
1431 }
1432
1433 list_delinit(&instr->node);
1434 }
1435 }
1436 }
1437
1438 struct ir3_helper_block_data {
1439 /* Whether helper invocations may be used on any path starting at the
1440 * beginning of the block.
1441 */
1442 bool uses_helpers_beginning;
1443
1444 /* Whether helper invocations may be used by the end of the block. Branch
1445 * instructions are considered to be "between" blocks, because (eq) has to be
1446 * inserted after them in the successor blocks, so branch instructions using
1447 * helpers will result in uses_helpers_end = true for their block.
1448 */
1449 bool uses_helpers_end;
1450 };
1451
1452 /* Insert (eq) after the last instruction using the results of helper
1453 * invocations. Use a backwards dataflow analysis to determine at which points
1454 * in the program helper invocations are definitely never used, and then insert
1455 * (eq) at the point where we cross from a point where they may be used to a
1456 * point where they are never used.
1457 */
1458 static void
1459 helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
1460 struct ir3_shader_variant *so)
1461 {
1462 bool non_prefetch_helpers = false;
1463
1464 foreach_block (block, &ir->block_list) {
1465 struct ir3_helper_block_data *bd =
1466 rzalloc(ctx, struct ir3_helper_block_data);
1467 foreach_instr (instr, &block->instr_list) {
1468 if (uses_helpers(instr)) {
1469 bd->uses_helpers_beginning = true;
1470 if (instr->opc != OPC_META_TEX_PREFETCH) {
1471 non_prefetch_helpers = true;
1472 }
1473 }
1474
1475 if (instr->opc == OPC_SHPE) {
1476 /* (eq) is not allowed in preambles, mark the whole preamble as
1477 * requiring helpers to avoid putting it there.
1478 */
1479 bd->uses_helpers_beginning = true;
1480 bd->uses_helpers_end = true;
1481 }
1482 }
1483
1484 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1485 if (terminator) {
1486 if (terminator->opc == OPC_BALL || terminator->opc == OPC_BANY ||
1487 (terminator->opc == OPC_GETONE &&
1488 (terminator->flags & IR3_INSTR_NEEDS_HELPERS))) {
1489 bd->uses_helpers_beginning = true;
1490 bd->uses_helpers_end = true;
1491 non_prefetch_helpers = true;
1492 }
1493 }
1494
1495 block->data = bd;
1496 }
1497
1498 /* If only prefetches use helpers then we can disable them in the shader via
1499 * a register setting.
1500 */
1501 if (!non_prefetch_helpers) {
1502 so->prefetch_end_of_quad = true;
1503 return;
1504 }
1505
1506 bool progress;
1507 do {
1508 progress = false;
1509 foreach_block_rev (block, &ir->block_list) {
1510 struct ir3_helper_block_data *bd = block->data;
1511
1512 if (!bd->uses_helpers_beginning)
1513 continue;
1514
1515 for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
1516 struct ir3_block *pred = block->physical_predecessors[i];
1517 struct ir3_helper_block_data *pred_bd = pred->data;
1518 if (!pred_bd->uses_helpers_end) {
1519 pred_bd->uses_helpers_end = true;
1520 }
1521 if (!pred_bd->uses_helpers_beginning) {
1522 pred_bd->uses_helpers_beginning = true;
1523 progress = true;
1524 }
1525 }
1526 }
1527 } while (progress);
1528
1529 /* Now, we need to determine the points where helper invocations become
1530 * unused.
1531 */
1532 foreach_block (block, &ir->block_list) {
1533 struct ir3_helper_block_data *bd = block->data;
1534 if (bd->uses_helpers_end)
1535 continue;
1536
1537 /* We need to check the predecessors because of situations with critical
1538 * edges like this that can occur after optimizing jumps:
1539 *
1540 * br p0.x, #endif
1541 * ...
1542 * sam ...
1543 * ...
1544 * endif:
1545 * ...
1546 * end
1547 *
1548 * The endif block will have uses_helpers_beginning = false and
1549 * uses_helpers_end = false, but because we jump to there from the
1550 * beginning of the if where uses_helpers_end = true, we still want to
1551 * add an (eq) at the beginning of the block:
1552 *
1553 * br p0.x, #endif
1554 * ...
1555 * sam ...
1556 * (eq)nop
1557 * ...
1558 * endif:
1559 * (eq)nop
1560 * ...
1561 * end
1562 *
1563 * This adds an extra nop in the case where the branch isn't taken, but that's
1564 * probably preferable to adding an extra jump instruction which is what
1565 * would happen if we ran this pass before optimizing jumps:
1566 *
1567 * br p0.x, #else
1568 * ...
1569 * sam ...
1570 * (eq)nop
1571 * ...
1572 * jump #endif
1573 * else:
1574 * (eq)nop
1575 * endif:
1576 * ...
1577 * end
1578 *
1579 * We also need this to make sure we insert (eq) after branches which use
1580 * helper invocations.
1581 */
1582 bool pred_uses_helpers = bd->uses_helpers_beginning;
1583 for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
1584 struct ir3_block *pred = block->physical_predecessors[i];
1585 struct ir3_helper_block_data *pred_bd = pred->data;
1586 if (pred_bd->uses_helpers_end) {
1587 pred_uses_helpers = true;
1588 break;
1589 }
1590 }
1591
1592 if (!pred_uses_helpers)
1593 continue;
1594
1595 /* The last use of helpers is somewhere between the beginning and the
1596 * end. first_instr will be the first instruction where helpers are no
1597 * longer required, or NULL if helpers are not required just at the end.
1598 */
1599 struct ir3_instruction *first_instr = NULL;
1600 foreach_instr_rev (instr, &block->instr_list) {
1601 /* Skip prefetches because they actually execute before the block
1602 * starts and at this stage they aren't guaranteed to be at the start
1603 * of the block.
1604 */
1605 if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH)
1606 break;
1607 first_instr = instr;
1608 }
1609
1610 bool killed = false;
1611 bool expensive_instruction_in_block = false;
1612 if (first_instr) {
1613 foreach_instr_from (instr, first_instr, &block->instr_list) {
1614 /* If there's already a nop, we don't have to worry about whether to
1615 * insert one.
1616 */
1617 if (instr->opc == OPC_NOP) {
1618 instr->flags |= IR3_INSTR_EQ;
1619 killed = true;
1620 break;
1621 }
1622
1623 /* ALU and SFU instructions probably aren't going to benefit much
1624 * from killing helper invocations, because they complete at least
1625 * an entire quad in a cycle and don't access any quad-divergent
1626 * memory, so delay emitting (eq) in the hopes that we find a nop
1627 * afterwards.
1628 */
1629 if (is_alu(instr) || is_sfu(instr))
1630 continue;
1631 if (instr->opc == OPC_PREDE)
1632 continue;
1633
1634 expensive_instruction_in_block = true;
1635 break;
1636 }
1637 }
1638
1639 /* If this block isn't the last block before the end instruction, assume
1640 * that there may be expensive instructions in later blocks so it's worth
1641 * it to insert a nop.
1642 */
1643 if (!killed && (expensive_instruction_in_block ||
1644 block->successors[0] != ir3_end_block(ir))) {
1645 struct ir3_instruction *nop = ir3_NOP(block);
1646 nop->flags |= IR3_INSTR_EQ;
1647 if (first_instr)
1648 ir3_instr_move_before(nop, first_instr);
1649 }
1650 }
1651 }
1652
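/* Entry point: iterate legalize_block() over all blocks until the sync state
 * converges, then expand macros, schedule branches and blocks, and resolve
 * jump offsets.
 */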
1653 bool
1654 ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
1655 {
1656 struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
1657 bool mergedregs = so->mergedregs;
1658 bool progress;
1659
1660 ctx->so = so;
1661 ctx->max_bary = -1;
1662 ctx->compiler = ir->compiler;
1663 ctx->type = ir->type;
1664
1665 /* allocate per-block data: */
1666 foreach_block (block, &ir->block_list) {
1667 struct ir3_legalize_block_data *bd =
1668 rzalloc(ctx, struct ir3_legalize_block_data);
1669
1670 regmask_init(&bd->state.needs_ss_war, mergedregs);
1671 regmask_init(&bd->state.needs_ss_or_sy_war, mergedregs);
1672 regmask_init(&bd->state.needs_ss_scalar_war, mergedregs);
1673 regmask_init(&bd->state.needs_ss_or_sy_scalar_war, mergedregs);
1674 regmask_init(&bd->state.needs_ss_scalar_full, mergedregs);
1675 regmask_init(&bd->state.needs_ss_scalar_half, mergedregs);
1676 regmask_init(&bd->state.needs_ss, mergedregs);
1677 regmask_init(&bd->state.needs_sy, mergedregs);
1678 regmask_init(&bd->begin_state.needs_ss_war, mergedregs);
1679 regmask_init(&bd->begin_state.needs_ss_or_sy_war, mergedregs);
1680 regmask_init(&bd->begin_state.needs_ss_scalar_war, mergedregs);
1681 regmask_init(&bd->begin_state.needs_ss_or_sy_scalar_war, mergedregs);
1682 regmask_init(&bd->begin_state.needs_ss_scalar_full, mergedregs);
1683 regmask_init(&bd->begin_state.needs_ss_scalar_half, mergedregs);
1684 regmask_init(&bd->begin_state.needs_ss, mergedregs);
1685 regmask_init(&bd->begin_state.needs_sy, mergedregs);
1686
1687 block->data = bd;
1688 }
1689
1690 /* We may have failed to pull all input loads into the first block.
1691 * In such a case, at the moment we aren't able to find a better place
1692 * for (ei) than the end of the program.
1693 * a5xx and a6xx do automatically release varying storage at the end.
1694 */
1695 ctx->early_input_release = true;
1696
1697 struct ir3_block *start_block = ir3_after_preamble(ir);
1698
1699 /* Gather information to determine whether we can enable early preamble.
1700 */
1701 bool gpr_in_preamble = false;
1702 bool pred_in_preamble = false;
1703 bool relative_in_preamble = false;
1704 bool in_preamble = start_block != ir3_start_block(ir);
1705 bool has_preamble = start_block != ir3_start_block(ir);
1706
1707 foreach_block (block, &ir->block_list) {
1708 if (block == start_block)
1709 in_preamble = false;
1710
1711 foreach_instr (instr, &block->instr_list) {
1712 if (is_input(instr)) {
1713 ctx->has_inputs = true;
1714 if (block != start_block) {
1715 ctx->early_input_release = false;
1716 }
1717 }
1718
1719 if (is_meta(instr))
1720 continue;
1721
1722 foreach_src (reg, instr) {
1723 if (in_preamble) {
1724 if (!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED)) &&
1725 is_reg_gpr(reg))
1726 gpr_in_preamble = true;
1727 if (reg->flags & IR3_REG_RELATIV)
1728 relative_in_preamble = true;
1729 }
1730 }
1731
1732 foreach_dst (reg, instr) {
1733 if (is_dest_gpr(reg)) {
1734 if (in_preamble) {
1735 if (!(reg->flags & IR3_REG_SHARED))
1736 gpr_in_preamble = true;
1737 if (reg->flags & IR3_REG_RELATIV)
1738 relative_in_preamble = true;
1739 }
1740 }
1741 }
1742
1743 if (in_preamble && writes_pred(instr)) {
1744 pred_in_preamble = true;
1745 }
1746 }
1747 }
1748
1749 so->early_preamble = has_preamble && !gpr_in_preamble &&
1750 !pred_in_preamble && !relative_in_preamble &&
1751 ir->compiler->has_early_preamble &&
1752 !(ir3_shader_debug & IR3_DBG_NOEARLYPREAMBLE);
1753
1754 /* On a7xx, sync behavior for a1.x is different in the early preamble. RaW
1755 * dependencies must be synchronized with (ss) and there must be an extra
1756 * (r) on the source of the mova1 instruction.
1757 */
1758 if (so->early_preamble && ir->compiler->gen >= 7) {
1759 foreach_block (block, &ir->block_list) {
1760 if (block == start_block)
1761 break;
1762 block->in_early_preamble = true;
1763 }
1764 }
1765
1766 assert(ctx->early_input_release || ctx->compiler->gen >= 5);
1767
1768 if (ir3_shader_debug & IR3_DBG_EXPANDRPT) {
1769 dbg_expand_rpt(ir);
1770 }
1771
1772 /* process each block: */
1773 do {
1774 progress = false;
1775 foreach_block (block, &ir->block_list) {
1776 progress |= legalize_block(ctx, block);
1777 }
1778 } while (progress);
1779
1780 *max_bary = ctx->max_bary;
1781
1782 foreach_block (block, &ir->block_list) {
1783 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1784 if (terminator && terminator->opc == OPC_GETONE) {
1785 apply_push_consts_load_macro(ctx, block->successors[0]);
1786 break;
1787 }
1788 }
1789
1790 block_sched(ir);
1791
1792 foreach_block (block, &ir->block_list) {
1793 progress |= apply_fine_deriv_macro(ctx, block);
1794 }
1795
1796 if (ir3_shader_debug & IR3_DBG_FULLSYNC) {
1797 dbg_sync_sched(ir, so);
1798 }
1799
1800 if (ir3_shader_debug & IR3_DBG_FULLNOP) {
1801 dbg_nop_sched(ir, so);
1802 }
1803
1804 bool cfg_changed = false;
1805 while (opt_jump(ir))
1806 cfg_changed = true;
1807
1808 prede_sched(ir);
1809
1810 if (cfg_changed)
1811 ir3_calc_reconvergence(so);
1812
1813 if (so->type == MESA_SHADER_FRAGMENT)
1814 kill_sched(ir, so);
1815
1816 /* TODO: does (eq) exist before a6xx? */
1817 if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
1818 so->compiler->gen >= 6)
1819 helper_sched(ctx, ir, so);
1820
1821 foreach_block (block, &ir->block_list) {
1822 progress |= expand_dummy_dests(block);
1823 }
1824
1825 ir3_count_instructions(ir);
1826 resolve_jumps(ir);
1827
1828 mark_xvergence_points(ir);
1829
1830 ralloc_free(ctx);
1831
1832 return true;
1833 }
1834