1 /*
2 * Copyright 2023 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "aco_ir.h"
7
8 #include "util/bitscan.h"
9 #include "util/macros.h"
10
11 #include <limits>
12
13 /*
14 * This pass implements a simple forward list-scheduler which works on a small
15 * partial DAG of 16 nodes at any time. Only ALU instructions are scheduled
16 * entirely freely. Memory load instructions must be kept in-order and any other
17 * instruction must not be re-scheduled at all.
18 *
19 * The main goal of this scheduler is to create more memory clauses, schedule
20 * memory loads early, and to improve ALU instruction level parallelism.
21 */
22
23 namespace aco {
24 namespace {
25
26 constexpr unsigned num_nodes = 16;
27 using mask_t = uint16_t;
28 static_assert(std::numeric_limits<mask_t>::digits >= num_nodes);
29
30 struct VOPDInfo {
VOPDInfoaco::__anonf73aa87b0111::VOPDInfo31 VOPDInfo() : is_opy_only(0), is_dst_odd(0), src_banks(0), has_literal(0), is_commutative(0) {}
32 uint16_t is_opy_only : 1;
33 uint16_t is_dst_odd : 1;
34 uint16_t src_banks : 10; /* 0-3: src0, 4-7: src1, 8-9: src2 */
35 uint16_t has_literal : 1;
36 uint16_t is_commutative : 1;
37 aco_opcode op = aco_opcode::num_opcodes;
38 uint32_t literal = 0;
39 };
40
41 struct InstrInfo {
42 Instruction* instr;
43 int32_t priority;
44 mask_t dependency_mask; /* bitmask of nodes which have to be scheduled before this node. */
45 uint8_t next_non_reorderable; /* index of next non-reorderable instruction node after this one. */
46 bool potential_clause; /* indicates that this instruction is not (yet) immediately followed by a
47 reorderable instruction. */
48 };
49
50 struct RegisterInfo {
51 mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */
52 int8_t latency; /* estimated latency of last register write. */
53 uint8_t direct_dependency : 4; /* node that has to be scheduled before any other access. */
54 uint8_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
55 uint8_t padding : 3;
56 };
57
58 struct SchedILPContext {
59 Program* program;
60 bool is_vopd = false;
61 InstrInfo nodes[num_nodes];
62 RegisterInfo regs[512];
63 mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
64 mask_t active_mask = 0; /* bitmask of valid instruction nodes. */
65 uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
66 uint8_t last_non_reorderable = UINT8_MAX; /* index of last node which should not be reordered. */
67
68 /* VOPD scheduler: */
69 VOPDInfo vopd[num_nodes];
70 VOPDInfo prev_vopd_info;
71 InstrInfo prev_info;
72
73 mask_t vopd_odd_mask = 0;
74 mask_t vopd_even_mask = 0;
75 };
76
77 /**
78 * Returns true for side-effect free SALU and VALU instructions.
79 */
80 bool
can_reorder(const Instruction * const instr)81 can_reorder(const Instruction* const instr)
82 {
83 if (instr->isVALU())
84 return true;
85 if (!instr->isSALU() || instr->isSOPP())
86 return false;
87
88 switch (instr->opcode) {
89 /* SOP2 */
90 case aco_opcode::s_cbranch_g_fork:
91 case aco_opcode::s_rfe_restore_b64:
92 /* SOP1 */
93 case aco_opcode::s_setpc_b64:
94 case aco_opcode::s_swappc_b64:
95 case aco_opcode::s_rfe_b64:
96 case aco_opcode::s_cbranch_join:
97 case aco_opcode::s_set_gpr_idx_idx:
98 case aco_opcode::s_sendmsg_rtn_b32:
99 case aco_opcode::s_sendmsg_rtn_b64:
100 case aco_opcode::s_barrier_signal:
101 case aco_opcode::s_barrier_signal_isfirst:
102 case aco_opcode::s_get_barrier_state:
103 case aco_opcode::s_barrier_init:
104 case aco_opcode::s_barrier_join:
105 case aco_opcode::s_wakeup_barrier:
106 /* SOPK */
107 case aco_opcode::s_cbranch_i_fork:
108 case aco_opcode::s_getreg_b32:
109 case aco_opcode::s_setreg_b32:
110 case aco_opcode::s_setreg_imm32_b32:
111 case aco_opcode::s_call_b64:
112 case aco_opcode::s_waitcnt_vscnt:
113 case aco_opcode::s_waitcnt_vmcnt:
114 case aco_opcode::s_waitcnt_expcnt:
115 case aco_opcode::s_waitcnt_lgkmcnt:
116 case aco_opcode::s_subvector_loop_begin:
117 case aco_opcode::s_subvector_loop_end:
118 /* SOPC */
119 case aco_opcode::s_setvskip:
120 case aco_opcode::s_set_gpr_idx_on: return false;
121 default: break;
122 }
123
124 return true;
125 }
126
127 VOPDInfo
get_vopd_info(const SchedILPContext & ctx,const Instruction * instr)128 get_vopd_info(const SchedILPContext& ctx, const Instruction* instr)
129 {
130 if (instr->format != Format::VOP1 && instr->format != Format::VOP2)
131 return VOPDInfo();
132
133 VOPDInfo info;
134 info.is_commutative = true;
135 switch (instr->opcode) {
136 case aco_opcode::v_fmac_f32: info.op = aco_opcode::v_dual_fmac_f32; break;
137 case aco_opcode::v_fmaak_f32: info.op = aco_opcode::v_dual_fmaak_f32; break;
138 case aco_opcode::v_fmamk_f32:
139 info.op = aco_opcode::v_dual_fmamk_f32;
140 info.is_commutative = false;
141 break;
142 case aco_opcode::v_mul_f32: info.op = aco_opcode::v_dual_mul_f32; break;
143 case aco_opcode::v_add_f32: info.op = aco_opcode::v_dual_add_f32; break;
144 case aco_opcode::v_sub_f32: info.op = aco_opcode::v_dual_sub_f32; break;
145 case aco_opcode::v_subrev_f32: info.op = aco_opcode::v_dual_subrev_f32; break;
146 case aco_opcode::v_mul_legacy_f32: info.op = aco_opcode::v_dual_mul_dx9_zero_f32; break;
147 case aco_opcode::v_mov_b32: info.op = aco_opcode::v_dual_mov_b32; break;
148 case aco_opcode::v_bfrev_b32:
149 if (!instr->operands[0].isConstant())
150 return VOPDInfo();
151 info.op = aco_opcode::v_dual_mov_b32;
152 break;
153 case aco_opcode::v_cndmask_b32:
154 info.op = aco_opcode::v_dual_cndmask_b32;
155 info.is_commutative = false;
156 break;
157 case aco_opcode::v_max_f32: info.op = aco_opcode::v_dual_max_f32; break;
158 case aco_opcode::v_min_f32: info.op = aco_opcode::v_dual_min_f32; break;
159 case aco_opcode::v_dot2c_f32_f16: info.op = aco_opcode::v_dual_dot2acc_f32_f16; break;
160 case aco_opcode::v_add_u32:
161 info.op = aco_opcode::v_dual_add_nc_u32;
162 info.is_opy_only = true;
163 break;
164 case aco_opcode::v_lshlrev_b32:
165 info.op = aco_opcode::v_dual_lshlrev_b32;
166 info.is_opy_only = true;
167 info.is_commutative = false;
168 break;
169 case aco_opcode::v_and_b32:
170 info.op = aco_opcode::v_dual_and_b32;
171 info.is_opy_only = true;
172 break;
173 default: return VOPDInfo();
174 }
175
176 /* Each instruction may use at most one SGPR. */
177 if (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[0].isOfType(RegType::sgpr))
178 return VOPDInfo();
179
180 info.is_dst_odd = instr->definitions[0].physReg().reg() & 0x1;
181
182 static const unsigned bank_mask[3] = {0x3, 0x3, 0x1};
183 bool has_sgpr = false;
184 for (unsigned i = 0; i < instr->operands.size(); i++) {
185 Operand op = instr->operands[i];
186 if (instr->opcode == aco_opcode::v_bfrev_b32)
187 op = Operand::get_const(ctx.program->gfx_level, util_bitreverse(op.constantValue()), 4);
188
189 unsigned port = (instr->opcode == aco_opcode::v_fmamk_f32 && i == 1) ? 2 : i;
190 if (op.isOfType(RegType::vgpr))
191 info.src_banks |= 1 << (port * 4 + (op.physReg().reg() & bank_mask[port]));
192
193 /* Check all operands because of fmaak/fmamk. */
194 if (op.isLiteral()) {
195 assert(!info.has_literal || info.literal == op.constantValue());
196 info.has_literal = true;
197 info.literal = op.constantValue();
198 }
199
200 /* Check all operands because of cndmask. */
201 has_sgpr |= !op.isConstant() && op.isOfType(RegType::sgpr);
202 }
203
204 /* An instruction can't use both a literal and an SGPR. */
205 if (has_sgpr && info.has_literal)
206 return VOPDInfo();
207
208 info.is_commutative &= instr->operands[0].isOfType(RegType::vgpr);
209
210 return info;
211 }
212
213 bool
is_vopd_compatible(const VOPDInfo & a,const VOPDInfo & b)214 is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
215 {
216 if ((a.is_opy_only && b.is_opy_only) || (a.is_dst_odd == b.is_dst_odd))
217 return false;
218
219 /* Both can use a literal, but it must be the same literal. */
220 if (a.has_literal && b.has_literal && a.literal != b.literal)
221 return false;
222
223 /* The rest is checking src VGPR bank compatibility. */
224 if ((a.src_banks & b.src_banks) == 0)
225 return true;
226
227 if (!a.is_commutative && !b.is_commutative)
228 return false;
229
230 uint16_t src0 = a.src_banks & 0xf;
231 uint16_t src1 = a.src_banks & 0xf0;
232 uint16_t src2 = a.src_banks & 0x300;
233 uint16_t a_src_banks = (src0 << 4) | (src1 >> 4) | src2;
234 if ((a_src_banks & b.src_banks) != 0)
235 return false;
236
237 /* If we have to turn v_mov_b32 into v_add_u32 but there is already an OPY-only instruction,
238 * we can't do it.
239 */
240 if (a.op == aco_opcode::v_dual_mov_b32 && !b.is_commutative && b.is_opy_only)
241 return false;
242 if (b.op == aco_opcode::v_dual_mov_b32 && !a.is_commutative && a.is_opy_only)
243 return false;
244
245 return true;
246 }
247
248 bool
can_use_vopd(const SchedILPContext & ctx,unsigned idx)249 can_use_vopd(const SchedILPContext& ctx, unsigned idx)
250 {
251 VOPDInfo cur_vopd = ctx.vopd[idx];
252 Instruction* first = ctx.nodes[idx].instr;
253 Instruction* second = ctx.prev_info.instr;
254
255 if (!second)
256 return false;
257
258 if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
259 return false;
260
261 if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd))
262 return false;
263
264 assert(first->definitions.size() == 1);
265 assert(first->definitions[0].size() == 1);
266 assert(second->definitions.size() == 1);
267 assert(second->definitions[0].size() == 1);
268
269 /* Check for WaW dependency. */
270 if (first->definitions[0].physReg() == second->definitions[0].physReg())
271 return false;
272
273 /* Check for RaW dependency. */
274 for (Operand op : second->operands) {
275 assert(op.size() == 1);
276 if (first->definitions[0].physReg() == op.physReg())
277 return false;
278 }
279
280 /* WaR dependencies are not a concern. */
281 return true;
282 }
283
284 unsigned
get_latency(const Instruction * const instr)285 get_latency(const Instruction* const instr)
286 {
287 /* Note, that these are not accurate latency estimations. */
288 if (instr->isVALU())
289 return 5;
290 if (instr->isSALU())
291 return 2;
292 if (instr->isVMEM() || instr->isFlatLike())
293 return 32;
294 if (instr->isSMEM())
295 return 5;
296 if (instr->accessesLDS())
297 return 2;
298
299 return 0;
300 }
301
302 bool
is_memory_instr(const Instruction * const instr)303 is_memory_instr(const Instruction* const instr)
304 {
305 /* For memory instructions, we allow to reorder them with ALU if it helps
306 * to form larger clauses or to increase def-use distances.
307 */
308 return instr->isVMEM() || instr->isFlatLike() || instr->isSMEM() || instr->accessesLDS() ||
309 instr->isEXP();
310 }
311
312 constexpr unsigned max_sgpr = 128;
313 constexpr unsigned min_vgpr = 256;
314
315 void
add_entry(SchedILPContext & ctx,Instruction * const instr,const uint32_t idx)316 add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
317 {
318 InstrInfo& entry = ctx.nodes[idx];
319 entry.instr = instr;
320 entry.priority = 0;
321 const mask_t mask = BITFIELD_BIT(idx);
322 bool reorder = can_reorder(instr);
323 ctx.active_mask |= mask;
324
325 if (ctx.is_vopd) {
326 VOPDInfo vopd = get_vopd_info(ctx, entry.instr);
327
328 ctx.vopd[idx] = vopd;
329 ctx.vopd_odd_mask &= ~mask;
330 ctx.vopd_odd_mask |= vopd.is_dst_odd ? mask : 0;
331 ctx.vopd_even_mask &= ~mask;
332 ctx.vopd_even_mask |= vopd.is_dst_odd || vopd.op == aco_opcode::num_opcodes ? 0 : mask;
333 }
334
335 for (const Operand& op : instr->operands) {
336 assert(op.isFixed());
337 unsigned reg = op.physReg();
338 if (reg >= max_sgpr && reg != scc && reg < min_vgpr) {
339 reorder &= reg != pops_exiting_wave_id;
340 continue;
341 }
342
343 for (unsigned i = 0; i < op.size(); i++) {
344 RegisterInfo& reg_info = ctx.regs[reg + i];
345
346 /* Add register reads. */
347 reg_info.read_mask |= mask;
348
349 int cycles_since_reg_write = num_nodes;
350 if (reg_info.has_direct_dependency) {
351 /* A previous dependency is still part of the DAG. */
352 entry.dependency_mask |= BITFIELD_BIT(reg_info.direct_dependency);
353 cycles_since_reg_write = ctx.nodes[reg_info.direct_dependency].priority;
354 }
355
356 if (reg_info.latency) {
357 /* Ignore and reset register latencies for memory loads and other non-reorderable
358 * instructions. We schedule these as early as possible anyways.
359 */
360 if (reorder && reg_info.latency > cycles_since_reg_write) {
361 entry.priority = MIN2(entry.priority, cycles_since_reg_write - reg_info.latency);
362
363 /* If a previous register write created some latency, ensure that this
364 * is the first read of the register by making this instruction a direct
365 * dependency of all following register reads.
366 */
367 reg_info.has_direct_dependency = 1;
368 reg_info.direct_dependency = idx;
369 }
370 reg_info.latency = 0;
371 }
372 }
373 }
374
375 /* Check if this instructions reads implicit registers. */
376 if (needs_exec_mask(instr)) {
377 for (unsigned reg = exec_lo; reg <= exec_hi; reg++) {
378 if (ctx.regs[reg].has_direct_dependency)
379 entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
380 ctx.regs[reg].read_mask |= mask;
381 }
382 }
383 if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
384 for (unsigned reg = flat_scr_lo; reg <= flat_scr_hi; reg++) {
385 if (ctx.regs[reg].has_direct_dependency)
386 entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
387 ctx.regs[reg].read_mask |= mask;
388 }
389 }
390
391 for (const Definition& def : instr->definitions) {
392 for (unsigned i = 0; i < def.size(); i++) {
393 RegisterInfo& reg_info = ctx.regs[def.physReg().reg() + i];
394
395 /* Add all previous register reads and writes to the dependencies. */
396 entry.dependency_mask |= reg_info.read_mask;
397 reg_info.read_mask = mask;
398
399 /* This register write is a direct dependency for all following reads. */
400 reg_info.has_direct_dependency = 1;
401 reg_info.direct_dependency = idx;
402
403 if (!ctx.is_vopd) {
404 /* Add latency information for the next register read. */
405 reg_info.latency = get_latency(instr);
406 }
407 }
408 }
409
410 if (!reorder) {
411 ctx.non_reorder_mask |= mask;
412
413 /* Set this node as last non-reorderable instruction */
414 if (ctx.next_non_reorderable == UINT8_MAX) {
415 ctx.next_non_reorderable = idx;
416 } else {
417 ctx.nodes[ctx.last_non_reorderable].next_non_reorderable = idx;
418 }
419 ctx.last_non_reorderable = idx;
420 entry.next_non_reorderable = UINT8_MAX;
421
422 /* Just don't reorder these at all. */
423 if (!is_memory_instr(instr) || instr->definitions.empty() ||
424 get_sync_info(instr).semantics & semantic_volatile || ctx.is_vopd) {
425 /* Add all previous instructions as dependencies. */
426 entry.dependency_mask = ctx.active_mask;
427 }
428
429 /* Remove non-reorderable instructions from dependencies, since WaR dependencies can interfere
430 * with clause formation. This should be fine, since these are always scheduled in-order and
431 * any cases that are actually a concern for clause formation are added as transitive
432 * dependencies. */
433 entry.dependency_mask &= ~ctx.non_reorder_mask;
434 entry.potential_clause = true;
435 } else if (ctx.last_non_reorderable != UINT8_MAX) {
436 ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
437 }
438
439 entry.dependency_mask &= ~mask;
440
441 for (unsigned i = 0; i < num_nodes; i++) {
442 if (!ctx.nodes[i].instr || i == idx)
443 continue;
444
445 /* Add transitive dependencies. */
446 if (entry.dependency_mask & BITFIELD_BIT(i))
447 entry.dependency_mask |= ctx.nodes[i].dependency_mask;
448
449 /* increment base priority */
450 ctx.nodes[i].priority++;
451 }
452 }
453
454 void
remove_entry(SchedILPContext & ctx,const Instruction * const instr,const uint32_t idx)455 remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_t idx)
456 {
457 const mask_t mask = ~BITFIELD_BIT(idx);
458 ctx.active_mask &= mask;
459
460 for (const Operand& op : instr->operands) {
461 const unsigned reg = op.physReg();
462 if (reg >= max_sgpr && reg != scc && reg < min_vgpr)
463 continue;
464
465 for (unsigned i = 0; i < op.size(); i++) {
466 RegisterInfo& reg_info = ctx.regs[reg + i];
467 reg_info.read_mask &= mask;
468 reg_info.has_direct_dependency &= reg_info.direct_dependency != idx;
469 }
470 }
471 if (needs_exec_mask(instr)) {
472 ctx.regs[exec_lo].read_mask &= mask;
473 ctx.regs[exec_hi].read_mask &= mask;
474 }
475 if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
476 ctx.regs[flat_scr_lo].read_mask &= mask;
477 ctx.regs[flat_scr_hi].read_mask &= mask;
478 }
479 for (const Definition& def : instr->definitions) {
480 for (unsigned i = 0; i < def.size(); i++) {
481 unsigned reg = def.physReg().reg() + i;
482 ctx.regs[reg].read_mask &= mask;
483 ctx.regs[reg].has_direct_dependency &= ctx.regs[reg].direct_dependency != idx;
484 }
485 }
486
487 for (unsigned i = 0; i < num_nodes; i++)
488 ctx.nodes[i].dependency_mask &= mask;
489
490 if (ctx.next_non_reorderable == idx) {
491 ctx.non_reorder_mask &= mask;
492 ctx.next_non_reorderable = ctx.nodes[idx].next_non_reorderable;
493 if (ctx.last_non_reorderable == idx)
494 ctx.last_non_reorderable = UINT8_MAX;
495 }
496 }
497
498 /**
499 * Returns a bitfield of nodes which have to be scheduled before the
500 * next non-reorderable instruction.
501 * If the next non-reorderable instruction can form a clause, returns the
502 * dependencies of the entire clause.
503 */
504 mask_t
collect_clause_dependencies(const SchedILPContext & ctx,const uint8_t next,mask_t clause_mask)505 collect_clause_dependencies(const SchedILPContext& ctx, const uint8_t next, mask_t clause_mask)
506 {
507 const InstrInfo& entry = ctx.nodes[next];
508 mask_t dependencies = entry.dependency_mask;
509 clause_mask |= (entry.potential_clause << next);
510
511 if (!is_memory_instr(entry.instr))
512 return dependencies;
513
514 /* If this is potentially an "open" clause, meaning that the clause might
515 * consist of instruction not yet added to the DAG, consider all previous
516 * instructions as dependencies. This prevents splitting of larger, already
517 * formed clauses.
518 */
519 if (next == ctx.last_non_reorderable && entry.potential_clause)
520 return (~clause_mask & ctx.active_mask) | dependencies;
521
522 if (entry.next_non_reorderable == UINT8_MAX)
523 return dependencies;
524
525 /* Check if this can form a clause with the following non-reorderable instruction */
526 if (should_form_clause(entry.instr, ctx.nodes[entry.next_non_reorderable].instr)) {
527 mask_t clause_deps =
528 collect_clause_dependencies(ctx, entry.next_non_reorderable, clause_mask);
529
530 /* if the following clause is independent from us, add their dependencies */
531 if (!(clause_deps & BITFIELD_BIT(next)))
532 dependencies |= clause_deps;
533 }
534
535 return dependencies;
536 }
537
538 /**
539 * Returns the index of the next instruction to be selected.
540 */
541 unsigned
select_instruction_ilp(const SchedILPContext & ctx)542 select_instruction_ilp(const SchedILPContext& ctx)
543 {
544 mask_t mask = ctx.active_mask;
545
546 /* First, collect all dependencies of the next non-reorderable instruction(s).
547 * These make up the list of possible candidates.
548 */
549 if (ctx.next_non_reorderable != UINT8_MAX)
550 mask = collect_clause_dependencies(ctx, ctx.next_non_reorderable, 0);
551
552 /* If the next non-reorderable instruction has no dependencies, select it */
553 if (mask == 0)
554 return ctx.next_non_reorderable;
555
556 /* Otherwise, select the instruction with highest priority of all candidates. */
557 unsigned idx = -1u;
558 int32_t priority = INT32_MIN;
559 u_foreach_bit (i, mask) {
560 const InstrInfo& candidate = ctx.nodes[i];
561
562 /* Check if the candidate has pending dependencies. */
563 if (candidate.dependency_mask)
564 continue;
565
566 if (idx == -1u || candidate.priority > priority) {
567 idx = i;
568 priority = candidate.priority;
569 }
570 }
571
572 assert(idx != -1u);
573 return idx;
574 }
575
576 bool
compare_nodes_vopd(const SchedILPContext & ctx,int num_vopd_odd_minus_even,bool * use_vopd,unsigned current,unsigned candidate)577 compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
578 unsigned current, unsigned candidate)
579 {
580 if (can_use_vopd(ctx, candidate)) {
581 /* If we can form a VOPD instruction, always prefer to do so. */
582 if (!*use_vopd) {
583 *use_vopd = true;
584 return true;
585 }
586 } else {
587 if (*use_vopd)
588 return false;
589
590 /* Neither current nor candidate can form a VOPD instruction with the previously scheduled
591 * instruction. */
592 VOPDInfo current_vopd = ctx.vopd[current];
593 VOPDInfo candidate_vopd = ctx.vopd[candidate];
594
595 /* Delay scheduling VOPD-capable instructions in case an opportunity appears later. */
596 bool current_vopd_capable = current_vopd.op != aco_opcode::num_opcodes;
597 bool candidate_vopd_capable = candidate_vopd.op != aco_opcode::num_opcodes;
598 if (current_vopd_capable != candidate_vopd_capable)
599 return !candidate_vopd_capable;
600
601 /* If we have to select from VOPD-capable instructions, prefer maintaining a balance of
602 * odd/even instructions, in case selecting this instruction fails to make a pair.
603 */
604 if (current_vopd_capable && num_vopd_odd_minus_even != 0) {
605 assert(candidate_vopd_capable);
606 bool prefer_vopd_dst_odd = num_vopd_odd_minus_even > 0;
607 if (current_vopd.is_dst_odd != candidate_vopd.is_dst_odd)
608 return prefer_vopd_dst_odd ? candidate_vopd.is_dst_odd : !candidate_vopd.is_dst_odd;
609 }
610 }
611
612 return ctx.nodes[candidate].priority > ctx.nodes[current].priority;
613 }
614
615 unsigned
select_instruction_vopd(const SchedILPContext & ctx,bool * use_vopd)616 select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
617 {
618 *use_vopd = false;
619
620 mask_t mask = ctx.active_mask;
621 if (ctx.next_non_reorderable != UINT8_MAX)
622 mask = ctx.nodes[ctx.next_non_reorderable].dependency_mask;
623
624 if (mask == 0)
625 return ctx.next_non_reorderable;
626
627 int num_vopd_odd_minus_even =
628 (int)util_bitcount(ctx.vopd_odd_mask & mask) - (int)util_bitcount(ctx.vopd_even_mask & mask);
629
630 unsigned cur = -1u;
631 u_foreach_bit (i, mask) {
632 const InstrInfo& candidate = ctx.nodes[i];
633
634 /* Check if the candidate has pending dependencies. */
635 if (candidate.dependency_mask)
636 continue;
637
638 if (cur == -1u) {
639 cur = i;
640 *use_vopd = can_use_vopd(ctx, i);
641 } else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) {
642 cur = i;
643 }
644 }
645
646 assert(cur != -1u);
647 return cur;
648 }
649
650 void
get_vopd_opcode_operands(const SchedILPContext & ctx,Instruction * instr,const VOPDInfo & info,bool swap,aco_opcode * op,unsigned * num_operands,Operand * operands)651 get_vopd_opcode_operands(const SchedILPContext& ctx, Instruction* instr, const VOPDInfo& info,
652 bool swap, aco_opcode* op, unsigned* num_operands, Operand* operands)
653 {
654 *op = info.op;
655 *num_operands += instr->operands.size();
656 std::copy(instr->operands.begin(), instr->operands.end(), operands);
657
658 if (instr->opcode == aco_opcode::v_bfrev_b32) {
659 operands[0] = Operand::get_const(ctx.program->gfx_level,
660 util_bitreverse(operands[0].constantValue()), 4);
661 }
662
663 if (swap && info.op == aco_opcode::v_dual_mov_b32) {
664 *op = aco_opcode::v_dual_add_nc_u32;
665 (*num_operands)++;
666 operands[1] = operands[0];
667 operands[0] = Operand::zero();
668 } else if (swap) {
669 if (info.op == aco_opcode::v_dual_sub_f32)
670 *op = aco_opcode::v_dual_subrev_f32;
671 else if (info.op == aco_opcode::v_dual_subrev_f32)
672 *op = aco_opcode::v_dual_sub_f32;
673 std::swap(operands[0], operands[1]);
674 }
675 }
676
677 Instruction*
create_vopd_instruction(const SchedILPContext & ctx,unsigned idx)678 create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
679 {
680 Instruction* x = ctx.prev_info.instr;
681 Instruction* y = ctx.nodes[idx].instr;
682 VOPDInfo x_info = ctx.prev_vopd_info;
683 VOPDInfo y_info = ctx.vopd[idx];
684
685 bool swap_x = false, swap_y = false;
686 if (x_info.src_banks & y_info.src_banks) {
687 assert(x_info.is_commutative || y_info.is_commutative);
688 /* Avoid swapping v_mov_b32 because it will become an OPY-only opcode. */
689 if (x_info.op == aco_opcode::v_dual_mov_b32 && !y_info.is_commutative) {
690 swap_x = true;
691 x_info.is_opy_only = true;
692 } else {
693 swap_x = x_info.is_commutative && x_info.op != aco_opcode::v_dual_mov_b32;
694 swap_y = y_info.is_commutative && !swap_x;
695 }
696 }
697
698 if (x_info.is_opy_only) {
699 std::swap(x, y);
700 std::swap(x_info, y_info);
701 std::swap(swap_x, swap_y);
702 }
703
704 aco_opcode x_op, y_op;
705 unsigned num_operands = 0;
706 Operand operands[6];
707 get_vopd_opcode_operands(ctx, x, x_info, swap_x, &x_op, &num_operands, operands);
708 get_vopd_opcode_operands(ctx, y, y_info, swap_y, &y_op, &num_operands, operands + num_operands);
709
710 Instruction* instr = create_instruction(x_op, Format::VOPD, num_operands, 2);
711 instr->vopd().opy = y_op;
712 instr->definitions[0] = x->definitions[0];
713 instr->definitions[1] = y->definitions[0];
714 std::copy(operands, operands + num_operands, instr->operands.begin());
715
716 return instr;
717 }
718
719 template <typename It>
720 void
do_schedule(SchedILPContext & ctx,It & insert_it,It & remove_it,It instructions_begin,It instructions_end)721 do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_begin,
722 It instructions_end)
723 {
724 for (unsigned i = 0; i < num_nodes; i++) {
725 if (remove_it == instructions_end)
726 break;
727
728 add_entry(ctx, (remove_it++)->get(), i);
729 }
730
731 ctx.prev_info.instr = NULL;
732 bool use_vopd = false;
733
734 while (ctx.active_mask) {
735 unsigned next_idx =
736 ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx);
737 Instruction* next_instr = ctx.nodes[next_idx].instr;
738
739 if (use_vopd) {
740 std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx));
741 ctx.prev_info.instr = NULL;
742 } else {
743 (insert_it++)->reset(next_instr);
744 ctx.prev_info = ctx.nodes[next_idx];
745 ctx.prev_vopd_info = ctx.vopd[next_idx];
746 }
747
748 remove_entry(ctx, next_instr, next_idx);
749 ctx.nodes[next_idx].instr = NULL;
750
751 if (remove_it != instructions_end) {
752 add_entry(ctx, (remove_it++)->get(), next_idx);
753 } else if (ctx.last_non_reorderable != UINT8_MAX) {
754 ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
755 ctx.last_non_reorderable = UINT8_MAX;
756 }
757 }
758 }
759
760 } // namespace
761
762 void
schedule_ilp(Program * program)763 schedule_ilp(Program* program)
764 {
765 SchedILPContext ctx = {program};
766
767 for (Block& block : program->blocks) {
768 auto it = block.instructions.begin();
769 auto insert_it = block.instructions.begin();
770 do_schedule(ctx, insert_it, it, block.instructions.begin(), block.instructions.end());
771 block.instructions.resize(insert_it - block.instructions.begin());
772 }
773 }
774
775 void
schedule_vopd(Program * program)776 schedule_vopd(Program* program)
777 {
778 if (program->gfx_level < GFX11 || program->wave_size != 32)
779 return;
780
781 SchedILPContext ctx = {program};
782 ctx.is_vopd = true;
783
784 for (Block& block : program->blocks) {
785 auto it = block.instructions.rbegin();
786 auto insert_it = block.instructions.rbegin();
787 do_schedule(ctx, insert_it, it, block.instructions.rbegin(), block.instructions.rend());
788 block.instructions.erase(block.instructions.begin(), insert_it.base());
789 }
790 }
791
792 } // namespace aco
793