/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_builder.h"
#include "aco_ir.h"
#include "aco_util.h"

#include "common/ac_descriptors.h"
#include "common/sid.h"

#include <algorithm>
#include <cstring>
#include <map>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <vector>

namespace std {
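/* Hash a Temp by bit-casting it to a uint32_t: Temp packs its id and register
 * class into a single dword, and the memcpy avoids strict-aliasing issues.
 */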
template <> struct hash<aco::Temp> {
   size_t operator()(aco::Temp temp) const noexcept
   {
      uint32_t v;
      std::memcpy(&v, &temp, sizeof(temp));
      return std::hash<uint32_t>{}(v);
   }
};
} // namespace std

/*
 * Implements the spilling algorithm on SSA-form based on
 * "Register Spilling and Live-Range Splitting for SSA-Form Programs"
 * by Matthias Braun and Sebastian Hack.
 *
 * The key difference between this algorithm and the min-algorithm from the
 * paper is the use of average use distances rather than next-use distances
 * per instruction.
 * As we decrement the number of remaining uses, the average use distances
 * give an approximation of the next-use distances while being computationally
 * and memory-wise less expensive.
 */
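/*
 * For illustration: a value whose last use is at linear instruction index 120
 * and which has 4 remaining uses gets score 120 / 4 = 30. Values with higher
 * scores are assumed to have their uses farther away and are preferred as
 * spill candidates.
 */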

namespace aco {

namespace {

struct remat_info {
   Instruction* instr;
};

struct loop_info {
   uint32_t index;
   aco::unordered_map<Temp, uint32_t> spills;
   IDSet live_in;
};

struct use_info {
   uint32_t num_uses = 0;
   uint32_t last_use = 0;
   float score() { return (float)last_use / num_uses; }
};

struct spill_ctx {
   RegisterDemand target_pressure;
   Program* program;
   aco::monotonic_buffer_resource memory;

   std::vector<aco::map<Temp, Temp>> renames;
   std::vector<aco::unordered_map<Temp, uint32_t>> spills_entry;
   std::vector<aco::unordered_map<Temp, uint32_t>> spills_exit;

   std::vector<bool> processed;
   std::vector<loop_info> loop;

   std::vector<use_info> ssa_infos;
   std::vector<std::pair<RegClass, std::unordered_set<uint32_t>>> interferences;
   std::vector<std::vector<uint32_t>> affinities;
   std::vector<bool> is_reloaded;
   aco::unordered_map<Temp, remat_info> remat;
   std::set<Instruction*> unused_remats;
   unsigned wave_size;

   unsigned sgpr_spill_slots;
   unsigned vgpr_spill_slots;
   Temp scratch_rsrc;

   spill_ctx(const RegisterDemand target_pressure_, Program* program_)
       : target_pressure(target_pressure_), program(program_), memory(),
         renames(program->blocks.size(), aco::map<Temp, Temp>(memory)),
         spills_entry(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
         spills_exit(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
         processed(program->blocks.size(), false), ssa_infos(program->peekAllocationId()),
         remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0)
   {}

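   /* Affinities group spill ids that should end up in the same spill slot,
    * e.g. a spilled phi definition and its spilled operands (the groups are
    * honored later in assign_spill_slots). */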
   void add_affinity(uint32_t first, uint32_t second)
   {
      unsigned found_first = affinities.size();
      unsigned found_second = affinities.size();
      for (unsigned i = 0; i < affinities.size(); i++) {
         std::vector<uint32_t>& vec = affinities[i];
         for (uint32_t entry : vec) {
            if (entry == first)
               found_first = i;
            else if (entry == second)
               found_second = i;
         }
      }
      if (found_first == affinities.size() && found_second == affinities.size()) {
         affinities.emplace_back(std::vector<uint32_t>({first, second}));
      } else if (found_first < affinities.size() && found_second == affinities.size()) {
         affinities[found_first].push_back(second);
      } else if (found_second < affinities.size() && found_first == affinities.size()) {
         affinities[found_second].push_back(first);
      } else if (found_first != found_second) {
         /* merge second into first */
         affinities[found_first].insert(affinities[found_first].end(),
                                        affinities[found_second].begin(),
                                        affinities[found_second].end());
         affinities.erase(std::next(affinities.begin(), found_second));
      } else {
         assert(found_first == found_second);
      }
   }

   uint32_t add_to_spills(Temp to_spill, aco::unordered_map<Temp, uint32_t>& spills)
   {
      const uint32_t spill_id = allocate_spill_id(to_spill.regClass());
      for (auto pair : spills)
         add_interference(spill_id, pair.second);
      if (!loop.empty()) {
         for (auto pair : loop.back().spills)
            add_interference(spill_id, pair.second);
      }

      spills[to_spill] = spill_id;
      return spill_id;
   }

   void add_interference(uint32_t first, uint32_t second)
   {
      if (interferences[first].first.type() != interferences[second].first.type())
         return;

      bool inserted = interferences[first].second.insert(second).second;
      if (inserted)
         interferences[second].second.insert(first);
   }

   uint32_t allocate_spill_id(RegClass rc)
   {
      interferences.emplace_back(rc, std::unordered_set<uint32_t>());
      is_reloaded.push_back(false);
      return next_spill_id++;
   }

   uint32_t next_spill_id = 0;
};

/**
 * Gathers information about the number of uses and point of last use
 * per SSA value.
 *
 * Phi definitions are added to live-ins.
 */
void
gather_ssa_use_info(spill_ctx& ctx)
{
   unsigned instruction_idx = 0;
   for (Block& block : ctx.program->blocks) {
      for (int i = block.instructions.size() - 1; i >= 0; i--) {
         aco_ptr<Instruction>& instr = block.instructions[i];
         for (const Operand& op : instr->operands) {
            if (op.isTemp()) {
               use_info& info = ctx.ssa_infos[op.tempId()];
               info.num_uses++;
               info.last_use = std::max(info.last_use, instruction_idx + i);
            }
         }
      }

      /* All live-in variables at loop headers get an additional artificial use.
       * As we decrement the number of uses while processing the blocks, this
       * ensures that the number of uses won't become zero before the loop
       * (and the variables' live-ranges) end.
       */
      if (block.kind & block_kind_loop_header) {
         for (unsigned t : ctx.program->live.live_in[block.index])
            ctx.ssa_infos[t].num_uses++;
      }

      instruction_idx += block.instructions.size();
   }
}

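/* For illustration: a single-definition VOP1 v_mov_b32 of an inline constant
 * or an s_movk_i32 qualifies, while any instruction reading a temporary or
 * producing multiple definitions does not.
 */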
bool
should_rematerialize(aco_ptr<Instruction>& instr)
{
   /* TODO: rematerialization is only supported for VOP1, SOP1, SOPK and PSEUDO */
   if (instr->format != Format::VOP1 && instr->format != Format::SOP1 &&
       instr->format != Format::PSEUDO && instr->format != Format::SOPK)
      return false;
   /* TODO: pseudo-instruction rematerialization is only supported for
    * p_create_vector/p_parallelcopy */
   if (instr->isPseudo() && instr->opcode != aco_opcode::p_create_vector &&
       instr->opcode != aco_opcode::p_parallelcopy)
      return false;
   if (instr->isSOPK() && instr->opcode != aco_opcode::s_movk_i32)
      return false;

   for (const Operand& op : instr->operands) {
      /* TODO: rematerialization using temporaries isn't yet supported */
      if (!op.isConstant())
         return false;
   }

   /* TODO: rematerialization with multiple definitions isn't yet supported */
   if (instr->definitions.size() > 1)
      return false;

   return true;
}

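/* Either recreates a rematerializable value from its (constant-only) defining
 * instruction, or emits a p_reload pseudo-instruction that assign_spill_slots()
 * later lowers to loads from the spill slot.
 */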
aco_ptr<Instruction>
do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id)
{
   auto remat = ctx.remat.find(tmp);
   if (remat != ctx.remat.end()) {
      Instruction* instr = remat->second.instr;
      assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) &&
             "unsupported");
      assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector ||
              instr->opcode == aco_opcode::p_parallelcopy) &&
             "unsupported");
      assert(instr->definitions.size() == 1 && "unsupported");

      aco_ptr<Instruction> res;
      res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(),
                                   instr->definitions.size()));
      if (instr->isSOPK())
         res->salu().imm = instr->salu().imm;

      for (unsigned i = 0; i < instr->operands.size(); i++) {
         res->operands[i] = instr->operands[i];
         if (instr->operands[i].isTemp()) {
            assert(false && "unsupported");
            if (ctx.remat.count(instr->operands[i].getTemp()))
               ctx.unused_remats.erase(ctx.remat[instr->operands[i].getTemp()].instr);
         }
      }
      res->definitions[0] = Definition(new_name);
      return res;
   } else {
      aco_ptr<Instruction> reload{create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)};
      reload->operands[0] = Operand::c32(spill_id);
      reload->definitions[0] = Definition(new_name);
      ctx.is_reloaded[spill_id] = true;
      return reload;
   }
}

void
get_rematerialize_info(spill_ctx& ctx)
{
   for (Block& block : ctx.program->blocks) {
      bool logical = false;
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (instr->opcode == aco_opcode::p_logical_start)
            logical = true;
         else if (instr->opcode == aco_opcode::p_logical_end)
            logical = false;
         if (logical && should_rematerialize(instr)) {
            for (const Definition& def : instr->definitions) {
               if (def.isTemp()) {
                  ctx.remat[def.getTemp()] = remat_info{instr.get()};
                  ctx.unused_remats.insert(instr.get());
               }
            }
         }
      }
   }
}

RegisterDemand
init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx)
{
   RegisterDemand spilled_registers;

   /* first block, nothing was spilled before */
   if (block->linear_preds.empty())
      return {0, 0};

   /* live-in variables at the beginning of the current block */
   const IDSet& live_in = ctx.program->live.live_in[block_idx];

   /* loop header block */
   if (block->kind & block_kind_loop_header) {
      assert(block->linear_preds[0] == block_idx - 1);
      assert(block->logical_preds[0] == block_idx - 1);

      /* check how many live-through variables should be spilled */
      RegisterDemand reg_pressure = block->live_in_demand;
      RegisterDemand loop_demand = reg_pressure;
      unsigned i = block_idx;
      while (ctx.program->blocks[i].loop_nest_depth >= block->loop_nest_depth)
         loop_demand.update(ctx.program->blocks[i++].register_demand);

      for (auto spilled : ctx.spills_exit[block_idx - 1]) {
         /* variable is not live at loop entry: probably a phi operand */
         if (!live_in.count(spilled.first.id()))
            continue;

         /* keep live-through variables spilled */
         ctx.spills_entry[block_idx][spilled.first] = spilled.second;
         spilled_registers += spilled.first;
         loop_demand -= spilled.first;
      }
      if (!ctx.loop.empty()) {
         /* If this is a nested loop, keep variables from the outer loop spilled. */
         for (auto spilled : ctx.loop.back().spills) {
            /* If the inner loop comes after the last continue statement of the outer loop,
             * the loop-carried variables might not be live-in for the inner loop.
             */
            if (live_in.count(spilled.first.id()) &&
                ctx.spills_entry[block_idx].insert(spilled).second) {
               spilled_registers += spilled.first;
               loop_demand -= spilled.first;
            }
         }
      }

      /* select more live-through variables and constants */
      RegType type = RegType::vgpr;
      while (loop_demand.exceeds(ctx.target_pressure)) {
         /* if VGPR demand is low enough, select SGPRs */
         if (type == RegType::vgpr && loop_demand.vgpr <= ctx.target_pressure.vgpr)
            type = RegType::sgpr;
         /* if SGPR demand is low enough, break */
         if (type == RegType::sgpr && loop_demand.sgpr <= ctx.target_pressure.sgpr)
            break;

         float score = 0.0;
         unsigned remat = 0;
         Temp to_spill;
         for (unsigned t : live_in) {
            Temp var = Temp(t, ctx.program->temp_rc[t]);
            if (var.type() != type || ctx.spills_entry[block_idx].count(var) ||
                var.regClass().is_linear_vgpr())
               continue;

            unsigned can_remat = ctx.remat.count(var);
            if (can_remat > remat || (can_remat == remat && ctx.ssa_infos[t].score() > score)) {
               to_spill = var;
               score = ctx.ssa_infos[t].score();
               remat = can_remat;
            }
         }

         /* select SGPRs or break */
         if (score == 0.0) {
            if (type == RegType::sgpr)
               break;
            type = RegType::sgpr;
            continue;
         }

         ctx.add_to_spills(to_spill, ctx.spills_entry[block_idx]);
         spilled_registers += to_spill;
         loop_demand -= to_spill;
      }

      /* create new loop_info */
      loop_info info = {block_idx, ctx.spills_entry[block_idx], live_in};
      ctx.loop.emplace_back(std::move(info));

      /* shortcut */
      if (!loop_demand.exceeds(ctx.target_pressure))
         return spilled_registers;

      /* if reg pressure is too high at beginning of loop, add variables with furthest use */
      reg_pressure -= spilled_registers;

      while (reg_pressure.exceeds(ctx.target_pressure)) {
         float score = 0;
         Temp to_spill;
         type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;
         for (aco_ptr<Instruction>& phi : block->instructions) {
            if (!is_phi(phi))
               break;
            if (!phi->definitions[0].isTemp() || phi->definitions[0].isKill())
               continue;
            Temp var = phi->definitions[0].getTemp();
            if (var.type() == type && !ctx.spills_entry[block_idx].count(var) &&
                ctx.ssa_infos[var.id()].score() > score) {
               to_spill = var;
               score = ctx.ssa_infos[var.id()].score();
            }
         }
         assert(score != 0.0);
         ctx.add_to_spills(to_spill, ctx.spills_entry[block_idx]);
         spilled_registers += to_spill;
         reg_pressure -= to_spill;
      }

      return spilled_registers;
   }

   /* branch block */
   if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) {
      /* keep variables spilled */
      unsigned pred_idx = block->linear_preds[0];
      for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
         if (pair.first.type() != RegType::sgpr)
            continue;

         if (live_in.count(pair.first.id())) {
            spilled_registers += pair.first;
            ctx.spills_entry[block_idx].emplace(pair);
         }
      }

      if (block->logical_preds.empty())
         return spilled_registers;

      pred_idx = block->logical_preds[0];
      for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
         if (pair.first.type() != RegType::vgpr)
            continue;

         if (live_in.count(pair.first.id())) {
            spilled_registers += pair.first;
            ctx.spills_entry[block_idx].emplace(pair);
         }
      }

      return spilled_registers;
   }

   /* else: merge block */
   std::map<Temp, bool> partial_spills;

   /* keep variables spilled on all incoming paths */
   for (unsigned t : live_in) {
      const RegClass rc = ctx.program->temp_rc[t];
      Temp var = Temp(t, rc);
      Block::edge_vec& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;

      /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload
       * it. Otherwise, if any predecessor reloads it, ensure it's reloaded on all other
       * predecessors. The idea is that it's better in practice to rematerialize redundantly than
       * to create lots of phis. */
      const bool remat = ctx.remat.count(var);
      /* If the variable is spilled at the current loop-header, spilling is essentially for free
       * while reloading is not. Thus, keep it spilled if it is at least partially spilled.
       */
      const bool avoid_respill = block->loop_nest_depth && ctx.loop.back().spills.count(var);
      bool spill = true;
      bool partial_spill = false;
      uint32_t spill_id = 0;
      for (unsigned pred_idx : preds) {
         if (!ctx.spills_exit[pred_idx].count(var)) {
            spill = false;
         } else {
            partial_spill = true;
            /* it might be that on one incoming path, the variable has a different spill_id, but
             * add_coupling_code() will take care of that. */
            spill_id = ctx.spills_exit[pred_idx][var];
         }
      }
      spill |= (remat && partial_spill);
      spill |= (avoid_respill && partial_spill);
      if (spill) {
         ctx.spills_entry[block_idx][var] = spill_id;
         partial_spills.erase(var);
         spilled_registers += var;
      } else {
         partial_spills[var] = partial_spill;
      }
   }

   /* same for phis */
   for (aco_ptr<Instruction>& phi : block->instructions) {
      if (!is_phi(phi))
         break;
      if (!phi->definitions[0].isTemp() || phi->definitions[0].isKill())
         continue;

      Block::edge_vec& preds =
         phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
      bool is_all_undef = true;
      bool is_all_spilled = true;
      bool is_partial_spill = false;
      for (unsigned i = 0; i < phi->operands.size(); i++) {
         if (phi->operands[i].isUndefined())
            continue;
         bool spilled = phi->operands[i].isTemp() &&
                        ctx.spills_exit[preds[i]].count(phi->operands[i].getTemp());
         is_all_spilled &= spilled;
         is_partial_spill |= spilled;
         is_all_undef = false;
      }

      if (is_all_spilled && !is_all_undef) {
         /* The phi is spilled at all predecessors. Keep it spilled. */
         ctx.add_to_spills(phi->definitions[0].getTemp(), ctx.spills_entry[block_idx]);
         spilled_registers += phi->definitions[0].getTemp();
         partial_spills.erase(phi->definitions[0].getTemp());
      } else {
         /* Phis might increase the register pressure. */
         partial_spills[phi->definitions[0].getTemp()] = is_partial_spill;
      }
   }

   /* if reg pressure at first instruction is still too high, add partially spilled variables */
   RegisterDemand reg_pressure = block->live_in_demand;
   reg_pressure -= spilled_registers;

   while (reg_pressure.exceeds(ctx.target_pressure)) {
      assert(!partial_spills.empty());
      std::map<Temp, bool>::iterator it = partial_spills.begin();
      Temp to_spill = Temp();
      bool is_partial_spill = false;
      float score = 0.0;
      RegType type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;

      while (it != partial_spills.end()) {
         assert(!ctx.spills_entry[block_idx].count(it->first));

         if (it->first.type() == type && !it->first.regClass().is_linear_vgpr() &&
             ((it->second && !is_partial_spill) ||
              (it->second == is_partial_spill && ctx.ssa_infos[it->first.id()].score() > score))) {
            score = ctx.ssa_infos[it->first.id()].score();
            to_spill = it->first;
            is_partial_spill = it->second;
         }
         ++it;
      }
      assert(score != 0.0);
      ctx.add_to_spills(to_spill, ctx.spills_entry[block_idx]);
      partial_spills.erase(to_spill);
      spilled_registers += to_spill;
      reg_pressure -= to_spill;
   }

   return spilled_registers;
}

void
add_coupling_code(spill_ctx& ctx, Block* block, IDSet& live_in)
{
   const unsigned block_idx = block->index;
   /* No coupling code necessary */
   if (block->linear_preds.size() == 0)
      return;

   /* Branch block: update renames */
   if (block->linear_preds.size() == 1 &&
       !(block->kind & (block_kind_loop_exit | block_kind_loop_header))) {
      assert(ctx.processed[block->linear_preds[0]]);

      ctx.renames[block_idx] = ctx.renames[block->linear_preds[0]];
      if (!block->logical_preds.empty() && block->logical_preds[0] != block->linear_preds[0]) {
         for (auto it : ctx.renames[block->logical_preds[0]]) {
            if (it.first.type() == RegType::vgpr)
               ctx.renames[block_idx].insert_or_assign(it.first, it.second);
         }
      }
      return;
   }

   std::vector<aco_ptr<Instruction>> instructions;

   /* loop header and merge blocks: check if all (linear) predecessors have been processed */
   for (ASSERTED unsigned pred : block->linear_preds)
      assert(ctx.processed[pred]);

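   /* The rest of this function runs four passes over the block: spill phi
    * operands at the predecessors, spill the remaining spilled live-ins,
    * rename/reload phi operands, and reload the remaining live-ins, creating
    * new phis where the predecessors' renames diverge.
    */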
   /* iterate the phi nodes to determine which operands to spill at the predecessor */
   for (aco_ptr<Instruction>& phi : block->instructions) {
      if (!is_phi(phi))
         break;

      for (const Operand& op : phi->operands) {
         if (op.isTemp())
            ctx.ssa_infos[op.tempId()].num_uses--;
      }

      /* The phi is not spilled */
      if (!phi->definitions[0].isTemp() ||
          !ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp()))
         continue;

      Block::edge_vec& preds =
         phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
      uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];
      phi->definitions[0].setKill(true);

      for (unsigned i = 0; i < phi->operands.size(); i++) {
         if (phi->operands[i].isUndefined())
            continue;

         unsigned pred_idx = preds[i];
         Operand spill_op = phi->operands[i];
         phi->operands[i] = Operand(phi->definitions[0].regClass());

         if (spill_op.isTemp()) {
            assert(spill_op.isKill());
            Temp var = spill_op.getTemp();

            auto rename_it = ctx.renames[pred_idx].find(var);
            /* prevent the defining instruction from being DCE'd if it could be rematerialized */
            if (rename_it == ctx.renames[pred_idx].end() && ctx.remat.count(var))
               ctx.unused_remats.erase(ctx.remat[var].instr);

            /* check if variable is already spilled at predecessor */
            auto spilled = ctx.spills_exit[pred_idx].find(var);
            if (spilled != ctx.spills_exit[pred_idx].end()) {
               if (spilled->second != def_spill_id)
                  ctx.add_affinity(def_spill_id, spilled->second);
               continue;
            }

            /* If the phi operand has the same name as the definition,
             * add it to the predecessor's spilled variables, so that it gets
             * skipped in the loop below.
             */
            if (var == phi->definitions[0].getTemp())
               ctx.spills_exit[pred_idx][var] = def_spill_id;

            /* rename if necessary */
            if (rename_it != ctx.renames[pred_idx].end()) {
               spill_op.setTemp(rename_it->second);
               ctx.renames[pred_idx].erase(rename_it);
            }
         }

         /* add interferences */
         for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx])
            ctx.add_interference(def_spill_id, pair.second);

         aco_ptr<Instruction> spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
         spill->operands[0] = spill_op;
         spill->operands[1] = Operand::c32(def_spill_id);
         Block& pred = ctx.program->blocks[pred_idx];
         unsigned idx = pred.instructions.size();
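         /* Walk backwards from the end of the predecessor: spills for logical
          * phis must be placed before p_logical_end (i.e. executed with the
          * correct exec mask), while for linear phis a single step back lands
          * just before the branch.
          */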
         do {
            assert(idx != 0);
            idx--;
         } while (phi->opcode == aco_opcode::p_phi &&
                  pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
         std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
         pred.instructions.insert(it, std::move(spill));
      }
   }

   /* iterate all (other) spilled variables for which to add spill code at the predecessors */
   // TODO: would be better to have them sorted: first vgprs and first with longest distance
   for (std::pair<Temp, uint32_t> pair : ctx.spills_entry[block_idx]) {
      /* if variable is not live-in, it must be from a phi: this works because of CSSA form */
      if (!live_in.count(pair.first.id()))
         continue;

      Block::edge_vec& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
      for (unsigned pred_idx : preds) {
         /* variable is already spilled at predecessor */
         auto spilled = ctx.spills_exit[pred_idx].find(pair.first);
         if (spilled != ctx.spills_exit[pred_idx].end()) {
            if (spilled->second != pair.second)
               ctx.add_affinity(pair.second, spilled->second);
            continue;
         }

         /* If this variable is spilled through the entire loop, no need to re-spill.
          * It can be reloaded from the same spill-slot it got at the loop-preheader.
          * No need to add interferences since every spilled variable in the loop already
          * interferes with the spilled loop-variables. Make sure that the spill_ids match.
          */
         const uint32_t loop_nest_depth = std::min(ctx.program->blocks[pred_idx].loop_nest_depth,
                                                   ctx.program->blocks[block_idx].loop_nest_depth);
         if (loop_nest_depth) {
            auto spill = ctx.loop[loop_nest_depth - 1].spills.find(pair.first);
            if (spill != ctx.loop[loop_nest_depth - 1].spills.end() && spill->second == pair.second)
               continue;
         }

         /* add interferences between spilled variable and predecessors exit spills */
         for (std::pair<Temp, uint32_t> exit_spill : ctx.spills_exit[pred_idx])
            ctx.add_interference(exit_spill.second, pair.second);

         /* variable is in register at predecessor and has to be spilled */
         /* rename if necessary */
         Temp var = pair.first;
         auto rename_it = ctx.renames[pred_idx].find(var);
         if (rename_it != ctx.renames[pred_idx].end()) {
            var = rename_it->second;
            ctx.renames[pred_idx].erase(rename_it);
         }

         aco_ptr<Instruction> spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
         spill->operands[0] = Operand(var);
         spill->operands[1] = Operand::c32(pair.second);
         Block& pred = ctx.program->blocks[pred_idx];
         unsigned idx = pred.instructions.size();
         do {
            assert(idx != 0);
            idx--;
         } while (pair.first.type() == RegType::vgpr &&
                  pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
         std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
         pred.instructions.insert(it, std::move(spill));
      }
   }

   /* iterate phis and rename/reload their spilled operands */
   for (aco_ptr<Instruction>& phi : block->instructions) {
      if (!is_phi(phi))
         break;
      if (phi->definitions[0].isKill())
         continue;

      assert(!phi->definitions[0].isTemp() ||
             !ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp()));

      Block::edge_vec& preds =
         phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
      for (unsigned i = 0; i < phi->operands.size(); i++) {
         if (!phi->operands[i].isTemp())
            continue;
         unsigned pred_idx = preds[i];

         /* if the operand was reloaded, rename */
         if (!ctx.spills_exit[pred_idx].count(phi->operands[i].getTemp())) {
            auto it = ctx.renames[pred_idx].find(phi->operands[i].getTemp());
            if (it != ctx.renames[pred_idx].end()) {
               phi->operands[i].setTemp(it->second);
            } else {
               /* prevent the defining instruction from being DCE'd if it could be rematerialized */
               auto remat_it = ctx.remat.find(phi->operands[i].getTemp());
               if (remat_it != ctx.remat.end())
                  ctx.unused_remats.erase(remat_it->second.instr);
            }
            continue;
         }

         Temp tmp = phi->operands[i].getTemp();

         /* reload phi operand at end of predecessor block */
         Temp new_name = ctx.program->allocateTmp(tmp.regClass());
         Block& pred = ctx.program->blocks[pred_idx];
         unsigned idx = pred.instructions.size();
         do {
            assert(idx != 0);
            idx--;
         } while (phi->opcode == aco_opcode::p_phi &&
                  pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
         std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
         aco_ptr<Instruction> reload =
            do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]);

         /* reload spilled exec mask directly to exec */
         if (!phi->definitions[0].isTemp()) {
            assert(phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec);
            reload->definitions[0] = phi->definitions[0];
            phi->operands[i] = Operand(exec, ctx.program->lane_mask);
         } else {
            ctx.spills_exit[pred_idx].erase(tmp);
            ctx.renames[pred_idx][tmp] = new_name;
            phi->operands[i].setTemp(new_name);
         }

         pred.instructions.insert(it, std::move(reload));
      }
   }

   /* iterate live variables that need to be reloaded */
   for (unsigned t : live_in) {
      const RegClass rc = ctx.program->temp_rc[t];
      Temp var = Temp(t, rc);

      /* skip spilled variables */
      if (ctx.spills_entry[block_idx].count(var))
         continue;

      Block::edge_vec& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
      for (unsigned pred_idx : preds) {
         /* skip if the variable is not spilled at the predecessor */
         if (!ctx.spills_exit[pred_idx].count(var))
            continue;

         /* variable is spilled at predecessor and has to be reloaded */
         Temp new_name = ctx.program->allocateTmp(rc);
         Block& pred = ctx.program->blocks[pred_idx];
         unsigned idx = pred.instructions.size();
         do {
            assert(idx != 0);
            idx--;
         } while (rc.type() == RegType::vgpr &&
                  pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
         std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);

         aco_ptr<Instruction> reload =
            do_reload(ctx, var, new_name, ctx.spills_exit[pred.index][var]);
         pred.instructions.insert(it, std::move(reload));

         ctx.spills_exit[pred.index].erase(var);
         ctx.renames[pred.index][var] = new_name;
      }

      /* check if we have to create a new phi for this variable */
      Temp rename = Temp();
      bool is_same = true;
      for (unsigned pred_idx : preds) {
         if (!ctx.renames[pred_idx].count(var)) {
            if (rename == Temp())
               rename = var;
            else
               is_same = rename == var;
         } else {
            if (rename == Temp())
               rename = ctx.renames[pred_idx][var];
            else
               is_same = rename == ctx.renames[pred_idx][var];
         }

         if (!is_same)
            break;
      }

      if (!is_same) {
         /* the variable was renamed differently in the predecessors: we have to create a phi */
         aco_opcode opcode = rc.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
         aco_ptr<Instruction> phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
         rename = ctx.program->allocateTmp(rc);
         for (unsigned i = 0; i < phi->operands.size(); i++) {
            Temp tmp;
            if (ctx.renames[preds[i]].count(var)) {
               tmp = ctx.renames[preds[i]][var];
            } else if (preds[i] >= block_idx) {
               tmp = rename;
            } else {
               tmp = var;
               /* prevent the defining instruction from being DCE'd if it could be rematerialized */
               if (ctx.remat.count(tmp))
                  ctx.unused_remats.erase(ctx.remat[tmp].instr);
            }
            phi->operands[i] = Operand(tmp);
         }
         phi->definitions[0] = Definition(rename);
         phi->register_demand = block->live_in_demand;
         block->instructions.insert(block->instructions.begin(), std::move(phi));
      }

      /* the variable was renamed: add new name to renames */
      if (!(rename == Temp() || rename == var))
         ctx.renames[block_idx][var] = rename;
   }
}

void
process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand spilled_registers)
{
   assert(!ctx.processed[block_idx]);

   std::vector<aco_ptr<Instruction>> instructions;
   unsigned idx = 0;

   /* phis are handled separately */
   while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
          block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
      const Definition def = block->instructions[idx]->definitions[0];
      if (def.isTemp() && !def.isKill() && def.tempId() < ctx.ssa_infos.size())
         ctx.program->live.live_in[block_idx].insert(def.tempId());
      instructions.emplace_back(std::move(block->instructions[idx++]));
   }

   auto& current_spills = ctx.spills_exit[block_idx];

   while (idx < block->instructions.size()) {
      aco_ptr<Instruction>& instr = block->instructions[idx];

      /* Spilling is handled as part of phis (they should always have the same or higher register
       * demand). If we try to spill here, we might not be able to reduce the register demand
       * enough because there is no path to spill constant/undef phi operands. */
      if (instr->opcode == aco_opcode::p_branch) {
         instructions.emplace_back(std::move(instr));
         idx++;
         continue;
      }

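      /* maps the new name of each reloaded operand to (original name, spill id) */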
      std::map<Temp, std::pair<Temp, uint32_t>> reloads;

      /* rename and reload operands */
      for (Operand& op : instr->operands) {
         if (!op.isTemp())
            continue;

         if (op.isFirstKill())
            ctx.program->live.live_in[block_idx].erase(op.tempId());
         ctx.ssa_infos[op.tempId()].num_uses--;

         if (!current_spills.count(op.getTemp()))
            continue;

         /* the Operand is spilled: add it to reloads */
         Temp new_tmp = ctx.program->allocateTmp(op.regClass());
         ctx.renames[block_idx][op.getTemp()] = new_tmp;
         reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]);
         current_spills.erase(op.getTemp());
         spilled_registers -= new_tmp;
      }

      /* check if register demand is low enough during and after the current instruction */
      if (block->register_demand.exceeds(ctx.target_pressure)) {
         RegisterDemand new_demand = instr->register_demand;

         /* if reg pressure is too high, spill variable with furthest next use */
         while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) {
            float score = 0.0;
            Temp to_spill;
            unsigned do_rematerialize = 0;
            unsigned avoid_respill = 0;
            RegType type = RegType::sgpr;
            if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr)
               type = RegType::vgpr;

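            /* Candidates never regress on rematerializability or on being
             * already spilled at the enclosing loop header; among the
             * remaining candidates, the highest average-distance score wins.
             */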
            for (unsigned t : ctx.program->live.live_in[block_idx]) {
               RegClass rc = ctx.program->temp_rc[t];
               Temp var = Temp(t, rc);
               if (rc.type() != type || current_spills.count(var) || rc.is_linear_vgpr())
                  continue;

               unsigned can_rematerialize = ctx.remat.count(var);
               unsigned loop_variable = block->loop_nest_depth && ctx.loop.back().spills.count(var);
               if (avoid_respill > loop_variable || do_rematerialize > can_rematerialize)
                  continue;

               if (can_rematerialize > do_rematerialize || loop_variable > avoid_respill ||
                   ctx.ssa_infos[t].score() > score) {
                  /* Don't spill operands */
                  if (std::any_of(instr->operands.begin(), instr->operands.end(),
                                  [&](Operand& op) { return op.isTemp() && op.getTemp() == var; }))
                     continue;

                  to_spill = var;
                  score = ctx.ssa_infos[t].score();
                  do_rematerialize = can_rematerialize;
                  avoid_respill = loop_variable;
               }
            }
            assert(score != 0.0);

            if (avoid_respill) {
               /* This variable is spilled at the loop-header of the current loop.
                * Re-use the spill-slot in order to avoid an extra store.
                */
               current_spills[to_spill] = ctx.loop.back().spills[to_spill];
               spilled_registers += to_spill;
               continue;
            }

            uint32_t spill_id = ctx.add_to_spills(to_spill, current_spills);
            /* add interferences with reloads */
            for (std::pair<const Temp, std::pair<Temp, uint32_t>>& pair : reloads)
               ctx.add_interference(spill_id, pair.second.second);

            spilled_registers += to_spill;

            /* rename if necessary */
            if (ctx.renames[block_idx].count(to_spill)) {
               to_spill = ctx.renames[block_idx][to_spill];
            }

            /* add spill to new instructions */
            aco_ptr<Instruction> spill{
               create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
            spill->operands[0] = Operand(to_spill);
            spill->operands[1] = Operand::c32(spill_id);
            instructions.emplace_back(std::move(spill));
         }
      }

      for (const Definition& def : instr->definitions) {
         if (def.isTemp() && !def.isKill())
            ctx.program->live.live_in[block_idx].insert(def.tempId());
      }
      /* rename operands */
      for (Operand& op : instr->operands) {
         if (op.isTemp()) {
            auto rename_it = ctx.renames[block_idx].find(op.getTemp());
            if (rename_it != ctx.renames[block_idx].end()) {
               op.setTemp(rename_it->second);
            } else {
               /* prevent its defining instruction from being DCE'd if it could be rematerialized */
               auto remat_it = ctx.remat.find(op.getTemp());
               if (remat_it != ctx.remat.end()) {
                  ctx.unused_remats.erase(remat_it->second.instr);
               }
            }
         }
      }

      /* add reloads and instruction to new instructions */
      for (std::pair<const Temp, std::pair<Temp, uint32_t>>& pair : reloads) {
         aco_ptr<Instruction> reload =
            do_reload(ctx, pair.second.first, pair.first, pair.second.second);
         instructions.emplace_back(std::move(reload));
      }
      instructions.emplace_back(std::move(instr));
      idx++;
   }

   block->instructions = std::move(instructions);
}

void
spill_block(spill_ctx& ctx, unsigned block_idx)
{
   Block* block = &ctx.program->blocks[block_idx];

   /* determine set of variables which are spilled at the beginning of the block */
   RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx);

   if (!(block->kind & block_kind_loop_header)) {
      /* add spill/reload code on incoming control flow edges */
      add_coupling_code(ctx, block, ctx.program->live.live_in[block_idx]);
   }

   assert(ctx.spills_exit[block_idx].empty());
   ctx.spills_exit[block_idx] = ctx.spills_entry[block_idx];
   process_block(ctx, block_idx, block, spilled_registers);

   ctx.processed[block_idx] = true;

   /* check if the next block leaves the current loop */
   if (block->loop_nest_depth == 0 ||
       ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth)
      return;

   uint32_t loop_header_idx = ctx.loop.back().index;

   /* preserve original renames at end of loop header block */
   aco::map<Temp, Temp> renames = std::move(ctx.renames[loop_header_idx]);

   /* add coupling code to all loop header predecessors */
   for (unsigned t : ctx.loop.back().live_in)
      ctx.ssa_infos[t].num_uses--;
   add_coupling_code(ctx, &ctx.program->blocks[loop_header_idx], ctx.loop.back().live_in);
   renames.swap(ctx.renames[loop_header_idx]);

   /* remove loop header info from stack */
   ctx.loop.pop_back();
   if (renames.empty())
      return;

   /* Add the new renames to each block */
   for (std::pair<Temp, Temp> rename : renames) {
      /* If there is already a rename, don't overwrite it. */
      for (unsigned idx = loop_header_idx; idx <= block_idx; idx++)
         ctx.renames[idx].insert(rename);
   }

   /* propagate new renames through loop: i.e. repair the SSA */
   for (unsigned idx = loop_header_idx; idx <= block_idx; idx++) {
      Block& current = ctx.program->blocks[idx];
      /* rename all uses in this block */
      for (aco_ptr<Instruction>& instr : current.instructions) {
         /* no need to rename the loop header phis once again. */
         if (idx == loop_header_idx && is_phi(instr))
            continue;

         for (Operand& op : instr->operands) {
            if (!op.isTemp())
               continue;

            auto rename = renames.find(op.getTemp());
            if (rename != renames.end())
               op.setTemp(rename->second);
         }
      }
   }
}

Temp
load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset)
{
   Temp private_segment_buffer = ctx.program->private_segment_buffer;
   if (!private_segment_buffer.bytes()) {
      Temp addr_lo =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
      Temp addr_hi =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
      private_segment_buffer =
         bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
   } else if (ctx.program->stage.hw != AC_HW_COMPUTE_SHADER) {
      private_segment_buffer =
         bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero());
   }

   if (apply_scratch_offset) {
      Temp addr_lo = bld.tmp(s1);
      Temp addr_hi = bld.tmp(s1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(addr_lo), Definition(addr_hi),
                 private_segment_buffer);

      Temp carry = bld.tmp(s1);
      addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo,
                         ctx.program->scratch_offset);
      addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi,
                         Operand::c32(0), bld.scc(carry));

      private_segment_buffer =
         bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
   }

   struct ac_buffer_state ac_state = {0};
   uint32_t desc[4];

   ac_state.size = 0xffffffff;
   ac_state.format = PIPE_FORMAT_R32_FLOAT;
   for (int i = 0; i < 4; i++)
      ac_state.swizzle[i] = PIPE_SWIZZLE_0;
   /* older generations need an element size of 4 bytes; the field was removed on GFX9 */
   ac_state.element_size = ctx.program->gfx_level <= GFX8 ? 1u : 0u;
   ac_state.index_stride = ctx.program->wave_size == 64 ? 3u : 2u;
   ac_state.add_tid = true;
   ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;

   ac_build_buffer_descriptor(ctx.program->gfx_level, &ac_state, desc);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
                     Operand::c32(desc[2]), Operand::c32(desc[3]));
}

void
setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
                        std::vector<aco_ptr<Instruction>>& instructions, uint32_t spill_slot,
                        Temp& scratch_offset, unsigned* offset)
{
   uint32_t scratch_size = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;

   uint32_t offset_range;
   if (ctx.program->gfx_level >= GFX9) {
      offset_range =
         ctx.program->dev.scratch_global_offset_max - ctx.program->dev.scratch_global_offset_min;
   } else {
      if (scratch_size < 4095)
         offset_range = 4095 - scratch_size;
      else
         offset_range = 0;
   }

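   /* Pre-GFX9, the MUBUF immediate offset is a 12-bit field (max 4095), so the
    * usable range shrinks by whatever scratch_size already occupies; GFX9+
    * scratch instructions expose their own [min, max] global offset range.
    */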
1156
1157 bool overflow = (ctx.vgpr_spill_slots - 1) * 4 > offset_range;
1158
1159 Builder rsrc_bld(ctx.program);
1160 if (block.kind & block_kind_top_level) {
1161 rsrc_bld.reset(&instructions);
1162 } else if (ctx.scratch_rsrc == Temp() && (!overflow || ctx.program->gfx_level < GFX9)) {
1163 Block* tl_block = █
1164 while (!(tl_block->kind & block_kind_top_level))
1165 tl_block = &ctx.program->blocks[tl_block->linear_idom];
1166
1167 /* find p_logical_end */
1168 std::vector<aco_ptr<Instruction>>& prev_instructions = tl_block->instructions;
1169 unsigned idx = prev_instructions.size() - 1;
1170 while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end)
1171 idx--;
1172 rsrc_bld.reset(&prev_instructions, std::next(prev_instructions.begin(), idx));
1173 }
1174
   /* If spilling overflows the constant offset range at any point, we need to emit the soffset
    * before every spill/reload to avoid increasing register demand.
    */
   Builder offset_bld = rsrc_bld;
   if (overflow)
      offset_bld.reset(&instructions);

   *offset = spill_slot * 4;
   if (ctx.program->gfx_level >= GFX9) {
      *offset += ctx.program->dev.scratch_global_offset_min;

      if (ctx.scratch_rsrc == Temp() || overflow) {
         int32_t saddr = scratch_size - ctx.program->dev.scratch_global_offset_min;
         if ((int32_t)*offset > (int32_t)ctx.program->dev.scratch_global_offset_max) {
            saddr += (int32_t)*offset;
            *offset = 0;
         }

         /* GFX9+ uses scratch_* instructions, which don't use a resource. */
         ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr));
      }
   } else {
      if (ctx.scratch_rsrc == Temp())
         ctx.scratch_rsrc = load_scratch_resource(ctx, rsrc_bld, overflow);

      if (overflow) {
         uint32_t soffset =
            ctx.program->config->scratch_bytes_per_wave + *offset * ctx.program->wave_size;
         *offset = 0;

         scratch_offset = offset_bld.copy(offset_bld.def(s1), Operand::c32(soffset));
      } else {
         *offset += scratch_size;
      }
   }
}

void
spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& instructions,
           aco_ptr<Instruction>& spill, std::vector<uint32_t>& slots)
{
   ctx.program->config->spilled_vgprs += spill->operands[0].size();

   uint32_t spill_id = spill->operands[1].constantValue();
   uint32_t spill_slot = slots[spill_id];

   Temp scratch_offset = ctx.program->scratch_offset;
   unsigned offset;
   setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset);

   assert(spill->operands[0].isTemp());
   Temp temp = spill->operands[0].getTemp();
   assert(temp.type() == RegType::vgpr && !temp.is_linear());

   Builder bld(ctx.program, &instructions);
   if (temp.size() > 1) {
      Instruction* split{
         create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
      split->operands[0] = Operand(temp);
      for (unsigned i = 0; i < temp.size(); i++)
         split->definitions[i] = bld.def(v1);
      bld.insert(split);
      for (unsigned i = 0; i < temp.size(); i++, offset += 4) {
         Temp elem = split->definitions[i].getTemp();
         if (ctx.program->gfx_level >= GFX9) {
            bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, elem,
                        offset, memory_sync_info(storage_vgpr_spill, semantic_private));
         } else {
            Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc,
                                           Operand(v1), scratch_offset, elem, offset, false);
            instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
            instr->mubuf().cache.value = ac_swizzled;
         }
      }
   } else if (ctx.program->gfx_level >= GFX9) {
      bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, temp, offset,
                  memory_sync_info(storage_vgpr_spill, semantic_private));
   } else {
      Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1),
                                     scratch_offset, temp, offset, false);
      instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
      instr->mubuf().cache.value = ac_swizzled;
   }
}

void
reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& instructions,
            aco_ptr<Instruction>& reload, std::vector<uint32_t>& slots)
{
   uint32_t spill_id = reload->operands[0].constantValue();
   uint32_t spill_slot = slots[spill_id];

   Temp scratch_offset = ctx.program->scratch_offset;
   unsigned offset;
   setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset);

   Definition def = reload->definitions[0];

   Builder bld(ctx.program, &instructions);
   if (def.size() > 1) {
      Instruction* vec{
         create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};
      vec->definitions[0] = def;
      for (unsigned i = 0; i < def.size(); i++, offset += 4) {
         Temp tmp = bld.tmp(v1);
         vec->operands[i] = Operand(tmp);
         if (ctx.program->gfx_level >= GFX9) {
            bld.scratch(aco_opcode::scratch_load_dword, Definition(tmp), Operand(v1),
                        ctx.scratch_rsrc, offset,
                        memory_sync_info(storage_vgpr_spill, semantic_private));
         } else {
            Instruction* instr =
               bld.mubuf(aco_opcode::buffer_load_dword, Definition(tmp), ctx.scratch_rsrc,
                         Operand(v1), scratch_offset, offset, false);
            instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
            instr->mubuf().cache.value = ac_swizzled;
         }
      }
      bld.insert(vec);
   } else if (ctx.program->gfx_level >= GFX9) {
      bld.scratch(aco_opcode::scratch_load_dword, def, Operand(v1), ctx.scratch_rsrc, offset,
                  memory_sync_info(storage_vgpr_spill, semantic_private));
   } else {
      Instruction* instr = bld.mubuf(aco_opcode::buffer_load_dword, def, ctx.scratch_rsrc,
                                     Operand(v1), scratch_offset, offset, false);
      instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
      instr->mubuf().cache.value = ac_swizzled;
   }
}

void
add_interferences(spill_ctx& ctx, std::vector<bool>& is_assigned, std::vector<uint32_t>& slots,
                  std::vector<bool>& slots_used, unsigned id)
{
   for (unsigned other : ctx.interferences[id].second) {
      if (!is_assigned[other])
         continue;

      RegClass other_rc = ctx.interferences[other].first;
      unsigned slot = slots[other];
      std::fill(slots_used.begin() + slot, slots_used.begin() + slot + other_rc.size(), true);
   }
}

unsigned
find_available_slot(std::vector<bool>& used, unsigned wave_size, unsigned size, bool is_sgpr)
{
   unsigned wave_size_minus_one = wave_size - 1;
   unsigned slot = 0;

   while (true) {
      bool available = true;
      for (unsigned i = 0; i < size; i++) {
         if (slot + i < used.size() && used[slot + i]) {
            available = false;
            break;
         }
      }
      if (!available) {
         slot++;
         continue;
      }

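      /* SGPR spill slots within one wave_size-aligned chunk map to lanes of a
       * single linear VGPR, so a multi-dword SGPR value must not straddle a
       * chunk boundary.
       */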
      if (is_sgpr && ((slot & wave_size_minus_one) > wave_size - size)) {
         slot = align(slot, wave_size);
         continue;
      }

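      /* Found a slot: reset the scratch bitmap for the next query; the caller
       * re-marks interferences before each allocation.
       */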
      std::fill(used.begin(), used.end(), false);

      if (slot + size > used.size())
         used.resize(slot + size);

      return slot;
   }
}

void
assign_spill_slots_helper(spill_ctx& ctx, RegType type, std::vector<bool>& is_assigned,
                          std::vector<uint32_t>& slots, unsigned* num_slots)
{
   std::vector<bool> slots_used;

   /* assign slots for ids with affinities first */
   for (std::vector<uint32_t>& vec : ctx.affinities) {
      if (ctx.interferences[vec[0]].first.type() != type)
         continue;

      for (unsigned id : vec) {
         if (!ctx.is_reloaded[id])
            continue;

         add_interferences(ctx, is_assigned, slots, slots_used, id);
      }

      unsigned slot = find_available_slot(
         slots_used, ctx.wave_size, ctx.interferences[vec[0]].first.size(), type == RegType::sgpr);

      for (unsigned id : vec) {
         assert(!is_assigned[id]);

         if (ctx.is_reloaded[id]) {
            slots[id] = slot;
            is_assigned[id] = true;
         }
      }
   }

   /* assign slots for ids without affinities */
   for (unsigned id = 0; id < ctx.interferences.size(); id++) {
      if (is_assigned[id] || !ctx.is_reloaded[id] || ctx.interferences[id].first.type() != type)
         continue;

      add_interferences(ctx, is_assigned, slots, slots_used, id);

      unsigned slot = find_available_slot(
         slots_used, ctx.wave_size, ctx.interferences[id].first.size(), type == RegType::sgpr);

      slots[id] = slot;
      is_assigned[id] = true;
   }

   *num_slots = slots_used.size();
}

void
end_unused_spill_vgprs(spill_ctx& ctx, Block& block, std::vector<Temp>& vgpr_spill_temps,
                       const std::vector<uint32_t>& slots,
                       const aco::unordered_map<Temp, uint32_t>& spills)
{
   std::vector<bool> is_used(vgpr_spill_temps.size());
   for (std::pair<Temp, uint32_t> pair : spills) {
      if (pair.first.type() == RegType::sgpr && ctx.is_reloaded[pair.second])
         is_used[slots[pair.second] / ctx.wave_size] = true;
   }

   std::vector<Temp> temps;
   for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
      if (vgpr_spill_temps[i].id() && !is_used[i]) {
         temps.push_back(vgpr_spill_temps[i]);
         vgpr_spill_temps[i] = Temp();
      }
   }
   if (temps.empty() || block.linear_preds.empty())
      return;

   aco_ptr<Instruction> destr{
      create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, temps.size(), 0)};
   for (unsigned i = 0; i < temps.size(); i++)
      destr->operands[i] = Operand(temps[i]);

   std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
   while (is_phi(*it))
      ++it;
   block.instructions.insert(it, std::move(destr));
}

void
assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
{
   std::vector<uint32_t> slots(ctx.interferences.size());
   std::vector<bool> is_assigned(ctx.interferences.size());

   /* first, handle affinities: spill ids with an affinity share their reload status,
    * so that the whole group either gets one common spill slot or none at all */
   for (std::vector<uint32_t>& vec : ctx.affinities) {
      for (unsigned i = 0; i < vec.size(); i++) {
         for (unsigned j = i + 1; j < vec.size(); j++) {
            assert(vec[i] != vec[j]);
            bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]];
            ctx.is_reloaded[vec[i]] = reloaded;
            ctx.is_reloaded[vec[j]] = reloaded;
         }
      }
   }
   for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++)
      for (ASSERTED uint32_t id : ctx.interferences[i].second)
         assert(i != id);

   /* for each spill slot, assign as many spill ids as possible */
   assign_spill_slots_helper(ctx, RegType::sgpr, is_assigned, slots, &ctx.sgpr_spill_slots);
   assign_spill_slots_helper(ctx, RegType::vgpr, is_assigned, slots, &ctx.vgpr_spill_slots);

   for (unsigned id = 0; id < is_assigned.size(); id++)
      assert(is_assigned[id] || !ctx.is_reloaded[id]);

   for (std::vector<uint32_t>& vec : ctx.affinities) {
      for (unsigned i = 0; i < vec.size(); i++) {
         for (unsigned j = i + 1; j < vec.size(); j++) {
            assert(is_assigned[vec[i]] == is_assigned[vec[j]]);
            if (!is_assigned[vec[i]])
               continue;
            assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]);
            assert(ctx.interferences[vec[i]].first.type() ==
                   ctx.interferences[vec[j]].first.type());
            assert(slots[vec[i]] == slots[vec[j]]);
         }
      }
   }

   /* hope, we didn't mess up */
   std::vector<Temp> vgpr_spill_temps((ctx.sgpr_spill_slots + ctx.wave_size - 1) / ctx.wave_size);
   assert(vgpr_spill_temps.size() <= spills_to_vgpr);

   /* replace pseudo instructions with actual hardware instructions */
   unsigned last_top_level_block_idx = 0;
   for (Block& block : ctx.program->blocks) {

      if (block.kind & block_kind_top_level) {
         last_top_level_block_idx = block.index;

         end_unused_spill_vgprs(ctx, block, vgpr_spill_temps, slots, ctx.spills_entry[block.index]);

         /* If the block has no predecessors (for example in RT resume shaders),
          * we cannot reuse the current scratch_rsrc temp because its definition is unreachable */
         if (block.linear_preds.empty())
            ctx.scratch_rsrc = Temp();
      }

      std::vector<aco_ptr<Instruction>>::iterator it;
      std::vector<aco_ptr<Instruction>> instructions;
      instructions.reserve(block.instructions.size());
      Builder bld(ctx.program, &instructions);
      for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {

         if ((*it)->opcode == aco_opcode::p_spill) {
            uint32_t spill_id = (*it)->operands[1].constantValue();

            if (!ctx.is_reloaded[spill_id]) {
               /* never reloaded, so don't spill */
            } else if (!is_assigned[spill_id]) {
               unreachable("No spill slot assigned for spill id");
            } else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {
               spill_vgpr(ctx, block, instructions, *it, slots);
            } else {
               ctx.program->config->spilled_sgprs += (*it)->operands[0].size();

               uint32_t spill_slot = slots[spill_id];

               /* check if the linear vgpr already exists */
               if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
                  Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());
                  vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
                  aco_ptr<Instruction> create{
                     create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
                  create->definitions[0] = Definition(linear_vgpr);
                  /* find the right place to insert this definition */
                  if (last_top_level_block_idx == block.index) {
                     /* insert right before the current instruction */
                     instructions.emplace_back(std::move(create));
                  } else {
                     assert(last_top_level_block_idx < block.index);
                     /* insert after p_logical_end of the last top-level block */
                     std::vector<aco_ptr<Instruction>>& block_instrs =
                        ctx.program->blocks[last_top_level_block_idx].instructions;
                     auto insert_point =
                        std::find_if(block_instrs.rbegin(), block_instrs.rend(),
                                     [](const auto& iter)
                                     { return iter->opcode == aco_opcode::p_logical_end; })
                           .base();
                     block_instrs.insert(insert_point, std::move(create));
                  }
               }

               /* spill sgpr: just add the vgpr temp to operands */
               Instruction* spill = create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0);
               spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
               spill->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
               spill->operands[2] = (*it)->operands[0];
               instructions.emplace_back(aco_ptr<Instruction>(spill));
            }

         } else if ((*it)->opcode == aco_opcode::p_reload) {
            uint32_t spill_id = (*it)->operands[0].constantValue();
            assert(ctx.is_reloaded[spill_id]);

            if (!is_assigned[spill_id]) {
               unreachable("No spill slot assigned for spill id");
            } else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {
               reload_vgpr(ctx, block, instructions, *it, slots);
            } else {
               uint32_t spill_slot = slots[spill_id];

               /* check if the linear vgpr already exists */
               if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
                  Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());
                  vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
                  aco_ptr<Instruction> create{
                     create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
                  create->definitions[0] = Definition(linear_vgpr);
                  /* find the right place to insert this definition */
                  if (last_top_level_block_idx == block.index) {
                     /* insert right before the current instruction */
                     instructions.emplace_back(std::move(create));
                  } else {
                     assert(last_top_level_block_idx < block.index);
                     /* insert after p_logical_end of the last top-level block */
                     std::vector<aco_ptr<Instruction>>& block_instrs =
                        ctx.program->blocks[last_top_level_block_idx].instructions;
                     auto insert_point =
                        std::find_if(block_instrs.rbegin(), block_instrs.rend(),
                                     [](const auto& iter)
                                     { return iter->opcode == aco_opcode::p_logical_end; })
                           .base();
                     block_instrs.insert(insert_point, std::move(create));
                  }
               }

               /* reload sgpr: just add the vgpr temp to operands */
               Instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1);
               reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
               reload->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
               reload->definitions[0] = (*it)->definitions[0];
               instructions.emplace_back(aco_ptr<Instruction>(reload));
            }
         } else if (!ctx.unused_remats.count(it->get())) {
            instructions.emplace_back(std::move(*it));
         }
      }
      block.instructions = std::move(instructions);
   }

   /* update required scratch memory */
   ctx.program->config->scratch_bytes_per_wave += ctx.vgpr_spill_slots * 4 * ctx.program->wave_size;
}

} /* end namespace */

void
spill(Program* program)
{
   program->config->spilled_vgprs = 0;
   program->config->spilled_sgprs = 0;

   program->progress = CompilationProgress::after_spilling;

   /* no spilling when register pressure is low enough */
   if (program->num_waves > 0)
      return;

   /* lower to CSSA before spilling to ensure correctness w.r.t. phis */
   lower_to_cssa(program);

   /* calculate target register demand */
   const RegisterDemand demand = program->max_reg_demand; /* current max */
   const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
   const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
   uint16_t extra_vgprs = 0;
   uint16_t extra_sgprs = 0;

   /* calculate extra VGPRs required for spilling SGPRs */
   if (demand.sgpr > sgpr_limit) {
      unsigned sgpr_spills = demand.sgpr - sgpr_limit;
      extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1;
   }
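   /* For illustration: with wave64 and 96 SGPRs over the limit, this reserves
    * DIV_ROUND_UP(96 * 2, 64) + 1 = 4 extra VGPRs as SGPR spill space.
    */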
   /* add extra SGPRs required for spilling VGPRs */
   if (demand.vgpr + extra_vgprs > vgpr_limit) {
      if (program->gfx_level >= GFX9)
         extra_sgprs = 1; /* SADDR */
      else
         extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */
      if (demand.sgpr + extra_sgprs > sgpr_limit) {
         /* re-calculate in case something has changed */
         unsigned sgpr_spills = demand.sgpr + extra_sgprs - sgpr_limit;
         extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1;
      }
   }
   /* the spiller has to target the following register demand */
   const RegisterDemand target(vgpr_limit - extra_vgprs, sgpr_limit - extra_sgprs);

   /* initialize ctx */
   spill_ctx ctx(target, program);
   gather_ssa_use_info(ctx);
   get_rematerialize_info(ctx);

   /* create spills and reloads */
   for (unsigned i = 0; i < program->blocks.size(); i++)
      spill_block(ctx, i);

   /* assign spill slots and DCE rematerialized code */
   assign_spill_slots(ctx, extra_vgprs);

   /* update live variable information */
   live_var_analysis(program);

   assert(program->num_waves > 0);
}

} // namespace aco