xref: /aosp_15_r20/external/mesa3d/src/amd/compiler/aco_spill.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2018 Valve Corporation
3  * Copyright © 2018 Google
4  *
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "aco_builder.h"
9 #include "aco_ir.h"
10 #include "aco_util.h"
11 
12 #include "common/ac_descriptors.h"
13 #include "common/sid.h"
14 
15 #include <algorithm>
16 #include <cstring>
17 #include <map>
18 #include <set>
19 #include <unordered_map>
20 #include <unordered_set>
21 #include <vector>
22 
23 namespace std {
24 template <> struct hash<aco::Temp> {
operator ()std::hash25    size_t operator()(aco::Temp temp) const noexcept
26    {
27       uint32_t v;
28       std::memcpy(&v, &temp, sizeof(temp));
29       return std::hash<uint32_t>{}(v);
30    }
31 };
32 } // namespace std
33 
34 /*
35  * Implements the spilling algorithm on SSA-form based on
36  * "Register Spilling and Live-Range Splitting for SSA-Form Programs"
37  * by Matthias Braun and Sebastian Hack.
38  *
39  * Key difference between this algorithm and the min-algorithm from the paper
40  * is the use of average use distances rather than next-use distances per
41  * instruction.
42  * As we decrement the number of remaining uses, the average use distances
43  * give an approximation of the next-use distances while being computationally
44  * and memory-wise less expensive.
45  */
46 
47 namespace aco {
48 
49 namespace {
50 
51 struct remat_info {
52    Instruction* instr;
53 };
54 
55 struct loop_info {
56    uint32_t index;
57    aco::unordered_map<Temp, uint32_t> spills;
58    IDSet live_in;
59 };
60 
/* Aggregated use information for one SSA value. */
struct use_info {
   uint32_t num_uses = 0; /* number of (remaining) uses */
   uint32_t last_use = 0; /* global instruction index of the furthest use */

   /* Spill-candidate score: values whose last use is far away relative to
    * their number of remaining uses score higher (better to spill).
    * BUGFIX: the division must happen in floating point — with both members
    * being uint32_t, `last_use / num_uses` truncated the quotient, losing
    * precision and producing spurious ties between candidates.
    * NOTE(review): assumes num_uses > 0 at the call site — callers only score
    * values that still have uses; verify when adding new callers. */
   float score() { return static_cast<float>(last_use) / num_uses; }
};
66 
67 struct spill_ctx {
68    RegisterDemand target_pressure;
69    Program* program;
70    aco::monotonic_buffer_resource memory;
71 
72    std::vector<aco::map<Temp, Temp>> renames;
73    std::vector<aco::unordered_map<Temp, uint32_t>> spills_entry;
74    std::vector<aco::unordered_map<Temp, uint32_t>> spills_exit;
75 
76    std::vector<bool> processed;
77    std::vector<loop_info> loop;
78 
79    std::vector<use_info> ssa_infos;
80    std::vector<std::pair<RegClass, std::unordered_set<uint32_t>>> interferences;
81    std::vector<std::vector<uint32_t>> affinities;
82    std::vector<bool> is_reloaded;
83    aco::unordered_map<Temp, remat_info> remat;
84    std::set<Instruction*> unused_remats;
85    unsigned wave_size;
86 
87    unsigned sgpr_spill_slots;
88    unsigned vgpr_spill_slots;
89    Temp scratch_rsrc;
90 
spill_ctxaco::__anon6dee26dc0111::spill_ctx91    spill_ctx(const RegisterDemand target_pressure_, Program* program_)
92        : target_pressure(target_pressure_), program(program_), memory(),
93          renames(program->blocks.size(), aco::map<Temp, Temp>(memory)),
94          spills_entry(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
95          spills_exit(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
96          processed(program->blocks.size(), false), ssa_infos(program->peekAllocationId()),
97          remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0)
98    {}
99 
add_affinityaco::__anon6dee26dc0111::spill_ctx100    void add_affinity(uint32_t first, uint32_t second)
101    {
102       unsigned found_first = affinities.size();
103       unsigned found_second = affinities.size();
104       for (unsigned i = 0; i < affinities.size(); i++) {
105          std::vector<uint32_t>& vec = affinities[i];
106          for (uint32_t entry : vec) {
107             if (entry == first)
108                found_first = i;
109             else if (entry == second)
110                found_second = i;
111          }
112       }
113       if (found_first == affinities.size() && found_second == affinities.size()) {
114          affinities.emplace_back(std::vector<uint32_t>({first, second}));
115       } else if (found_first < affinities.size() && found_second == affinities.size()) {
116          affinities[found_first].push_back(second);
117       } else if (found_second < affinities.size() && found_first == affinities.size()) {
118          affinities[found_second].push_back(first);
119       } else if (found_first != found_second) {
120          /* merge second into first */
121          affinities[found_first].insert(affinities[found_first].end(),
122                                         affinities[found_second].begin(),
123                                         affinities[found_second].end());
124          affinities.erase(std::next(affinities.begin(), found_second));
125       } else {
126          assert(found_first == found_second);
127       }
128    }
129 
add_to_spillsaco::__anon6dee26dc0111::spill_ctx130    uint32_t add_to_spills(Temp to_spill, aco::unordered_map<Temp, uint32_t>& spills)
131    {
132       const uint32_t spill_id = allocate_spill_id(to_spill.regClass());
133       for (auto pair : spills)
134          add_interference(spill_id, pair.second);
135       if (!loop.empty()) {
136          for (auto pair : loop.back().spills)
137             add_interference(spill_id, pair.second);
138       }
139 
140       spills[to_spill] = spill_id;
141       return spill_id;
142    }
143 
add_interferenceaco::__anon6dee26dc0111::spill_ctx144    void add_interference(uint32_t first, uint32_t second)
145    {
146       if (interferences[first].first.type() != interferences[second].first.type())
147          return;
148 
149       bool inserted = interferences[first].second.insert(second).second;
150       if (inserted)
151          interferences[second].second.insert(first);
152    }
153 
allocate_spill_idaco::__anon6dee26dc0111::spill_ctx154    uint32_t allocate_spill_id(RegClass rc)
155    {
156       interferences.emplace_back(rc, std::unordered_set<uint32_t>());
157       is_reloaded.push_back(false);
158       return next_spill_id++;
159    }
160 
161    uint32_t next_spill_id = 0;
162 };
163 
164 /**
165  * Gathers information about the number of uses and point of last use
166  * per SSA value.
167  *
168  * Phi definitions are added to live-ins.
169  */
170 void
gather_ssa_use_info(spill_ctx & ctx)171 gather_ssa_use_info(spill_ctx& ctx)
172 {
173    unsigned instruction_idx = 0;
174    for (Block& block : ctx.program->blocks) {
175       for (int i = block.instructions.size() - 1; i >= 0; i--) {
176          aco_ptr<Instruction>& instr = block.instructions[i];
177          for (const Operand& op : instr->operands) {
178             if (op.isTemp()) {
179                use_info& info = ctx.ssa_infos[op.tempId()];
180                info.num_uses++;
181                info.last_use = std::max(info.last_use, instruction_idx + i);
182             }
183          }
184       }
185 
186       /* All live-in variables at loop headers get an additional artificial use.
187        * As we decrement the number of uses while processing the blocks, this
188        * ensures that the number of uses won't becomes zero before the loop
189        * (and the variables' live-ranges) end.
190        */
191       if (block.kind & block_kind_loop_header) {
192          for (unsigned t : ctx.program->live.live_in[block.index])
193             ctx.ssa_infos[t].num_uses++;
194       }
195 
196       instruction_idx += block.instructions.size();
197    }
198 }
199 
200 bool
should_rematerialize(aco_ptr<Instruction> & instr)201 should_rematerialize(aco_ptr<Instruction>& instr)
202 {
203    /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */
204    if (instr->format != Format::VOP1 && instr->format != Format::SOP1 &&
205        instr->format != Format::PSEUDO && instr->format != Format::SOPK)
206       return false;
207    /* TODO: pseudo-instruction rematerialization is only supported for
208     * p_create_vector/p_parallelcopy */
209    if (instr->isPseudo() && instr->opcode != aco_opcode::p_create_vector &&
210        instr->opcode != aco_opcode::p_parallelcopy)
211       return false;
212    if (instr->isSOPK() && instr->opcode != aco_opcode::s_movk_i32)
213       return false;
214 
215    for (const Operand& op : instr->operands) {
216       /* TODO: rematerialization using temporaries isn't yet supported */
217       if (!op.isConstant())
218          return false;
219    }
220 
221    /* TODO: rematerialization with multiple definitions isn't yet supported */
222    if (instr->definitions.size() > 1)
223       return false;
224 
225    return true;
226 }
227 
228 aco_ptr<Instruction>
do_reload(spill_ctx & ctx,Temp tmp,Temp new_name,uint32_t spill_id)229 do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id)
230 {
231    std::unordered_map<Temp, remat_info>::iterator remat = ctx.remat.find(tmp);
232    if (remat != ctx.remat.end()) {
233       Instruction* instr = remat->second.instr;
234       assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) &&
235              "unsupported");
236       assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector ||
237               instr->opcode == aco_opcode::p_parallelcopy) &&
238              "unsupported");
239       assert(instr->definitions.size() == 1 && "unsupported");
240 
241       aco_ptr<Instruction> res;
242       res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(),
243                                    instr->definitions.size()));
244       if (instr->isSOPK())
245          res->salu().imm = instr->salu().imm;
246 
247       for (unsigned i = 0; i < instr->operands.size(); i++) {
248          res->operands[i] = instr->operands[i];
249          if (instr->operands[i].isTemp()) {
250             assert(false && "unsupported");
251             if (ctx.remat.count(instr->operands[i].getTemp()))
252                ctx.unused_remats.erase(ctx.remat[instr->operands[i].getTemp()].instr);
253          }
254       }
255       res->definitions[0] = Definition(new_name);
256       return res;
257    } else {
258       aco_ptr<Instruction> reload{create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)};
259       reload->operands[0] = Operand::c32(spill_id);
260       reload->definitions[0] = Definition(new_name);
261       ctx.is_reloaded[spill_id] = true;
262       return reload;
263    }
264 }
265 
266 void
get_rematerialize_info(spill_ctx & ctx)267 get_rematerialize_info(spill_ctx& ctx)
268 {
269    for (Block& block : ctx.program->blocks) {
270       bool logical = false;
271       for (aco_ptr<Instruction>& instr : block.instructions) {
272          if (instr->opcode == aco_opcode::p_logical_start)
273             logical = true;
274          else if (instr->opcode == aco_opcode::p_logical_end)
275             logical = false;
276          if (logical && should_rematerialize(instr)) {
277             for (const Definition& def : instr->definitions) {
278                if (def.isTemp()) {
279                   ctx.remat[def.getTemp()] = remat_info{instr.get()};
280                   ctx.unused_remats.insert(instr.get());
281                }
282             }
283          }
284       }
285    }
286 }
287 
/* Decides which live-in variables and phi definitions of |block| are spilled
 * at the block's entry: live-through spills inherited from predecessors and
 * enclosing loops are kept, then additional victims are chosen (by use_info
 * score) until the demand fits ctx.target_pressure. The result is recorded
 * in ctx.spills_entry[block_idx].
 * Returns the combined register demand of everything spilled at entry. */
288 RegisterDemand
init_live_in_vars(spill_ctx & ctx,Block * block,unsigned block_idx)289 init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx)
290 {
291    RegisterDemand spilled_registers;
292 
293    /* first block, nothing was spilled before */
294    if (block->linear_preds.empty())
295       return {0, 0};
296 
297    /* live-in variables at the beginning of the current block */
298    const IDSet& live_in = ctx.program->live.live_in[block_idx];
299 
300    /* loop header block */
301    if (block->kind & block_kind_loop_header) {
302       assert(block->linear_preds[0] == block_idx - 1);
303       assert(block->logical_preds[0] == block_idx - 1);
304 
305       /* check how many live-through variables should be spilled */
306       RegisterDemand reg_pressure = block->live_in_demand;
307       RegisterDemand loop_demand = reg_pressure;
       /* accumulate the maximum register demand over all blocks of this loop
        * (all consecutive blocks with at least this loop_nest_depth) */
308       unsigned i = block_idx;
309       while (ctx.program->blocks[i].loop_nest_depth >= block->loop_nest_depth)
310          loop_demand.update(ctx.program->blocks[i++].register_demand);
311 
312       for (auto spilled : ctx.spills_exit[block_idx - 1]) {
313          /* variable is not live at loop entry: probably a phi operand */
314          if (!live_in.count(spilled.first.id()))
315             continue;
316 
317          /* keep live-through variables spilled */
318          ctx.spills_entry[block_idx][spilled.first] = spilled.second;
319          spilled_registers += spilled.first;
320          loop_demand -= spilled.first;
321       }
322       if (!ctx.loop.empty()) {
323          /* If this is a nested loop, keep variables from the outer loop spilled. */
324          for (auto spilled : ctx.loop.back().spills) {
325             /* If the inner loop comes after the last continue statement of the outer loop,
326              * the loop-carried variables might not be live-in for the inner loop.
327              */
328             if (live_in.count(spilled.first.id()) &&
329                 ctx.spills_entry[block_idx].insert(spilled).second) {
330                spilled_registers += spilled.first;
331                loop_demand -= spilled.first;
332             }
333          }
334       }
335 
336       /* select more live-through variables and constants */
337       RegType type = RegType::vgpr;
338       while (loop_demand.exceeds(ctx.target_pressure)) {
339          /* if VGPR demand is low enough, select SGPRs */
340          if (type == RegType::vgpr && loop_demand.vgpr <= ctx.target_pressure.vgpr)
341             type = RegType::sgpr;
342          /* if SGPR demand is low enough, break */
343          if (type == RegType::sgpr && loop_demand.sgpr <= ctx.target_pressure.sgpr)
344             break;
345 
          /* prefer rematerializable candidates; among those, pick the one
           * with the highest use-distance score */
346          float score = 0.0;
347          unsigned remat = 0;
348          Temp to_spill;
349          for (unsigned t : live_in) {
350             Temp var = Temp(t, ctx.program->temp_rc[t]);
351             if (var.type() != type || ctx.spills_entry[block_idx].count(var) ||
352                 var.regClass().is_linear_vgpr())
353                continue;
354 
355             unsigned can_remat = ctx.remat.count(var);
356             if (can_remat > remat || (can_remat == remat && ctx.ssa_infos[t].score() > score)) {
357                to_spill = var;
358                score = ctx.ssa_infos[t].score();
359                remat = can_remat;
360             }
361          }
362 
363          /* select SGPRs or break */
364          if (score == 0.0) {
365             if (type == RegType::sgpr)
366                break;
367             type = RegType::sgpr;
368             continue;
369          }
370 
371          ctx.add_to_spills(to_spill, ctx.spills_entry[block_idx]);
372          spilled_registers += to_spill;
373          loop_demand -= to_spill;
374       }
375 
376       /* create new loop_info */
377       loop_info info = {block_idx, ctx.spills_entry[block_idx], live_in};
378       ctx.loop.emplace_back(std::move(info));
379 
380       /* shortcut */
381       if (!loop_demand.exceeds(ctx.target_pressure))
382          return spilled_registers;
383 
384       /* if reg pressure is too high at beginning of loop, add variables with furthest use */
385       reg_pressure -= spilled_registers;
386 
387       while (reg_pressure.exceeds(ctx.target_pressure)) {
388          float score = 0;
389          Temp to_spill;
390          type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;
          /* only phi definitions remain as candidates at this point:
           * phis are grouped at the top of the block, so stop at the first
           * non-phi instruction */
391          for (aco_ptr<Instruction>& phi : block->instructions) {
392             if (!is_phi(phi))
393                break;
394             if (!phi->definitions[0].isTemp() || phi->definitions[0].isKill())
395                continue;
396             Temp var = phi->definitions[0].getTemp();
397             if (var.type() == type && !ctx.spills_entry[block_idx].count(var) &&
398                 ctx.ssa_infos[var.id()].score() > score) {
399                to_spill = var;
400                score = ctx.ssa_infos[var.id()].score();
401             }
402          }
403          assert(score != 0.0);
404          ctx.add_to_spills(to_spill, ctx.spills_entry[block_idx]);
405          spilled_registers += to_spill;
406          reg_pressure -= to_spill;
407       }
408 
409       return spilled_registers;
410    }
411 
412    /* branch block */
413    if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) {
414       /* keep variables spilled */
415       unsigned pred_idx = block->linear_preds[0];
416       for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
417          if (pair.first.type() != RegType::sgpr)
418             continue;
419 
420          if (live_in.count(pair.first.id())) {
421             spilled_registers += pair.first;
422             ctx.spills_entry[block_idx].emplace(pair);
423          }
424       }
425 
426       if (block->logical_preds.empty())
427          return spilled_registers;
428 
       /* VGPRs follow the logical CFG, so take them from the logical predecessor */
429       pred_idx = block->logical_preds[0];
430       for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
431          if (pair.first.type() != RegType::vgpr)
432             continue;
433 
434          if (live_in.count(pair.first.id())) {
435             spilled_registers += pair.first;
436             ctx.spills_entry[block_idx].emplace(pair);
437          }
438       }
439 
440       return spilled_registers;
441    }
442 
443    /* else: merge block */
    /* candidates that are not kept spilled; the value records whether the
     * variable is spilled on at least one incoming edge */
444    std::map<Temp, bool> partial_spills;
445 
446    /* keep variables spilled on all incoming paths */
447    for (unsigned t : live_in) {
448       const RegClass rc = ctx.program->temp_rc[t];
449       Temp var = Temp(t, rc);
450       Block::edge_vec& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
451 
452       /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload
453        * it. Otherwise, if any predecessor reloads it, ensure it's reloaded on all other
454        * predecessors. The idea is that it's better in practice to rematerialize redundantly than to
455        * create lots of phis. */
456       const bool remat = ctx.remat.count(var);
457       /* If the variable is spilled at the current loop-header, spilling is essentially for free
458        * while reloading is not. Thus, keep them spilled if they are at least partially spilled.
459        */
460       const bool avoid_respill = block->loop_nest_depth && ctx.loop.back().spills.count(var);
461       bool spill = true;
462       bool partial_spill = false;
463       uint32_t spill_id = 0;
464       for (unsigned pred_idx : preds) {
465          if (!ctx.spills_exit[pred_idx].count(var)) {
466             spill = false;
467          } else {
468             partial_spill = true;
469             /* it might be that on one incoming path, the variable has a different spill_id, but
470              * add_couple_code() will take care of that. */
471             spill_id = ctx.spills_exit[pred_idx][var];
472          }
473       }
474       spill |= (remat && partial_spill);
475       spill |= (avoid_respill && partial_spill);
476       if (spill) {
477          ctx.spills_entry[block_idx][var] = spill_id;
478          partial_spills.erase(var);
479          spilled_registers += var;
480       } else {
481          partial_spills[var] = partial_spill;
482       }
483    }
484 
485    /* same for phis */
486    for (aco_ptr<Instruction>& phi : block->instructions) {
487       if (!is_phi(phi))
488          break;
489       if (!phi->definitions[0].isTemp() || phi->definitions[0].isKill())
490          continue;
491 
492       Block::edge_vec& preds =
493          phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
494       bool is_all_undef = true;
495       bool is_all_spilled = true;
496       bool is_partial_spill = false;
497       for (unsigned i = 0; i < phi->operands.size(); i++) {
498          if (phi->operands[i].isUndefined())
499             continue;
500          bool spilled = phi->operands[i].isTemp() &&
501                         ctx.spills_exit[preds[i]].count(phi->operands[i].getTemp());
502          is_all_spilled &= spilled;
503          is_partial_spill |= spilled;
504          is_all_undef = false;
505       }
506 
507       if (is_all_spilled && !is_all_undef) {
508          /* The phi is spilled at all predecessors. Keep it spilled. */
509          ctx.add_to_spills(phi->definitions[0].getTemp(), ctx.spills_entry[block_idx]);
510          spilled_registers += phi->definitions[0].getTemp();
511          partial_spills.erase(phi->definitions[0].getTemp());
512       } else {
513          /* Phis might increase the register pressure. */
514          partial_spills[phi->definitions[0].getTemp()] = is_partial_spill;
515       }
516    }
517 
518    /* if reg pressure at first instruction is still too high, add partially spilled variables */
519    RegisterDemand reg_pressure = block->live_in_demand;
520    reg_pressure -= spilled_registers;
521 
522    while (reg_pressure.exceeds(ctx.target_pressure)) {
523       assert(!partial_spills.empty());
524       std::map<Temp, bool>::iterator it = partial_spills.begin();
525       Temp to_spill = Temp();
526       bool is_partial_spill = false;
527       float score = 0.0;
528       RegType type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;
529 
       /* prefer candidates that are already spilled on some edge; among equals,
        * pick the one with the highest use-distance score */
530       while (it != partial_spills.end()) {
531          assert(!ctx.spills_entry[block_idx].count(it->first));
532 
533          if (it->first.type() == type && !it->first.regClass().is_linear_vgpr() &&
534              ((it->second && !is_partial_spill) ||
535               (it->second == is_partial_spill && ctx.ssa_infos[it->first.id()].score() > score))) {
536             score = ctx.ssa_infos[it->first.id()].score();
537             to_spill = it->first;
538             is_partial_spill = it->second;
539          }
540          ++it;
541       }
542       assert(score != 0.0);
543       ctx.add_to_spills(to_spill, ctx.spills_entry[block_idx]);
544       partial_spills.erase(to_spill);
545       spilled_registers += to_spill;
546       reg_pressure -= to_spill;
547    }
548 
549    return spilled_registers;
550 }
551 
552 void
add_coupling_code(spill_ctx & ctx,Block * block,IDSet & live_in)553 add_coupling_code(spill_ctx& ctx, Block* block, IDSet& live_in)
554 {
555    const unsigned block_idx = block->index;
556    /* No coupling code necessary */
557    if (block->linear_preds.size() == 0)
558       return;
559 
560    /* Branch block: update renames */
561    if (block->linear_preds.size() == 1 &&
562        !(block->kind & (block_kind_loop_exit | block_kind_loop_header))) {
563       assert(ctx.processed[block->linear_preds[0]]);
564 
565       ctx.renames[block_idx] = ctx.renames[block->linear_preds[0]];
566       if (!block->logical_preds.empty() && block->logical_preds[0] != block->linear_preds[0]) {
567          for (auto it : ctx.renames[block->logical_preds[0]]) {
568             if (it.first.type() == RegType::vgpr)
569                ctx.renames[block_idx].insert_or_assign(it.first, it.second);
570          }
571       }
572       return;
573    }
574 
575    std::vector<aco_ptr<Instruction>> instructions;
576 
577    /* loop header and merge blocks: check if all (linear) predecessors have been processed */
578    for (ASSERTED unsigned pred : block->linear_preds)
579       assert(ctx.processed[pred]);
580 
581    /* iterate the phi nodes for which operands to spill at the predecessor */
582    for (aco_ptr<Instruction>& phi : block->instructions) {
583       if (!is_phi(phi))
584          break;
585 
586       for (const Operand& op : phi->operands) {
587          if (op.isTemp())
588             ctx.ssa_infos[op.tempId()].num_uses--;
589       }
590 
591       /* The phi is not spilled */
592       if (!phi->definitions[0].isTemp() ||
593           !ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp()))
594          continue;
595 
596       Block::edge_vec& preds =
597          phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
598       uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];
599       phi->definitions[0].setKill(true);
600 
601       for (unsigned i = 0; i < phi->operands.size(); i++) {
602          if (phi->operands[i].isUndefined())
603             continue;
604 
605          unsigned pred_idx = preds[i];
606          Operand spill_op = phi->operands[i];
607          phi->operands[i] = Operand(phi->definitions[0].regClass());
608 
609          if (spill_op.isTemp()) {
610             assert(spill_op.isKill());
611             Temp var = spill_op.getTemp();
612 
613             std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
614             /* prevent the defining instruction from being DCE'd if it could be rematerialized */
615             if (rename_it == ctx.renames[preds[i]].end() && ctx.remat.count(var))
616                ctx.unused_remats.erase(ctx.remat[var].instr);
617 
618             /* check if variable is already spilled at predecessor */
619             auto spilled = ctx.spills_exit[pred_idx].find(var);
620             if (spilled != ctx.spills_exit[pred_idx].end()) {
621                if (spilled->second != def_spill_id)
622                   ctx.add_affinity(def_spill_id, spilled->second);
623                continue;
624             }
625 
626             /* If the phi operand has the same name as the definition,
627              * add to predecessor's spilled variables, so that it gets
628              * skipped in the loop below.
629              */
630             if (var == phi->definitions[0].getTemp())
631                ctx.spills_exit[pred_idx][var] = def_spill_id;
632 
633             /* rename if necessary */
634             if (rename_it != ctx.renames[pred_idx].end()) {
635                spill_op.setTemp(rename_it->second);
636                ctx.renames[pred_idx].erase(rename_it);
637             }
638          }
639 
640          /* add interferences */
641          for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx])
642             ctx.add_interference(def_spill_id, pair.second);
643 
644          aco_ptr<Instruction> spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
645          spill->operands[0] = spill_op;
646          spill->operands[1] = Operand::c32(def_spill_id);
647          Block& pred = ctx.program->blocks[pred_idx];
648          unsigned idx = pred.instructions.size();
649          do {
650             assert(idx != 0);
651             idx--;
652          } while (phi->opcode == aco_opcode::p_phi &&
653                   pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
654          std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
655          pred.instructions.insert(it, std::move(spill));
656       }
657    }
658 
659    /* iterate all (other) spilled variables for which to spill at the predecessor */
660    // TODO: would be better to have them sorted: first vgprs and first with longest distance
661    for (std::pair<Temp, uint32_t> pair : ctx.spills_entry[block_idx]) {
662       /* if variable is not live-in, it must be from a phi: this works because of CSSA form */
663       if (!live_in.count(pair.first.id()))
664          continue;
665 
666       Block::edge_vec& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
667       for (unsigned pred_idx : preds) {
668          /* variable is already spilled at predecessor */
669          auto spilled = ctx.spills_exit[pred_idx].find(pair.first);
670          if (spilled != ctx.spills_exit[pred_idx].end()) {
671             if (spilled->second != pair.second)
672                ctx.add_affinity(pair.second, spilled->second);
673             continue;
674          }
675 
676          /* If this variable is spilled through the entire loop, no need to re-spill.
677           * It can be reloaded from the same spill-slot it got at the loop-preheader.
678           * No need to add interferences since every spilled variable in the loop already
679           * interferes with the spilled loop-variables. Make sure that the spill_ids match.
680           */
681          const uint32_t loop_nest_depth = std::min(ctx.program->blocks[pred_idx].loop_nest_depth,
682                                                    ctx.program->blocks[block_idx].loop_nest_depth);
683          if (loop_nest_depth) {
684             auto spill = ctx.loop[loop_nest_depth - 1].spills.find(pair.first);
685             if (spill != ctx.loop[loop_nest_depth - 1].spills.end() && spill->second == pair.second)
686                continue;
687          }
688 
689          /* add interferences between spilled variable and predecessors exit spills */
690          for (std::pair<Temp, uint32_t> exit_spill : ctx.spills_exit[pred_idx])
691             ctx.add_interference(exit_spill.second, pair.second);
692 
693          /* variable is in register at predecessor and has to be spilled */
694          /* rename if necessary */
695          Temp var = pair.first;
696          std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
697          if (rename_it != ctx.renames[pred_idx].end()) {
698             var = rename_it->second;
699             ctx.renames[pred_idx].erase(rename_it);
700          }
701 
702          aco_ptr<Instruction> spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
703          spill->operands[0] = Operand(var);
704          spill->operands[1] = Operand::c32(pair.second);
705          Block& pred = ctx.program->blocks[pred_idx];
706          unsigned idx = pred.instructions.size();
707          do {
708             assert(idx != 0);
709             idx--;
710          } while (pair.first.type() == RegType::vgpr &&
711                   pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
712          std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
713          pred.instructions.insert(it, std::move(spill));
714       }
715    }
716 
717    /* iterate phis for which operands to reload */
718    for (aco_ptr<Instruction>& phi : block->instructions) {
719       if (!is_phi(phi))
720          break;
721       if (phi->definitions[0].isKill())
722          continue;
723 
724       assert(!phi->definitions[0].isTemp() ||
725              !ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp()));
726 
727       Block::edge_vec& preds =
728          phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
729       for (unsigned i = 0; i < phi->operands.size(); i++) {
730          if (!phi->operands[i].isTemp())
731             continue;
732          unsigned pred_idx = preds[i];
733 
734          /* if the operand was reloaded, rename */
735          if (!ctx.spills_exit[pred_idx].count(phi->operands[i].getTemp())) {
736             std::map<Temp, Temp>::iterator it =
737                ctx.renames[pred_idx].find(phi->operands[i].getTemp());
738             if (it != ctx.renames[pred_idx].end()) {
739                phi->operands[i].setTemp(it->second);
740                /* prevent the defining instruction from being DCE'd if it could be rematerialized */
741             } else {
742                auto remat_it = ctx.remat.find(phi->operands[i].getTemp());
743                if (remat_it != ctx.remat.end()) {
744                   ctx.unused_remats.erase(remat_it->second.instr);
745                }
746             }
747             continue;
748          }
749 
750          Temp tmp = phi->operands[i].getTemp();
751 
752          /* reload phi operand at end of predecessor block */
753          Temp new_name = ctx.program->allocateTmp(tmp.regClass());
754          Block& pred = ctx.program->blocks[pred_idx];
755          unsigned idx = pred.instructions.size();
756          do {
757             assert(idx != 0);
758             idx--;
759          } while (phi->opcode == aco_opcode::p_phi &&
760                   pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
761          std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
762          aco_ptr<Instruction> reload =
763             do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]);
764 
765          /* reload spilled exec mask directly to exec */
766          if (!phi->definitions[0].isTemp()) {
767             assert(phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec);
768             reload->definitions[0] = phi->definitions[0];
769             phi->operands[i] = Operand(exec, ctx.program->lane_mask);
770          } else {
771             ctx.spills_exit[pred_idx].erase(tmp);
772             ctx.renames[pred_idx][tmp] = new_name;
773             phi->operands[i].setTemp(new_name);
774          }
775 
776          pred.instructions.insert(it, std::move(reload));
777       }
778    }
779 
780    /* iterate live variables for which to reload */
781    for (unsigned t : live_in) {
782       const RegClass rc = ctx.program->temp_rc[t];
783       Temp var = Temp(t, rc);
784 
785       /* skip spilled variables */
786       if (ctx.spills_entry[block_idx].count(var))
787          continue;
788 
789       Block::edge_vec& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
790       for (unsigned pred_idx : preds) {
791          /* skip if the variable is not spilled at the predecessor */
792          if (!ctx.spills_exit[pred_idx].count(var))
793             continue;
794 
795          /* variable is spilled at predecessor and has to be reloaded */
796          Temp new_name = ctx.program->allocateTmp(rc);
797          Block& pred = ctx.program->blocks[pred_idx];
798          unsigned idx = pred.instructions.size();
799          do {
800             assert(idx != 0);
801             idx--;
802          } while (rc.type() == RegType::vgpr &&
803                   pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
804          std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
805 
806          aco_ptr<Instruction> reload =
807             do_reload(ctx, var, new_name, ctx.spills_exit[pred.index][var]);
808          pred.instructions.insert(it, std::move(reload));
809 
810          ctx.spills_exit[pred.index].erase(var);
811          ctx.renames[pred.index][var] = new_name;
812       }
813 
814       /* check if we have to create a new phi for this variable */
815       Temp rename = Temp();
816       bool is_same = true;
817       for (unsigned pred_idx : preds) {
818          if (!ctx.renames[pred_idx].count(var)) {
819             if (rename == Temp())
820                rename = var;
821             else
822                is_same = rename == var;
823          } else {
824             if (rename == Temp())
825                rename = ctx.renames[pred_idx][var];
826             else
827                is_same = rename == ctx.renames[pred_idx][var];
828          }
829 
830          if (!is_same)
831             break;
832       }
833 
834       if (!is_same) {
835          /* the variable was renamed differently in the predecessors: we have to create a phi */
836          aco_opcode opcode = rc.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
837          aco_ptr<Instruction> phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
838          rename = ctx.program->allocateTmp(rc);
839          for (unsigned i = 0; i < phi->operands.size(); i++) {
840             Temp tmp;
841             if (ctx.renames[preds[i]].count(var)) {
842                tmp = ctx.renames[preds[i]][var];
843             } else if (preds[i] >= block_idx) {
844                tmp = rename;
845             } else {
846                tmp = var;
847                /* prevent the defining instruction from being DCE'd if it could be rematerialized */
848                if (ctx.remat.count(tmp))
849                   ctx.unused_remats.erase(ctx.remat[tmp].instr);
850             }
851             phi->operands[i] = Operand(tmp);
852          }
853          phi->definitions[0] = Definition(rename);
854          phi->register_demand = block->live_in_demand;
855          block->instructions.insert(block->instructions.begin(), std::move(phi));
856       }
857 
858       /* the variable was renamed: add new name to renames */
859       if (!(rename == Temp() || rename == var))
860          ctx.renames[block_idx][var] = rename;
861    }
862 }
863 
/* Processes the non-phi instructions of a single block: decrements use counts,
 * renames operands according to the current rename map, reloads spilled
 * operands before their use and, while the register demand still exceeds the
 * target pressure, selects additional variables to spill (preferring
 * rematerializable variables and variables already spilled at the current
 * loop header, then the highest use-distance score).
 * Rewrites block->instructions with the new instruction sequence.
 */
void
process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand spilled_registers)
{
   assert(!ctx.processed[block_idx]);

   std::vector<aco_ptr<Instruction>> instructions;
   unsigned idx = 0;

   /* phis are handled separately */
   while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
          block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
      const Definition def = block->instructions[idx]->definitions[0];
      /* make phi definitions part of the block's live-in set */
      if (def.isTemp() && !def.isKill() && def.tempId() < ctx.ssa_infos.size())
         ctx.program->live.live_in[block_idx].insert(def.tempId());
      instructions.emplace_back(std::move(block->instructions[idx++]));
   }

   /* spills_exit starts as a copy of spills_entry (set up by the caller) and is
    * updated in place as variables get reloaded or spilled below */
   auto& current_spills = ctx.spills_exit[block_idx];

   while (idx < block->instructions.size()) {
      aco_ptr<Instruction>& instr = block->instructions[idx];

      /* Spilling is handled as part of phis (they should always have the same or higher register
       * demand). If we try to spill here, we might not be able to reduce the register demand enough
       * because there is no path to spill constant/undef phi operands. */
      if (instr->opcode == aco_opcode::p_branch) {
         instructions.emplace_back(std::move(instr));
         idx++;
         continue;
      }

      /* maps the new temp of each reload to (original variable, spill id) */
      std::map<Temp, std::pair<Temp, uint32_t>> reloads;

      /* rename and reload operands */
      for (Operand& op : instr->operands) {
         if (!op.isTemp())
            continue;

         if (op.isFirstKill())
            ctx.program->live.live_in[block_idx].erase(op.tempId());
         /* consume one use; keeps the use-distance score approximation current */
         ctx.ssa_infos[op.tempId()].num_uses--;

         if (!current_spills.count(op.getTemp()))
            continue;

         /* the Operand is spilled: add it to reloads */
         Temp new_tmp = ctx.program->allocateTmp(op.regClass());
         ctx.renames[block_idx][op.getTemp()] = new_tmp;
         reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]);
         current_spills.erase(op.getTemp());
         spilled_registers -= new_tmp;
      }

      /* check if register demand is low enough during and after the current instruction */
      if (block->register_demand.exceeds(ctx.target_pressure)) {
         RegisterDemand new_demand = instr->register_demand;

         /* if reg pressure is too high, spill variable with furthest next use */
         while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) {
            float score = 0.0;
            Temp to_spill;
            unsigned do_rematerialize = 0;
            unsigned avoid_respill = 0;
            /* spill from whichever register file still exceeds its target */
            RegType type = RegType::sgpr;
            if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr)
               type = RegType::vgpr;

            /* Candidate selection: rematerializable > already spilled at the
             * loop header > highest use-distance score. */
            for (unsigned t : ctx.program->live.live_in[block_idx]) {
               RegClass rc = ctx.program->temp_rc[t];
               Temp var = Temp(t, rc);
               if (rc.type() != type || current_spills.count(var) || rc.is_linear_vgpr())
                  continue;

               unsigned can_rematerialize = ctx.remat.count(var);
               unsigned loop_variable = block->loop_nest_depth && ctx.loop.back().spills.count(var);
               if (avoid_respill > loop_variable || do_rematerialize > can_rematerialize)
                  continue;

               if (can_rematerialize > do_rematerialize || loop_variable > avoid_respill ||
                   ctx.ssa_infos[t].score() > score) {
                  /* Don't spill operands */
                  if (std::any_of(instr->operands.begin(), instr->operands.end(),
                                  [&](Operand& op) { return op.isTemp() && op.getTemp() == var; }))
                     continue;

                  to_spill = var;
                  score = ctx.ssa_infos[t].score();
                  do_rematerialize = can_rematerialize;
                  avoid_respill = loop_variable;
               }
            }
            /* a candidate must always exist, otherwise demand cannot be reduced */
            assert(score != 0.0);

            if (avoid_respill) {
               /* This variable is spilled at the loop-header of the current loop.
                * Re-use the spill-slot in order to avoid an extra store.
                */
               current_spills[to_spill] = ctx.loop.back().spills[to_spill];
               spilled_registers += to_spill;
               continue;
            }

            uint32_t spill_id = ctx.add_to_spills(to_spill, current_spills);
            /* add interferences with reloads */
            for (std::pair<const Temp, std::pair<Temp, uint32_t>>& pair : reloads)
               ctx.add_interference(spill_id, pair.second.second);

            spilled_registers += to_spill;

            /* rename if necessary */
            if (ctx.renames[block_idx].count(to_spill)) {
               to_spill = ctx.renames[block_idx][to_spill];
            }

            /* add spill to new instructions */
            aco_ptr<Instruction> spill{
               create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
            spill->operands[0] = Operand(to_spill);
            spill->operands[1] = Operand::c32(spill_id);
            instructions.emplace_back(std::move(spill));
         }
      }

      /* definitions become live from here on */
      for (const Definition& def : instr->definitions) {
         if (def.isTemp() && !def.isKill())
            ctx.program->live.live_in[block_idx].insert(def.tempId());
      }
      /* rename operands */
      for (Operand& op : instr->operands) {
         if (op.isTemp()) {
            auto rename_it = ctx.renames[block_idx].find(op.getTemp());
            if (rename_it != ctx.renames[block_idx].end()) {
               op.setTemp(rename_it->second);
            } else {
               /* prevent its defining instruction from being DCE'd if it could be rematerialized */
               auto remat_it = ctx.remat.find(op.getTemp());
               if (remat_it != ctx.remat.end()) {
                  ctx.unused_remats.erase(remat_it->second.instr);
               }
            }
         }
      }

      /* add reloads and instruction to new instructions */
      for (std::pair<const Temp, std::pair<Temp, uint32_t>>& pair : reloads) {
         aco_ptr<Instruction> reload =
            do_reload(ctx, pair.second.first, pair.first, pair.second.second);
         instructions.emplace_back(std::move(reload));
      }
      instructions.emplace_back(std::move(instr));
      idx++;
   }

   block->instructions = std::move(instructions);
}
1019 
/* Performs spilling for one block: computes the entry spill set, inserts
 * coupling (spill/reload) code on incoming edges for non-loop-header blocks
 * and processes the block body. When the next block leaves the current loop,
 * additionally emits the deferred loop-header coupling code and propagates the
 * resulting renames through the entire loop body to repair SSA form.
 */
void
spill_block(spill_ctx& ctx, unsigned block_idx)
{
   Block* block = &ctx.program->blocks[block_idx];

   /* determine set of variables which are spilled at the beginning of the block */
   RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx);

   if (!(block->kind & block_kind_loop_header)) {
      /* add spill/reload code on incoming control flow edges */
      add_coupling_code(ctx, block, ctx.program->live.live_in[block_idx]);
   }

   assert(ctx.spills_exit[block_idx].empty());
   ctx.spills_exit[block_idx] = ctx.spills_entry[block_idx];
   process_block(ctx, block_idx, block, spilled_registers);

   ctx.processed[block_idx] = true;

   /* check if the next block leaves the current loop */
   if (block->loop_nest_depth == 0 ||
       ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth)
      return;

   /* the loop ends here: handle the loop header's coupling code now that the
    * whole loop (including the back-edge predecessor) has been processed */
   uint32_t loop_header_idx = ctx.loop.back().index;

   /* preserve original renames at end of loop header block */
   aco::map<Temp, Temp> renames = std::move(ctx.renames[loop_header_idx]);

   /* add coupling code to all loop header predecessors */
   for (unsigned t : ctx.loop.back().live_in)
      ctx.ssa_infos[t].num_uses--;
   add_coupling_code(ctx, &ctx.program->blocks[loop_header_idx], ctx.loop.back().live_in);
   /* restore the original loop-header renames; `renames` afterwards holds the
    * renames newly introduced by the coupling code */
   renames.swap(ctx.renames[loop_header_idx]);

   /* remove loop header info from stack */
   ctx.loop.pop_back();
   if (renames.empty())
      return;

   /* Add the new renames to each block */
   for (std::pair<Temp, Temp> rename : renames) {
      /* If there is already a rename, don't overwrite it. */
      for (unsigned idx = loop_header_idx; idx <= block_idx; idx++)
         ctx.renames[idx].insert(rename);
   }

   /* propagate new renames through loop: i.e. repair the SSA */
   for (unsigned idx = loop_header_idx; idx <= block_idx; idx++) {
      Block& current = ctx.program->blocks[idx];
      /* rename all uses in this block */
      for (aco_ptr<Instruction>& instr : current.instructions) {
         /* no need to rename the loop header phis once again. */
         if (idx == loop_header_idx && is_phi(instr))
            continue;

         for (Operand& op : instr->operands) {
            if (!op.isTemp())
               continue;

            auto rename = renames.find(op.getTemp());
            if (rename != renames.end())
               op.setTemp(rename->second);
         }
      }
   }
}
1087 
1088 Temp
load_scratch_resource(spill_ctx & ctx,Builder & bld,bool apply_scratch_offset)1089 load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset)
1090 {
1091    Temp private_segment_buffer = ctx.program->private_segment_buffer;
1092    if (!private_segment_buffer.bytes()) {
1093       Temp addr_lo =
1094          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
1095       Temp addr_hi =
1096          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
1097       private_segment_buffer =
1098          bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
1099    } else if (ctx.program->stage.hw != AC_HW_COMPUTE_SHADER) {
1100       private_segment_buffer =
1101          bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero());
1102    }
1103 
1104    if (apply_scratch_offset) {
1105       Temp addr_lo = bld.tmp(s1);
1106       Temp addr_hi = bld.tmp(s1);
1107       bld.pseudo(aco_opcode::p_split_vector, Definition(addr_lo), Definition(addr_hi),
1108                  private_segment_buffer);
1109 
1110       Temp carry = bld.tmp(s1);
1111       addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo,
1112                          ctx.program->scratch_offset);
1113       addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi,
1114                          Operand::c32(0), bld.scc(carry));
1115 
1116       private_segment_buffer =
1117          bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
1118    }
1119 
1120    struct ac_buffer_state ac_state = {0};
1121    uint32_t desc[4];
1122 
1123    ac_state.size = 0xffffffff;
1124    ac_state.format = PIPE_FORMAT_R32_FLOAT;
1125    for (int i = 0; i < 4; i++)
1126       ac_state.swizzle[i] = PIPE_SWIZZLE_0;
1127    /* older generations need element size = 4 bytes. element size removed in GFX9 */
1128    ac_state.element_size = ctx.program->gfx_level <= GFX8 ? 1u : 0u;
1129    ac_state.index_stride = ctx.program->wave_size == 64 ? 3u : 2u;
1130    ac_state.add_tid = true;
1131    ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
1132 
1133    ac_build_buffer_descriptor(ctx.program->gfx_level, &ac_state, desc);
1134 
1135    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
1136                      Operand::c32(desc[2]), Operand::c32(desc[3]));
1137 }
1138 
/* Prepares a VGPR spill/reload for the given spill slot: makes sure
 * ctx.scratch_rsrc holds the scratch resource (pre-GFX9) or the scratch
 * saddr (GFX9+), emits any setup instructions (hoisted to the dominating
 * top-level block where possible) and returns via *offset the constant byte
 * offset the spill/reload instruction should use. `scratch_offset` may be
 * replaced with a freshly computed soffset on pre-GFX9 when the constant
 * offset range would overflow.
 */
void
setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
                        std::vector<aco_ptr<Instruction>>& instructions, uint32_t spill_slot,
                        Temp& scratch_offset, unsigned* offset)
{
   /* scratch bytes available per lane */
   uint32_t scratch_size = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;

   /* maximum constant offset usable by the spill/reload instructions */
   uint32_t offset_range;
   if (ctx.program->gfx_level >= GFX9) {
      offset_range =
         ctx.program->dev.scratch_global_offset_max - ctx.program->dev.scratch_global_offset_min;
   } else {
      /* pre-GFX9 the constant offset is limited to 4095 bytes */
      if (scratch_size < 4095)
         offset_range = 4095 - scratch_size;
      else
         offset_range = 0;
   }

   /* does the highest spill slot exceed the constant offset range? */
   bool overflow = (ctx.vgpr_spill_slots - 1) * 4 > offset_range;

   Builder rsrc_bld(ctx.program);
   if (block.kind & block_kind_top_level) {
      rsrc_bld.reset(&instructions);
   } else if (ctx.scratch_rsrc == Temp() && (!overflow || ctx.program->gfx_level < GFX9)) {
      /* hoist the resource setup into the dominating top-level block so it can
       * be reused by later spills/reloads */
      Block* tl_block = &block;
      while (!(tl_block->kind & block_kind_top_level))
         tl_block = &ctx.program->blocks[tl_block->linear_idom];

      /* find p_logical_end */
      std::vector<aco_ptr<Instruction>>& prev_instructions = tl_block->instructions;
      unsigned idx = prev_instructions.size() - 1;
      while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end)
         idx--;
      rsrc_bld.reset(&prev_instructions, std::next(prev_instructions.begin(), idx));
   }

   /* If spilling overflows the constant offset range at any point, we need to emit the soffset
    * before every spill/reload to avoid increasing register demand.
    */
   Builder offset_bld = rsrc_bld;
   if (overflow)
      offset_bld.reset(&instructions);

   *offset = spill_slot * 4;
   if (ctx.program->gfx_level >= GFX9) {
      *offset += ctx.program->dev.scratch_global_offset_min;

      if (ctx.scratch_rsrc == Temp() || overflow) {
         /* base saddr so that offset 0 addresses the start of the spill area */
         int32_t saddr = scratch_size - ctx.program->dev.scratch_global_offset_min;
         if ((int32_t)*offset > (int32_t)ctx.program->dev.scratch_global_offset_max) {
            /* fold the whole offset into saddr when it exceeds the immediate range */
            saddr += (int32_t)*offset;
            *offset = 0;
         }

         /* GFX9+ uses scratch_* instructions, which don't use a resource. */
         ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr));
      }
   } else {
      if (ctx.scratch_rsrc == Temp())
         ctx.scratch_rsrc = load_scratch_resource(ctx, rsrc_bld, overflow);

      if (overflow) {
         /* fold the per-slot offset into a freshly materialized soffset */
         uint32_t soffset =
            ctx.program->config->scratch_bytes_per_wave + *offset * ctx.program->wave_size;
         *offset = 0;

         scratch_offset = offset_bld.copy(offset_bld.def(s1), Operand::c32(soffset));
      } else {
         *offset += scratch_size;
      }
   }
}
1211 
1212 void
spill_vgpr(spill_ctx & ctx,Block & block,std::vector<aco_ptr<Instruction>> & instructions,aco_ptr<Instruction> & spill,std::vector<uint32_t> & slots)1213 spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& instructions,
1214            aco_ptr<Instruction>& spill, std::vector<uint32_t>& slots)
1215 {
1216    ctx.program->config->spilled_vgprs += spill->operands[0].size();
1217 
1218    uint32_t spill_id = spill->operands[1].constantValue();
1219    uint32_t spill_slot = slots[spill_id];
1220 
1221    Temp scratch_offset = ctx.program->scratch_offset;
1222    unsigned offset;
1223    setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset);
1224 
1225    assert(spill->operands[0].isTemp());
1226    Temp temp = spill->operands[0].getTemp();
1227    assert(temp.type() == RegType::vgpr && !temp.is_linear());
1228 
1229    Builder bld(ctx.program, &instructions);
1230    if (temp.size() > 1) {
1231       Instruction* split{
1232          create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
1233       split->operands[0] = Operand(temp);
1234       for (unsigned i = 0; i < temp.size(); i++)
1235          split->definitions[i] = bld.def(v1);
1236       bld.insert(split);
1237       for (unsigned i = 0; i < temp.size(); i++, offset += 4) {
1238          Temp elem = split->definitions[i].getTemp();
1239          if (ctx.program->gfx_level >= GFX9) {
1240             bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, elem,
1241                         offset, memory_sync_info(storage_vgpr_spill, semantic_private));
1242          } else {
1243             Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc,
1244                                            Operand(v1), scratch_offset, elem, offset, false);
1245             instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1246             instr->mubuf().cache.value = ac_swizzled;
1247          }
1248       }
1249    } else if (ctx.program->gfx_level >= GFX9) {
1250       bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, temp, offset,
1251                   memory_sync_info(storage_vgpr_spill, semantic_private));
1252    } else {
1253       Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1),
1254                                      scratch_offset, temp, offset, false);
1255       instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1256       instr->mubuf().cache.value = ac_swizzled;
1257    }
1258 }
1259 
1260 void
reload_vgpr(spill_ctx & ctx,Block & block,std::vector<aco_ptr<Instruction>> & instructions,aco_ptr<Instruction> & reload,std::vector<uint32_t> & slots)1261 reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& instructions,
1262             aco_ptr<Instruction>& reload, std::vector<uint32_t>& slots)
1263 {
1264    uint32_t spill_id = reload->operands[0].constantValue();
1265    uint32_t spill_slot = slots[spill_id];
1266 
1267    Temp scratch_offset = ctx.program->scratch_offset;
1268    unsigned offset;
1269    setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset);
1270 
1271    Definition def = reload->definitions[0];
1272 
1273    Builder bld(ctx.program, &instructions);
1274    if (def.size() > 1) {
1275       Instruction* vec{
1276          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};
1277       vec->definitions[0] = def;
1278       for (unsigned i = 0; i < def.size(); i++, offset += 4) {
1279          Temp tmp = bld.tmp(v1);
1280          vec->operands[i] = Operand(tmp);
1281          if (ctx.program->gfx_level >= GFX9) {
1282             bld.scratch(aco_opcode::scratch_load_dword, Definition(tmp), Operand(v1),
1283                         ctx.scratch_rsrc, offset,
1284                         memory_sync_info(storage_vgpr_spill, semantic_private));
1285          } else {
1286             Instruction* instr =
1287                bld.mubuf(aco_opcode::buffer_load_dword, Definition(tmp), ctx.scratch_rsrc,
1288                          Operand(v1), scratch_offset, offset, false);
1289             instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1290             instr->mubuf().cache.value = ac_swizzled;
1291          }
1292       }
1293       bld.insert(vec);
1294    } else if (ctx.program->gfx_level >= GFX9) {
1295       bld.scratch(aco_opcode::scratch_load_dword, def, Operand(v1), ctx.scratch_rsrc, offset,
1296                   memory_sync_info(storage_vgpr_spill, semantic_private));
1297    } else {
1298       Instruction* instr = bld.mubuf(aco_opcode::buffer_load_dword, def, ctx.scratch_rsrc,
1299                                      Operand(v1), scratch_offset, offset, false);
1300       instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1301       instr->mubuf().cache.value = ac_swizzled;
1302    }
1303 }
1304 
1305 void
add_interferences(spill_ctx & ctx,std::vector<bool> & is_assigned,std::vector<uint32_t> & slots,std::vector<bool> & slots_used,unsigned id)1306 add_interferences(spill_ctx& ctx, std::vector<bool>& is_assigned, std::vector<uint32_t>& slots,
1307                   std::vector<bool>& slots_used, unsigned id)
1308 {
1309    for (unsigned other : ctx.interferences[id].second) {
1310       if (!is_assigned[other])
1311          continue;
1312 
1313       RegClass other_rc = ctx.interferences[other].first;
1314       unsigned slot = slots[other];
1315       std::fill(slots_used.begin() + slot, slots_used.begin() + slot + other_rc.size(), true);
1316    }
1317 }
1318 
1319 unsigned
find_available_slot(std::vector<bool> & used,unsigned wave_size,unsigned size,bool is_sgpr)1320 find_available_slot(std::vector<bool>& used, unsigned wave_size, unsigned size, bool is_sgpr)
1321 {
1322    unsigned wave_size_minus_one = wave_size - 1;
1323    unsigned slot = 0;
1324 
1325    while (true) {
1326       bool available = true;
1327       for (unsigned i = 0; i < size; i++) {
1328          if (slot + i < used.size() && used[slot + i]) {
1329             available = false;
1330             break;
1331          }
1332       }
1333       if (!available) {
1334          slot++;
1335          continue;
1336       }
1337 
1338       if (is_sgpr && ((slot & wave_size_minus_one) > wave_size - size)) {
1339          slot = align(slot, wave_size);
1340          continue;
1341       }
1342 
1343       std::fill(used.begin(), used.end(), false);
1344 
1345       if (slot + size > used.size())
1346          used.resize(slot + size);
1347 
1348       return slot;
1349    }
1350 }
1351 
1352 void
assign_spill_slots_helper(spill_ctx & ctx,RegType type,std::vector<bool> & is_assigned,std::vector<uint32_t> & slots,unsigned * num_slots)1353 assign_spill_slots_helper(spill_ctx& ctx, RegType type, std::vector<bool>& is_assigned,
1354                           std::vector<uint32_t>& slots, unsigned* num_slots)
1355 {
1356    std::vector<bool> slots_used;
1357 
1358    /* assign slots for ids with affinities first */
1359    for (std::vector<uint32_t>& vec : ctx.affinities) {
1360       if (ctx.interferences[vec[0]].first.type() != type)
1361          continue;
1362 
1363       for (unsigned id : vec) {
1364          if (!ctx.is_reloaded[id])
1365             continue;
1366 
1367          add_interferences(ctx, is_assigned, slots, slots_used, id);
1368       }
1369 
1370       unsigned slot = find_available_slot(
1371          slots_used, ctx.wave_size, ctx.interferences[vec[0]].first.size(), type == RegType::sgpr);
1372 
1373       for (unsigned id : vec) {
1374          assert(!is_assigned[id]);
1375 
1376          if (ctx.is_reloaded[id]) {
1377             slots[id] = slot;
1378             is_assigned[id] = true;
1379          }
1380       }
1381    }
1382 
1383    /* assign slots for ids without affinities */
1384    for (unsigned id = 0; id < ctx.interferences.size(); id++) {
1385       if (is_assigned[id] || !ctx.is_reloaded[id] || ctx.interferences[id].first.type() != type)
1386          continue;
1387 
1388       add_interferences(ctx, is_assigned, slots, slots_used, id);
1389 
1390       unsigned slot = find_available_slot(
1391          slots_used, ctx.wave_size, ctx.interferences[id].first.size(), type == RegType::sgpr);
1392 
1393       slots[id] = slot;
1394       is_assigned[id] = true;
1395    }
1396 
1397    *num_slots = slots_used.size();
1398 }
1399 
1400 void
end_unused_spill_vgprs(spill_ctx & ctx,Block & block,std::vector<Temp> & vgpr_spill_temps,const std::vector<uint32_t> & slots,const aco::unordered_map<Temp,uint32_t> & spills)1401 end_unused_spill_vgprs(spill_ctx& ctx, Block& block, std::vector<Temp>& vgpr_spill_temps,
1402                        const std::vector<uint32_t>& slots,
1403                        const aco::unordered_map<Temp, uint32_t>& spills)
1404 {
1405    std::vector<bool> is_used(vgpr_spill_temps.size());
1406    for (std::pair<Temp, uint32_t> pair : spills) {
1407       if (pair.first.type() == RegType::sgpr && ctx.is_reloaded[pair.second])
1408          is_used[slots[pair.second] / ctx.wave_size] = true;
1409    }
1410 
1411    std::vector<Temp> temps;
1412    for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
1413       if (vgpr_spill_temps[i].id() && !is_used[i]) {
1414          temps.push_back(vgpr_spill_temps[i]);
1415          vgpr_spill_temps[i] = Temp();
1416       }
1417    }
1418    if (temps.empty() || block.linear_preds.empty())
1419       return;
1420 
1421    aco_ptr<Instruction> destr{
1422       create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, temps.size(), 0)};
1423    for (unsigned i = 0; i < temps.size(); i++)
1424       destr->operands[i] = Operand(temps[i]);
1425 
1426    std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
1427    while (is_phi(*it))
1428       ++it;
1429    block.instructions.insert(it, std::move(destr));
1430 }
1431 
1432 void
assign_spill_slots(spill_ctx & ctx,unsigned spills_to_vgpr)1433 assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
1434 {
1435    std::vector<uint32_t> slots(ctx.interferences.size());
1436    std::vector<bool> is_assigned(ctx.interferences.size());
1437 
1438    /* first, handle affinities: just merge all interferences into both spill ids */
1439    for (std::vector<uint32_t>& vec : ctx.affinities) {
1440       for (unsigned i = 0; i < vec.size(); i++) {
1441          for (unsigned j = i + 1; j < vec.size(); j++) {
1442             assert(vec[i] != vec[j]);
1443             bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]];
1444             ctx.is_reloaded[vec[i]] = reloaded;
1445             ctx.is_reloaded[vec[j]] = reloaded;
1446          }
1447       }
1448    }
1449    for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++)
1450       for (ASSERTED uint32_t id : ctx.interferences[i].second)
1451          assert(i != id);
1452 
1453    /* for each spill slot, assign as many spill ids as possible */
1454    assign_spill_slots_helper(ctx, RegType::sgpr, is_assigned, slots, &ctx.sgpr_spill_slots);
1455    assign_spill_slots_helper(ctx, RegType::vgpr, is_assigned, slots, &ctx.vgpr_spill_slots);
1456 
1457    for (unsigned id = 0; id < is_assigned.size(); id++)
1458       assert(is_assigned[id] || !ctx.is_reloaded[id]);
1459 
1460    for (std::vector<uint32_t>& vec : ctx.affinities) {
1461       for (unsigned i = 0; i < vec.size(); i++) {
1462          for (unsigned j = i + 1; j < vec.size(); j++) {
1463             assert(is_assigned[vec[i]] == is_assigned[vec[j]]);
1464             if (!is_assigned[vec[i]])
1465                continue;
1466             assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]);
1467             assert(ctx.interferences[vec[i]].first.type() ==
1468                    ctx.interferences[vec[j]].first.type());
1469             assert(slots[vec[i]] == slots[vec[j]]);
1470          }
1471       }
1472    }
1473 
1474    /* hope, we didn't mess up */
1475    std::vector<Temp> vgpr_spill_temps((ctx.sgpr_spill_slots + ctx.wave_size - 1) / ctx.wave_size);
1476    assert(vgpr_spill_temps.size() <= spills_to_vgpr);
1477 
1478    /* replace pseudo instructions with actual hardware instructions */
1479    unsigned last_top_level_block_idx = 0;
1480    for (Block& block : ctx.program->blocks) {
1481 
1482       if (block.kind & block_kind_top_level) {
1483          last_top_level_block_idx = block.index;
1484 
1485          end_unused_spill_vgprs(ctx, block, vgpr_spill_temps, slots, ctx.spills_entry[block.index]);
1486 
1487          /* If the block has no predecessors (for example in RT resume shaders),
1488           * we cannot reuse the current scratch_rsrc temp because its definition is unreachable */
1489          if (block.linear_preds.empty())
1490             ctx.scratch_rsrc = Temp();
1491       }
1492 
1493       std::vector<aco_ptr<Instruction>>::iterator it;
1494       std::vector<aco_ptr<Instruction>> instructions;
1495       instructions.reserve(block.instructions.size());
1496       Builder bld(ctx.program, &instructions);
1497       for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1498 
1499          if ((*it)->opcode == aco_opcode::p_spill) {
1500             uint32_t spill_id = (*it)->operands[1].constantValue();
1501 
1502             if (!ctx.is_reloaded[spill_id]) {
1503                /* never reloaded, so don't spill */
1504             } else if (!is_assigned[spill_id]) {
1505                unreachable("No spill slot assigned for spill id");
1506             } else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {
1507                spill_vgpr(ctx, block, instructions, *it, slots);
1508             } else {
1509                ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
1510 
1511                uint32_t spill_slot = slots[spill_id];
1512 
1513                /* check if the linear vgpr already exists */
1514                if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
1515                   Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());
1516                   vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
1517                   aco_ptr<Instruction> create{
1518                      create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
1519                   create->definitions[0] = Definition(linear_vgpr);
1520                   /* find the right place to insert this definition */
1521                   if (last_top_level_block_idx == block.index) {
1522                      /* insert right before the current instruction */
1523                      instructions.emplace_back(std::move(create));
1524                   } else {
1525                      assert(last_top_level_block_idx < block.index);
1526                      /* insert after p_logical_end of the last top-level block */
1527                      std::vector<aco_ptr<Instruction>>& block_instrs =
1528                         ctx.program->blocks[last_top_level_block_idx].instructions;
1529                      auto insert_point =
1530                         std::find_if(block_instrs.rbegin(), block_instrs.rend(),
1531                                      [](const auto& iter) {
1532                                         return iter->opcode == aco_opcode::p_logical_end;
1533                                      })
1534                            .base();
1535                      block_instrs.insert(insert_point, std::move(create));
1536                   }
1537                }
1538 
1539                /* spill sgpr: just add the vgpr temp to operands */
1540                Instruction* spill = create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0);
1541                spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
1542                spill->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
1543                spill->operands[2] = (*it)->operands[0];
1544                instructions.emplace_back(aco_ptr<Instruction>(spill));
1545             }
1546 
1547          } else if ((*it)->opcode == aco_opcode::p_reload) {
1548             uint32_t spill_id = (*it)->operands[0].constantValue();
1549             assert(ctx.is_reloaded[spill_id]);
1550 
1551             if (!is_assigned[spill_id]) {
1552                unreachable("No spill slot assigned for spill id");
1553             } else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {
1554                reload_vgpr(ctx, block, instructions, *it, slots);
1555             } else {
1556                uint32_t spill_slot = slots[spill_id];
1557 
1558                /* check if the linear vgpr already exists */
1559                if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
1560                   Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());
1561                   vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
1562                   aco_ptr<Instruction> create{
1563                      create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
1564                   create->definitions[0] = Definition(linear_vgpr);
1565                   /* find the right place to insert this definition */
1566                   if (last_top_level_block_idx == block.index) {
1567                      /* insert right before the current instruction */
1568                      instructions.emplace_back(std::move(create));
1569                   } else {
1570                      assert(last_top_level_block_idx < block.index);
1571                      /* insert after p_logical_end of the last top-level block */
1572                      std::vector<aco_ptr<Instruction>>& block_instrs =
1573                         ctx.program->blocks[last_top_level_block_idx].instructions;
1574                      auto insert_point =
1575                         std::find_if(block_instrs.rbegin(), block_instrs.rend(),
1576                                      [](const auto& iter) {
1577                                         return iter->opcode == aco_opcode::p_logical_end;
1578                                      })
1579                            .base();
1580                      block_instrs.insert(insert_point, std::move(create));
1581                   }
1582                }
1583 
1584                /* reload sgpr: just add the vgpr temp to operands */
1585                Instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1);
1586                reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
1587                reload->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
1588                reload->definitions[0] = (*it)->definitions[0];
1589                instructions.emplace_back(aco_ptr<Instruction>(reload));
1590             }
1591          } else if (!ctx.unused_remats.count(it->get())) {
1592             instructions.emplace_back(std::move(*it));
1593          }
1594       }
1595       block.instructions = std::move(instructions);
1596    }
1597 
1598    /* update required scratch memory */
1599    ctx.program->config->scratch_bytes_per_wave += ctx.vgpr_spill_slots * 4 * ctx.program->wave_size;
1600 }
1601 
1602 } /* end namespace */
1603 
1604 void
spill(Program * program)1605 spill(Program* program)
1606 {
1607    program->config->spilled_vgprs = 0;
1608    program->config->spilled_sgprs = 0;
1609 
1610    program->progress = CompilationProgress::after_spilling;
1611 
1612    /* no spilling when register pressure is low enough */
1613    if (program->num_waves > 0)
1614       return;
1615 
1616    /* lower to CSSA before spilling to ensure correctness w.r.t. phis */
1617    lower_to_cssa(program);
1618 
1619    /* calculate target register demand */
1620    const RegisterDemand demand = program->max_reg_demand; /* current max */
1621    const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
1622    const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
1623    uint16_t extra_vgprs = 0;
1624    uint16_t extra_sgprs = 0;
1625 
1626    /* calculate extra VGPRs required for spilling SGPRs */
1627    if (demand.sgpr > sgpr_limit) {
1628       unsigned sgpr_spills = demand.sgpr - sgpr_limit;
1629       extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1;
1630    }
1631    /* add extra SGPRs required for spilling VGPRs */
1632    if (demand.vgpr + extra_vgprs > vgpr_limit) {
1633       if (program->gfx_level >= GFX9)
1634          extra_sgprs = 1; /* SADDR */
1635       else
1636          extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */
1637       if (demand.sgpr + extra_sgprs > sgpr_limit) {
1638          /* re-calculate in case something has changed */
1639          unsigned sgpr_spills = demand.sgpr + extra_sgprs - sgpr_limit;
1640          extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1;
1641       }
1642    }
1643    /* the spiller has to target the following register demand */
1644    const RegisterDemand target(vgpr_limit - extra_vgprs, sgpr_limit - extra_sgprs);
1645 
1646    /* initialize ctx */
1647    spill_ctx ctx(target, program);
1648    gather_ssa_use_info(ctx);
1649    get_rematerialize_info(ctx);
1650 
1651    /* create spills and reloads */
1652    for (unsigned i = 0; i < program->blocks.size(); i++)
1653       spill_block(ctx, i);
1654 
1655    /* assign spill slots and DCE rematerialized code */
1656    assign_spill_slots(ctx, extra_vgprs);
1657 
1658    /* update live variable information */
1659    live_var_analysis(program);
1660 
1661    assert(program->num_waves > 0);
1662 }
1663 
1664 } // namespace aco
1665