xref: /aosp_15_r20/external/mesa3d/src/amd/compiler/aco_lower_to_hw_instr.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "aco_builder.h"
8 #include "aco_ir.h"
9 
10 #include "common/sid.h"
11 
12 #include <map>
13 #include <vector>
14 
15 namespace aco {
16 
17 namespace {
18 
19 struct lower_context {
20    Program* program;
21    Block* block;
22    std::vector<aco_ptr<Instruction>> instructions;
23 };
24 
25 /* Class for obtaining where s_sendmsg(MSG_ORDERED_PS_DONE) must be done in a Primitive Ordered
26  * Pixel Shader on GFX9-10.3.
27  *
28  * MSG_ORDERED_PS_DONE must be sent once after the ordered section is done along all execution paths
29  * from the POPS packer ID hardware register setting to s_endpgm. It is, however, also okay to send
30  * it if the packer ID is not going to be set at all by the wave, so some conservativeness is fine.
31  *
32  * For simplicity, sending the message from top-level blocks as dominance and post-dominance
33  * checking for any location in the shader is trivial in them. Also, for simplicity, sending it
34  * regardless of whether the POPS packer ID hardware register has already potentially been set up.
35  *
36  * Note that there can be multiple interlock end instructions in the shader.
37  * SPV_EXT_fragment_shader_interlock requires OpEndInvocationInterlockEXT to be executed exactly
38  * once by the invocation. However, there may be, for instance, multiple ordered sections, and which
39  * one will be executed may depend on divergent control flow (some lanes may execute one ordered
40  * section, other lanes may execute another). MSG_ORDERED_PS_DONE, however, is sent via a scalar
41  * instruction, so it must be ensured that the message is sent after the last ordered section in the
42  * entire wave.
43  */
44 class gfx9_pops_done_msg_bounds {
45 public:
46    explicit gfx9_pops_done_msg_bounds() = default;
47 
gfx9_pops_done_msg_bounds(const Program * const program)48    explicit gfx9_pops_done_msg_bounds(const Program* const program)
49    {
50       /* Find the top-level location after the last ordered section end pseudo-instruction in the
51        * program.
52        * Consider `p_pops_gfx9_overlapped_wave_wait_done` a boundary too - make sure the message
53        * isn't sent if any wait hasn't been fully completed yet (if a begin-end-begin situation
54        * occurs somehow, as the location of `p_pops_gfx9_ordered_section_done` is controlled by the
55        * application) for safety, assuming that waits are the only thing that need the packer
56        * hardware register to be set at some point during or before them, and it won't be set
57        * anymore after the last wait.
58        */
59       int last_top_level_block_idx = -1;
60       for (int block_idx = (int)program->blocks.size() - 1; block_idx >= 0; block_idx--) {
61          const Block& block = program->blocks[block_idx];
62          if (block.kind & block_kind_top_level) {
63             last_top_level_block_idx = block_idx;
64          }
65          for (size_t instr_idx = block.instructions.size() - 1; instr_idx + size_t(1) > 0;
66               instr_idx--) {
67             const aco_opcode opcode = block.instructions[instr_idx]->opcode;
68             if (opcode == aco_opcode::p_pops_gfx9_ordered_section_done ||
69                 opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done) {
70                end_block_idx_ = last_top_level_block_idx;
71                /* The same block if it's already a top-level block, or the beginning of the next
72                 * top-level block.
73                 */
74                instr_after_end_idx_ = block_idx == end_block_idx_ ? instr_idx + 1 : 0;
75                break;
76             }
77          }
78          if (end_block_idx_ != -1) {
79             break;
80          }
81       }
82    }
83 
84    /* If this is not -1, during the normal execution flow (not early exiting), MSG_ORDERED_PS_DONE
85     * must be sent in this block.
86     */
end_block_idx() const87    int end_block_idx() const { return end_block_idx_; }
88 
89    /* If end_block_idx() is an existing block, during the normal execution flow (not early exiting),
90     * MSG_ORDERED_PS_DONE must be sent before this instruction in the block end_block_idx().
91     * If this is out of the bounds of the instructions in the end block, it must be sent in the end
92     * of that block.
93     */
instr_after_end_idx() const94    size_t instr_after_end_idx() const { return instr_after_end_idx_; }
95 
96    /* Whether an instruction doing early exit (such as discard) needs to send MSG_ORDERED_PS_DONE
97     * before actually ending the program.
98     */
early_exit_needs_done_msg(const int block_idx,const size_t instr_idx) const99    bool early_exit_needs_done_msg(const int block_idx, const size_t instr_idx) const
100    {
101       return block_idx <= end_block_idx_ &&
102              (block_idx != end_block_idx_ || instr_idx < instr_after_end_idx_);
103    }
104 
105 private:
106    /* Initialize to an empty range for which "is inside" comparisons will be failing for any
107     * block.
108     */
109    int end_block_idx_ = -1;
110    size_t instr_after_end_idx_ = 0;
111 };
112 
113 void
copy_constant_sgpr(Builder & bld,Definition dst,uint64_t constant)114 copy_constant_sgpr(Builder& bld, Definition dst, uint64_t constant)
115 {
116    if (dst.regClass() == s1) {
117       uint32_t imm = constant;
118       Operand op = Operand::get_const(bld.program->gfx_level, imm, 4);
119       if (op.isLiteral()) {
120          if (imm >= 0xffff8000 || imm <= 0x7fff) {
121             bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
122             return;
123          }
124 
125          Operand rev_op = Operand::get_const(bld.program->gfx_level, util_bitreverse(imm), 4);
126          if (!rev_op.isLiteral()) {
127             bld.sop1(aco_opcode::s_brev_b32, dst, rev_op);
128             return;
129          }
130 
131          unsigned start = (ffs(imm) - 1) & 0x1f;
132          unsigned size = util_bitcount(imm) & 0x1f;
133          if (BITFIELD_RANGE(start, size) == imm) {
134             bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));
135             return;
136          }
137 
138          if (bld.program->gfx_level >= GFX9) {
139             Operand op_lo = Operand::c32(int32_t(int16_t(imm)));
140             Operand op_hi = Operand::c32(int32_t(int16_t(imm >> 16)));
141             if (!op_lo.isLiteral() && !op_hi.isLiteral()) {
142                bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op_lo, op_hi);
143                return;
144             }
145          }
146       }
147 
148       bld.sop1(aco_opcode::s_mov_b32, dst, op);
149       return;
150    }
151 
152    assert(dst.regClass() == s2);
153 
154    bool can_use_mov = Operand::is_constant_representable(constant, 8, true, false);
155    if (can_use_mov && !Operand::c64(constant).isLiteral()) {
156       bld.sop1(aco_opcode::s_mov_b64, dst, Operand::c64(constant));
157       return;
158    }
159 
160    unsigned start = (ffsll(constant) - 1) & 0x3f;
161    unsigned size = util_bitcount64(constant) & 0x3f;
162    if (BITFIELD64_RANGE(start, size) == constant) {
163       bld.sop2(aco_opcode::s_bfm_b64, dst, Operand::c32(size), Operand::c32(start));
164       return;
165    }
166 
167    uint64_t rev = ((uint64_t)util_bitreverse(constant) << 32) | util_bitreverse(constant >> 32);
168    if (Operand::is_constant_representable(rev, 8, true, false)) {
169       bld.sop1(aco_opcode::s_brev_b64, dst, Operand::c64(rev));
170       return;
171    }
172 
173    if (can_use_mov) {
174       bld.sop1(aco_opcode::s_mov_b64, dst, Operand::c64(constant));
175       return;
176    }
177 
178    uint32_t derep = 0;
179    bool can_use_rep = bld.program->gfx_level >= GFX9;
180    for (unsigned i = 0; can_use_rep && i < 32; i++) {
181       uint32_t lo = (constant >> (i * 2)) & 0x1;
182       uint32_t hi = (constant >> ((i * 2) + 1)) & 0x1;
183       can_use_rep &= lo == hi;
184       derep |= lo << i;
185    }
186    if (can_use_rep) {
187       bld.sop1(aco_opcode::s_bitreplicate_b64_b32, dst, Operand::c32(derep));
188       return;
189    }
190 
191    copy_constant_sgpr(bld, Definition(dst.physReg(), s1), (uint32_t)constant);
192    copy_constant_sgpr(bld, Definition(dst.physReg().advance(4), s1), constant >> 32);
193 }
194 
195 /* used by handle_operands() indirectly through Builder::copy */
196 uint8_t int8_mul_table[512] = {
197    0, 20,  1,  1,   1,  2,   1,  3,   1,  4,   1, 5,   1,  6,   1,  7,   1,  8,   1,  9,
198    1, 10,  1,  11,  1,  12,  1,  13,  1,  14,  1, 15,  1,  16,  1,  17,  1,  18,  1,  19,
199    1, 20,  1,  21,  1,  22,  1,  23,  1,  24,  1, 25,  1,  26,  1,  27,  1,  28,  1,  29,
200    1, 30,  1,  31,  1,  32,  1,  33,  1,  34,  1, 35,  1,  36,  1,  37,  1,  38,  1,  39,
201    1, 40,  1,  41,  1,  42,  1,  43,  1,  44,  1, 45,  1,  46,  1,  47,  1,  48,  1,  49,
202    1, 50,  1,  51,  1,  52,  1,  53,  1,  54,  1, 55,  1,  56,  1,  57,  1,  58,  1,  59,
203    1, 60,  1,  61,  1,  62,  1,  63,  1,  64,  5, 13,  2,  33,  17, 19,  2,  34,  3,  23,
204    2, 35,  11, 53,  2,  36,  7,  47,  2,  37,  3, 25,  2,  38,  7,  11,  2,  39,  53, 243,
205    2, 40,  3,  27,  2,  41,  17, 35,  2,  42,  5, 17,  2,  43,  3,  29,  2,  44,  15, 23,
206    2, 45,  7,  13,  2,  46,  3,  31,  2,  47,  5, 19,  2,  48,  19, 59,  2,  49,  3,  33,
207    2, 50,  7,  51,  2,  51,  15, 41,  2,  52,  3, 35,  2,  53,  11, 33,  2,  54,  23, 27,
208    2, 55,  3,  37,  2,  56,  9,  41,  2,  57,  5, 23,  2,  58,  3,  39,  2,  59,  7,  17,
209    2, 60,  9,  241, 2,  61,  3,  41,  2,  62,  5, 25,  2,  63,  35, 245, 2,  64,  3,  43,
210    5, 26,  9,  43,  3,  44,  7,  19,  10, 39,  3, 45,  4,  34,  11, 59,  3,  46,  9,  243,
211    4, 35,  3,  47,  22, 53,  7,  57,  3,  48,  5, 29,  10, 245, 3,  49,  4,  37,  9,  45,
212    3, 50,  7,  241, 4,  38,  3,  51,  7,  22,  5, 31,  3,  52,  7,  59,  7,  242, 3,  53,
213    4, 40,  7,  23,  3,  54,  15, 45,  4,  41,  3, 55,  6,  241, 9,  47,  3,  56,  13, 13,
214    5, 34,  3,  57,  4,  43,  11, 39,  3,  58,  5, 35,  4,  44,  3,  59,  6,  243, 7,  245,
215    3, 60,  5,  241, 7,  26,  3,  61,  4,  46,  5, 37,  3,  62,  11, 17,  4,  47,  3,  63,
216    5, 38,  5,  243, 3,  64,  7,  247, 9,  50,  5, 39,  4,  241, 33, 37,  6,  33,  13, 35,
217    4, 242, 5,  245, 6,  247, 7,  29,  4,  51,  5, 41,  5,  246, 7,  249, 3,  240, 11, 19,
218    5, 42,  3,  241, 4,  245, 25, 29,  3,  242, 5, 43,  4,  246, 3,  243, 17, 58,  17, 43,
219    3, 244, 5,  249, 6,  37,  3,  245, 2,  240, 5, 45,  2,  241, 21, 23,  2,  242, 3,  247,
220    2, 243, 5,  251, 2,  244, 29, 61,  2,  245, 3, 249, 2,  246, 17, 29,  2,  247, 9,  55,
221    1, 240, 1,  241, 1,  242, 1,  243, 1,  244, 1, 245, 1,  246, 1,  247, 1,  248, 1,  249,
222    1, 250, 1,  251, 1,  252, 1,  253, 1,  254, 1, 255};
223 
224 aco_opcode
get_reduce_opcode(amd_gfx_level gfx_level,ReduceOp op)225 get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
226 {
227    /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
228     * 32-bit opcodes (VOP2) which allows to remove the temporary VGPR and to use
229     * DPP with the arithmetic instructions. This requires to sign-extend.
230     */
231    switch (op) {
232    case iadd8:
233    case iadd16:
234       if (gfx_level >= GFX10) {
235          return aco_opcode::v_add_u32;
236       } else if (gfx_level >= GFX8) {
237          return aco_opcode::v_add_u16;
238       } else {
239          return aco_opcode::v_add_co_u32;
240       }
241       break;
242    case imul8:
243    case imul16:
244       if (gfx_level >= GFX10) {
245          return aco_opcode::v_mul_lo_u16_e64;
246       } else if (gfx_level >= GFX8) {
247          return aco_opcode::v_mul_lo_u16;
248       } else {
249          return aco_opcode::v_mul_u32_u24;
250       }
251       break;
252    case fadd16: return aco_opcode::v_add_f16;
253    case fmul16: return aco_opcode::v_mul_f16;
254    case imax8:
255    case imax16:
256       if (gfx_level >= GFX10) {
257          return aco_opcode::v_max_i32;
258       } else if (gfx_level >= GFX8) {
259          return aco_opcode::v_max_i16;
260       } else {
261          return aco_opcode::v_max_i32;
262       }
263       break;
264    case imin8:
265    case imin16:
266       if (gfx_level >= GFX10) {
267          return aco_opcode::v_min_i32;
268       } else if (gfx_level >= GFX8) {
269          return aco_opcode::v_min_i16;
270       } else {
271          return aco_opcode::v_min_i32;
272       }
273       break;
274    case umin8:
275    case umin16:
276       if (gfx_level >= GFX10) {
277          return aco_opcode::v_min_u32;
278       } else if (gfx_level >= GFX8) {
279          return aco_opcode::v_min_u16;
280       } else {
281          return aco_opcode::v_min_u32;
282       }
283       break;
284    case umax8:
285    case umax16:
286       if (gfx_level >= GFX10) {
287          return aco_opcode::v_max_u32;
288       } else if (gfx_level >= GFX8) {
289          return aco_opcode::v_max_u16;
290       } else {
291          return aco_opcode::v_max_u32;
292       }
293       break;
294    case fmin16: return aco_opcode::v_min_f16;
295    case fmax16: return aco_opcode::v_max_f16;
296    case iadd32: return gfx_level >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
297    case imul32: return aco_opcode::v_mul_lo_u32;
298    case fadd32: return aco_opcode::v_add_f32;
299    case fmul32: return aco_opcode::v_mul_f32;
300    case imax32: return aco_opcode::v_max_i32;
301    case imin32: return aco_opcode::v_min_i32;
302    case umin32: return aco_opcode::v_min_u32;
303    case umax32: return aco_opcode::v_max_u32;
304    case fmin32: return aco_opcode::v_min_f32;
305    case fmax32: return aco_opcode::v_max_f32;
306    case iand8:
307    case iand16:
308    case iand32: return aco_opcode::v_and_b32;
309    case ixor8:
310    case ixor16:
311    case ixor32: return aco_opcode::v_xor_b32;
312    case ior8:
313    case ior16:
314    case ior32: return aco_opcode::v_or_b32;
315    case iadd64: return aco_opcode::num_opcodes;
316    case imul64: return aco_opcode::num_opcodes;
317    case fadd64: return aco_opcode::v_add_f64_e64;
318    case fmul64: return aco_opcode::v_mul_f64_e64;
319    case imin64: return aco_opcode::num_opcodes;
320    case imax64: return aco_opcode::num_opcodes;
321    case umin64: return aco_opcode::num_opcodes;
322    case umax64: return aco_opcode::num_opcodes;
323    case fmin64: return aco_opcode::v_min_f64_e64;
324    case fmax64: return aco_opcode::v_max_f64_e64;
325    case iand64: return aco_opcode::num_opcodes;
326    case ior64: return aco_opcode::num_opcodes;
327    case ixor64: return aco_opcode::num_opcodes;
328    default: return aco_opcode::num_opcodes;
329    }
330 }
331 
332 bool
is_vop3_reduce_opcode(aco_opcode opcode)333 is_vop3_reduce_opcode(aco_opcode opcode)
334 {
335    /* 64-bit reductions are VOP3. */
336    if (opcode == aco_opcode::num_opcodes)
337       return true;
338 
339    return instr_info.format[(int)opcode] == Format::VOP3;
340 }
341 
342 void
emit_vadd32(Builder & bld,Definition def,Operand src0,Operand src1)343 emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
344 {
345    Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
346    if (instr->definitions.size() >= 2) {
347       assert(instr->definitions[1].regClass() == bld.lm);
348       instr->definitions[1].setFixed(vcc);
349    }
350 }
351 
352 void
emit_int64_dpp_op(lower_context * ctx,PhysReg dst_reg,PhysReg src0_reg,PhysReg src1_reg,PhysReg vtmp_reg,ReduceOp op,unsigned dpp_ctrl,unsigned row_mask,unsigned bank_mask,bool bound_ctrl,Operand * identity=NULL)353 emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
354                   PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,
355                   unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)
356 {
357    Builder bld(ctx->program, &ctx->instructions);
358    Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
359    Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};
360    Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};
361    Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
362    Operand src1_64 = Operand(src1_reg, v2);
363    Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};
364    Operand vtmp_op64 = Operand(vtmp_reg, v2);
365    if (op == iadd64) {
366       if (ctx->program->gfx_level >= GFX10) {
367          if (identity)
368             bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
369          bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
370                       bound_ctrl);
371          bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
372       } else {
373          bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
374                       dpp_ctrl, row_mask, bank_mask, bound_ctrl);
375       }
376       bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
377                    Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
378    } else if (op == iand64) {
379       bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
380                    bound_ctrl);
381       bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
382                    bound_ctrl);
383    } else if (op == ior64) {
384       bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
385                    bound_ctrl);
386       bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
387                    bound_ctrl);
388    } else if (op == ixor64) {
389       bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
390                    bound_ctrl);
391       bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
392                    bound_ctrl);
393    } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
394       aco_opcode cmp = aco_opcode::num_opcodes;
395       switch (op) {
396       case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
397       case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
398       case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
399       case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
400       default: break;
401       }
402 
403       if (identity) {
404          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
405          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
406       }
407       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
408                    bound_ctrl);
409       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,
410                    bound_ctrl);
411 
412       bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
413       bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
414       bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
415    } else if (op == imul64) {
416       /* t4 = dpp(x_hi)
417        * t1 = umul_lo(t4, y_lo)
418        * t3 = dpp(x_lo)
419        * t0 = umul_lo(t3, y_hi)
420        * t2 = iadd(t0, t1)
421        * t5 = umul_hi(t3, y_lo)
422        * res_hi = iadd(t2, t5)
423        * res_lo = umul_lo(t3, y_lo)
424        * Requires that res_hi != src0[0] and res_hi != src1[0]
425        * and that vtmp[0] != res_hi.
426        */
427       if (identity)
428          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
429       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,
430                    bound_ctrl);
431       bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
432       if (identity)
433          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
434       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
435                    bound_ctrl);
436       bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
437       emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
438       if (identity)
439          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
440       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
441                    bound_ctrl);
442       bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
443       emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
444       if (identity)
445          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
446       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
447                    bound_ctrl);
448       bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
449    }
450 }
451 
452 void
emit_int64_op(lower_context * ctx,PhysReg dst_reg,PhysReg src0_reg,PhysReg src1_reg,PhysReg vtmp,ReduceOp op)453 emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
454               ReduceOp op)
455 {
456    Builder bld(ctx->program, &ctx->instructions);
457    Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
458    RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
459    Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};
460    Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
461    Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
462    Operand src1_64 = Operand(src1_reg, v2);
463 
464    if (src0_rc == s1 &&
465        (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
466       assert(vtmp.reg() != 0);
467       bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
468       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
469       src0_reg = vtmp;
470       src0[0] = Operand(vtmp, v1);
471       src0[1] = Operand(PhysReg{vtmp + 1}, v1);
472       src0_64 = Operand(vtmp, v2);
473    } else if (src0_rc == s1 && op == iadd64) {
474       assert(vtmp.reg() != 0);
475       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
476       src0[1] = Operand(PhysReg{vtmp + 1}, v1);
477    }
478 
479    if (op == iadd64) {
480       if (ctx->program->gfx_level >= GFX10) {
481          bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
482       } else {
483          bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
484       }
485       bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
486                Operand(vcc, bld.lm));
487    } else if (op == iand64) {
488       bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
489       bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
490    } else if (op == ior64) {
491       bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
492       bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
493    } else if (op == ixor64) {
494       bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
495       bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
496    } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
497       aco_opcode cmp = aco_opcode::num_opcodes;
498       switch (op) {
499       case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
500       case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
501       case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
502       case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
503       default: break;
504       }
505 
506       bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
507       bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
508       bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
509    } else if (op == imul64) {
510       if (src1_reg == dst_reg) {
511          /* it's fine if src0==dst but not if src1==dst */
512          std::swap(src0_reg, src1_reg);
513          std::swap(src0[0], src1[0]);
514          std::swap(src0[1], src1[1]);
515          std::swap(src0_64, src1_64);
516       }
517       assert(!(src0_reg == src1_reg));
518       /* t1 = umul_lo(x_hi, y_lo)
519        * t0 = umul_lo(x_lo, y_hi)
520        * t2 = iadd(t0, t1)
521        * t5 = umul_hi(x_lo, y_lo)
522        * res_hi = iadd(t2, t5)
523        * res_lo = umul_lo(x_lo, y_lo)
524        * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
525        */
526       Definition tmp0_def(PhysReg{src0_reg + 1}, v1);
527       Definition tmp1_def(PhysReg{src1_reg + 1}, v1);
528       Operand tmp0_op = src0[1];
529       Operand tmp1_op = src1[1];
530       bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
531       bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
532       emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
533       bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
534       emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
535       bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
536    }
537 }
538 
539 void
emit_dpp_op(lower_context * ctx,PhysReg dst_reg,PhysReg src0_reg,PhysReg src1_reg,PhysReg vtmp,ReduceOp op,unsigned size,unsigned dpp_ctrl,unsigned row_mask,unsigned bank_mask,bool bound_ctrl,Operand * identity=NULL)540 emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
541             ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,
542             bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */
543 {
544    Builder bld(ctx->program, &ctx->instructions);
545    RegClass rc = RegClass(RegType::vgpr, size);
546    Definition dst(dst_reg, rc);
547    Operand src0(src0_reg, rc);
548    Operand src1(src1_reg, rc);
549 
550    aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
551    bool vop3 = is_vop3_reduce_opcode(opcode);
552 
553    if (!vop3) {
554       if (opcode == aco_opcode::v_add_co_u32)
555          bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask,
556                       bound_ctrl);
557       else
558          bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
559       return;
560    }
561 
562    if (opcode == aco_opcode::num_opcodes) {
563       emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,
564                         bound_ctrl, identity);
565       return;
566    }
567 
568    if (identity)
569       bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
570    if (identity && size >= 2)
571       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);
572 
573    for (unsigned i = 0; i < size; i++)
574       bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
575                    Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
576 
577    bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
578 }
579 
580 void
emit_op(lower_context * ctx,PhysReg dst_reg,PhysReg src0_reg,PhysReg src1_reg,PhysReg vtmp,ReduceOp op,unsigned size)581 emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
582         ReduceOp op, unsigned size)
583 {
584    Builder bld(ctx->program, &ctx->instructions);
585    RegClass rc = RegClass(RegType::vgpr, size);
586    Definition dst(dst_reg, rc);
587    Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
588    Operand src1(src1_reg, rc);
589 
590    aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
591    bool vop3 = is_vop3_reduce_opcode(opcode);
592 
593    if (opcode == aco_opcode::num_opcodes) {
594       emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
595       return;
596    }
597 
598    if (vop3) {
599       bld.vop3(opcode, dst, src0, src1);
600    } else if (opcode == aco_opcode::v_add_co_u32) {
601       bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
602    } else {
603       bld.vop2(opcode, dst, src0, src1);
604    }
605 }
606 
607 void
emit_dpp_mov(lower_context * ctx,PhysReg dst,PhysReg src0,unsigned size,unsigned dpp_ctrl,unsigned row_mask,unsigned bank_mask,bool bound_ctrl)608 emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,
609              unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
610 {
611    Builder bld(ctx->program, &ctx->instructions);
612    for (unsigned i = 0; i < size; i++) {
613       bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),
614                    Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
615    }
616 }
617 
618 void
emit_ds_swizzle(Builder bld,PhysReg dst,PhysReg src,unsigned size,unsigned ds_pattern)619 emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
620 {
621    for (unsigned i = 0; i < size; i++) {
622       bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),
623              Operand(PhysReg{src + i}, v1), ds_pattern);
624    }
625 }
626 
627 void
emit_reduction(lower_context * ctx,aco_opcode op,ReduceOp reduce_op,unsigned cluster_size,PhysReg tmp,PhysReg stmp,PhysReg vtmp,PhysReg sitmp,Operand src,Definition dst)628 emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,
629                PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
630 {
631    assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
632    assert(cluster_size <= ctx->program->wave_size);
633 
634    Builder bld(ctx->program, &ctx->instructions);
635 
636    Operand identity[2];
637    identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));
638    identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));
639    Operand vcndmask_identity[2] = {identity[0], identity[1]};
640 
641    /* First, copy the source to tmp and set inactive lanes to the identity */
642    bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),
643             Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));
644 
645    /* On GFX10+ v_writelane_b32/v_cndmask_b32_e64 can take a literal */
646    if (ctx->program->gfx_level < GFX10) {
647       for (unsigned i = 0; i < src.size(); i++) {
648          /* p_exclusive_scan uses identity for v_writelane_b32 */
649          if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
650             bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);
651             identity[i] = Operand(PhysReg{sitmp + i}, s1);
652 
653             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
654             vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
655          } else if (identity[i].isLiteral()) {
656             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
657             vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
658          }
659       }
660    }
661 
662    for (unsigned i = 0; i < src.size(); i++) {
663       bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
664                    vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
665                    Operand(stmp, bld.lm));
666    }
667 
668    if (reduce_op == iadd8 || reduce_op == imul8 || reduce_op == imax8 || reduce_op == imin8 ||
669        reduce_op == umin8 || reduce_op == umax8 || reduce_op == ixor8 || reduce_op == ior8 ||
670        reduce_op == iand8) {
671       if (ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11) {
672          aco_ptr<Instruction> sdwa{
673             create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
674          sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
675          sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
676          bool sext = reduce_op == imin8 || reduce_op == imax8;
677          sdwa->sdwa().sel[0] = SubdwordSel(1, 0, sext);
678          sdwa->sdwa().dst_sel = SubdwordSel::dword;
679          bld.insert(std::move(sdwa));
680       } else {
681          aco_opcode opcode;
682 
683          if (reduce_op == imin8 || reduce_op == imax8)
684             opcode = aco_opcode::v_bfe_i32;
685          else
686             opcode = aco_opcode::v_bfe_u32;
687 
688          bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
689                   Operand::c32(8u));
690       }
691    } else if (reduce_op == iadd16 || reduce_op == imul16 || reduce_op == imax16 ||
692               reduce_op == imin16 || reduce_op == umin16 || reduce_op == umax16 ||
693               reduce_op == ixor16 || reduce_op == ior16 || reduce_op == iand16 ||
694               reduce_op == fadd16 || reduce_op == fmul16 || reduce_op == fmin16 ||
695               reduce_op == fmax16) {
696       bool is_add_cmp = reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
697                         reduce_op == umin16 || reduce_op == umax16;
698       if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level < GFX11 && is_add_cmp) {
699          aco_ptr<Instruction> sdwa{
700             create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
701          sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
702          sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
703          bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
704          sdwa->sdwa().sel[0] = SubdwordSel(2, 0, sext);
705          sdwa->sdwa().dst_sel = SubdwordSel::dword;
706          bld.insert(std::move(sdwa));
707       } else if (ctx->program->gfx_level <= GFX7 ||
708                  (ctx->program->gfx_level >= GFX11 && is_add_cmp)) {
709          aco_opcode opcode;
710 
711          if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
712             opcode = aco_opcode::v_bfe_i32;
713          else
714             opcode = aco_opcode::v_bfe_u32;
715 
716          bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
717                   Operand::c32(16u));
718       }
719    }
720 
721    bool reduction_needs_last_op = false;
722    switch (op) {
723    case aco_opcode::p_reduce:
724       if (cluster_size == 1)
725          break;
726 
727       if (ctx->program->gfx_level <= GFX7) {
728          reduction_needs_last_op = true;
729          emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
730          if (cluster_size == 2)
731             break;
732          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
733          emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
734          if (cluster_size == 4)
735             break;
736          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
737          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
738          if (cluster_size == 8)
739             break;
740          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
741          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
742          if (cluster_size == 16)
743             break;
744          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
745          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
746          if (cluster_size == 32)
747             break;
748          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
749          for (unsigned i = 0; i < src.size(); i++)
750             bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
751                          Operand::zero());
752          // TODO: it would be more effective to do the last reduction step on SALU
753          emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
754          reduction_needs_last_op = false;
755          break;
756       }
757 
758       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,
759                   0xf, false);
760       if (cluster_size == 2)
761          break;
762       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,
763                   0xf, false);
764       if (cluster_size == 4)
765          break;
766       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,
767                   false);
768       if (cluster_size == 8)
769          break;
770       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
771       if (cluster_size == 16)
772          break;
773 
774       if (ctx->program->gfx_level >= GFX10) {
775          /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
776          for (unsigned i = 0; i < src.size(); i++)
777             bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
778                      Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());
779 
780          if (cluster_size == 32) {
781             reduction_needs_last_op = true;
782             break;
783          }
784 
785          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
786          for (unsigned i = 0; i < src.size(); i++)
787             bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
788                          Operand::zero());
789          // TODO: it would be more effective to do the last reduction step on SALU
790          emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
791          break;
792       }
793 
794       if (cluster_size == 32) {
795          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
796          reduction_needs_last_op = true;
797          break;
798       }
799       assert(cluster_size == 64);
800       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
801                   false);
802       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
803                   false);
804       break;
805    case aco_opcode::p_exclusive_scan:
806       if (ctx->program->gfx_level >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
807          /* shift rows right */
808          emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);
809 
810          /* fill in the gaps in rows 1 and 3 */
811          copy_constant_sgpr(bld, Definition(exec, bld.lm), 0x0001'0000'0001'0000ull);
812          for (unsigned i = 0; i < src.size(); i++) {
813             Instruction* perm =
814                bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
815                         Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
816                         Operand::c32(0xffffffffu))
817                   .instr;
818             perm->valu().opsel = 1; /* FI (Fetch Inactive) */
819          }
820          copy_constant_sgpr(bld, Definition(exec, bld.lm), UINT64_MAX);
821 
822          if (ctx->program->wave_size == 64) {
823             /* fill in the gap in row 2 */
824             for (unsigned i = 0; i < src.size(); i++) {
825                bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
826                             Operand::c32(31u));
827                bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
828                              Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
829             }
830          }
831          std::swap(tmp, vtmp);
832       } else if (ctx->program->gfx_level >= GFX8) {
833          emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
834       } else {
835          // TODO: use LDS on CS with a single write and shifted read
836          /* wavefront shift_right by 1 on SI/CI */
837          emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
838          emit_ds_swizzle(bld, tmp, tmp, src.size(),
839                          ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
840          copy_constant_sgpr(bld, Definition(exec, s2), 0x1010'1010'1010'1010ull);
841          for (unsigned i = 0; i < src.size(); i++)
842             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
843                      Operand(PhysReg{tmp + i}, v1));
844 
845          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
846          emit_ds_swizzle(bld, tmp, tmp, src.size(),
847                          ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
848          copy_constant_sgpr(bld, Definition(exec, s2), 0x0100'0100'0100'0100ull);
849          for (unsigned i = 0; i < src.size(); i++)
850             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
851                      Operand(PhysReg{tmp + i}, v1));
852 
853          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
854          emit_ds_swizzle(bld, tmp, tmp, src.size(),
855                          ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
856          copy_constant_sgpr(bld, Definition(exec, s2), 0x0001'0000'0001'0000ull);
857          for (unsigned i = 0; i < src.size(); i++)
858             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
859                      Operand(PhysReg{tmp + i}, v1));
860 
861          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
862          for (unsigned i = 0; i < src.size(); i++) {
863             bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),
864                           Operand(PhysReg{vtmp + i}, v1));
865             bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
866                          Operand::zero());
867             bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
868                           Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
869             identity[i] = Operand::zero(); /* prevent further uses of identity */
870          }
871          std::swap(tmp, vtmp);
872       }
873 
874       for (unsigned i = 0; i < src.size(); i++) {
875          if (!identity[i].isConstant() ||
876              identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
877             if (ctx->program->gfx_level < GFX10)
878                assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
879                       identity[i].physReg() == PhysReg{sitmp + i});
880             bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),
881                           Operand(PhysReg{tmp + i}, v1));
882          }
883       }
884       FALLTHROUGH;
885    case aco_opcode::p_inclusive_scan:
886       assert(cluster_size == ctx->program->wave_size);
887       if (ctx->program->gfx_level <= GFX7) {
888          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
889          copy_constant_sgpr(bld, Definition(exec, s2), 0xaaaa'aaaa'aaaa'aaaaull);
890          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
891 
892          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
893          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
894          copy_constant_sgpr(bld, Definition(exec, s2), 0xcccc'cccc'cccc'ccccull);
895          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
896 
897          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
898          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
899          copy_constant_sgpr(bld, Definition(exec, s2), 0xf0f0'f0f0'f0f0'f0f0ull);
900          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
901 
902          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
903          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
904          copy_constant_sgpr(bld, Definition(exec, s2), 0xff00'ff00'ff00'ff00ull);
905          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
906 
907          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
908          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
909          copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'0000'ffff'0000ull);
910          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
911 
912          for (unsigned i = 0; i < src.size(); i++)
913             bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
914                          Operand::c32(31u));
915          copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
916          emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
917          break;
918       }
919 
920       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,
921                   identity);
922       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,
923                   identity);
924       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,
925                   identity);
926       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,
927                   identity);
928       if (ctx->program->gfx_level >= GFX10) {
929          copy_constant_sgpr(bld, Definition(exec, bld.lm), 0xffff'0000'ffff'0000ull);
930          for (unsigned i = 0; i < src.size(); i++) {
931             Instruction* perm =
932                bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
933                         Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
934                         Operand::c32(0xffffffffu))
935                   .instr;
936             perm->valu().opsel = 1; /* FI (Fetch Inactive) */
937          }
938          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
939 
940          if (ctx->program->wave_size == 64) {
941             copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
942             for (unsigned i = 0; i < src.size(); i++)
943                bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
944                             Operand::c32(31u));
945             emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
946          }
947       } else {
948          emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
949                      false, identity);
950          emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
951                      false, identity);
952       }
953       break;
954    default: unreachable("Invalid reduction mode");
955    }
956 
957    if (op == aco_opcode::p_reduce) {
958       if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
959          bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
960          emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
961          return;
962       }
963 
964       if (reduction_needs_last_op)
965          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
966    }
967 
968    /* restore exec */
969    bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
970 
971    if (dst.regClass().type() == RegType::sgpr) {
972       for (unsigned k = 0; k < src.size(); k++) {
973          bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),
974                       Operand::c32(ctx->program->wave_size - 1));
975       }
976    } else if (dst.physReg() != tmp) {
977       for (unsigned k = 0; k < src.size(); k++) {
978          bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
979                   Operand(PhysReg{tmp + k}, v1));
980       }
981    }
982 }
983 
984 void
adjust_bpermute_dst(Builder & bld,Definition dst,Operand input_data)985 adjust_bpermute_dst(Builder& bld, Definition dst, Operand input_data)
986 {
987    /* RA assumes that the result is always in the low part of the register, so we have to shift,
988     * if it's not there already.
989     */
990    if (input_data.physReg().byte()) {
991       unsigned right_shift = input_data.physReg().byte() * 8;
992       bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
993                Operand(dst.physReg(), dst.regClass()));
994    }
995 }
996 
997 void
emit_bpermute_permlane(Builder & bld,aco_ptr<Instruction> & instr)998 emit_bpermute_permlane(Builder& bld, aco_ptr<Instruction>& instr)
999 {
1000    /* Emulates proper bpermute on GFX11 in wave64 mode.
1001     *
1002     * Similar to emit_gfx10_wave64_bpermute, but uses the new
1003     * v_permlane64_b32 instruction to swap data between lo and hi halves.
1004     */
1005 
1006    assert(bld.program->gfx_level >= GFX11);
1007    assert(bld.program->wave_size == 64);
1008 
1009    Definition dst = instr->definitions[0];
1010    Definition tmp_exec = instr->definitions[1];
1011    Definition clobber_scc = instr->definitions[2];
1012    Operand tmp_op = instr->operands[0];
1013    Operand index_x4 = instr->operands[1];
1014    Operand input_data = instr->operands[2];
1015    Operand same_half = instr->operands[3];
1016 
1017    assert(dst.regClass() == v1);
1018    assert(tmp_exec.regClass() == bld.lm);
1019    assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
1020    assert(same_half.regClass() == bld.lm);
1021    assert(tmp_op.regClass() == v1.as_linear());
1022    assert(index_x4.regClass() == v1);
1023    assert(input_data.regClass().type() == RegType::vgpr);
1024    assert(input_data.bytes() <= 4);
1025 
1026    Definition tmp_def(tmp_op.physReg(), tmp_op.regClass());
1027 
1028    /* Permute the input within the same half-wave. */
1029    bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
1030 
1031    /* Save EXEC and enable all lanes. */
1032    bld.sop1(aco_opcode::s_or_saveexec_b64, tmp_exec, clobber_scc, Definition(exec, s2),
1033             Operand::c32(-1u), Operand(exec, s2));
1034 
1035    /* Copy input data from other half to current half's linear VGPR. */
1036    bld.vop1(aco_opcode::v_permlane64_b32, tmp_def, input_data);
1037 
1038    /* Permute the input from the other half-wave, write to linear VGPR. */
1039    bld.ds(aco_opcode::ds_bpermute_b32, tmp_def, index_x4, tmp_op);
1040 
1041    /* Restore saved EXEC. */
1042    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
1043 
1044    /* Select correct permute result. */
1045    bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, tmp_op, Operand(dst.physReg(), dst.regClass()),
1046                 same_half);
1047 
1048    adjust_bpermute_dst(bld, dst, input_data);
1049 }
1050 
1051 void
emit_bpermute_shared_vgpr(Builder & bld,aco_ptr<Instruction> & instr)1052 emit_bpermute_shared_vgpr(Builder& bld, aco_ptr<Instruction>& instr)
1053 {
1054    /* Emulates proper bpermute on GFX10 in wave64 mode.
1055     *
1056     * This is necessary because on GFX10 the bpermute instruction only works
1057     * on half waves (you can think of it as having a cluster size of 32), so we
1058     * manually swap the data between the two halves using two shared VGPRs.
1059     */
1060 
1061    assert(bld.program->gfx_level >= GFX10 && bld.program->gfx_level <= GFX10_3);
1062    assert(bld.program->wave_size == 64);
1063 
1064    unsigned shared_vgpr_reg_0 = align(bld.program->config->num_vgprs, 4) + 256;
1065    Definition dst = instr->definitions[0];
1066    Definition tmp_exec = instr->definitions[1];
1067    Definition clobber_scc = instr->definitions[2];
1068    Operand index_x4 = instr->operands[0];
1069    Operand input_data = instr->operands[1];
1070    Operand same_half = instr->operands[2];
1071 
1072    assert(dst.regClass() == v1);
1073    assert(tmp_exec.regClass() == bld.lm);
1074    assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
1075    assert(same_half.regClass() == bld.lm);
1076    assert(index_x4.regClass() == v1);
1077    assert(input_data.regClass().type() == RegType::vgpr);
1078    assert(input_data.bytes() <= 4);
1079    assert(dst.physReg() != index_x4.physReg());
1080    assert(dst.physReg() != input_data.physReg());
1081    assert(tmp_exec.physReg() != same_half.physReg());
1082 
1083    PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
1084    PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);
1085 
1086    /* Permute the input within the same half-wave */
1087    bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
1088 
1089    /* HI: Copy data from high lanes 32-63 to shared vgpr */
1090    bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,
1091                 dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
1092    /* Save EXEC */
1093    bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
1094    /* Set EXEC to enable LO lanes only */
1095    copy_constant_sgpr(bld, Definition(exec, s2), 0x0000'0000'ffff'ffffull);
1096    /* LO: Copy data from low lanes 0-31 to shared vgpr */
1097    bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
1098    /* LO: bpermute shared vgpr (high lanes' data) */
1099    bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,
1100           Operand(shared_vgpr_hi, v1));
1101    /* Set EXEC to enable HI lanes only */
1102    copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
1103    /* HI: bpermute shared vgpr (low lanes' data) */
1104    bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,
1105           Operand(shared_vgpr_lo, v1));
1106 
1107    /* Only enable lanes which use the other half's data */
1108    bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,
1109             Operand(tmp_exec.physReg(), s2), same_half);
1110    /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
1111    bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),
1112                 0x3, 0xf, false);
1113    /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
1114    bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),
1115                 0xc, 0xf, false);
1116 
1117    /* Restore saved EXEC */
1118    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
1119 
1120    adjust_bpermute_dst(bld, dst, input_data);
1121 }
1122 
1123 void
emit_bpermute_readlane(Builder & bld,aco_ptr<Instruction> & instr)1124 emit_bpermute_readlane(Builder& bld, aco_ptr<Instruction>& instr)
1125 {
1126    /* Emulates bpermute using readlane instructions */
1127 
1128    Operand index = instr->operands[0];
1129    Operand input = instr->operands[1];
1130    Definition dst = instr->definitions[0];
1131    Definition temp_exec = instr->definitions[1];
1132    Definition clobber_vcc = instr->definitions[2];
1133 
1134    assert(dst.regClass() == v1);
1135    assert(temp_exec.regClass() == bld.lm);
1136    assert(clobber_vcc.regClass() == bld.lm);
1137    assert(clobber_vcc.physReg() == vcc);
1138    assert(index.regClass() == v1);
1139    assert(index.physReg() != dst.physReg());
1140    assert(input.regClass().type() == RegType::vgpr);
1141    assert(input.bytes() <= 4);
1142    assert(input.physReg() != dst.physReg());
1143 
1144    /* Save original EXEC */
1145    bld.sop1(Builder::s_mov, temp_exec, Operand(exec, bld.lm));
1146 
1147    /* An "unrolled loop" that is executed per each lane.
1148     * This takes only a few instructions per lane, as opposed to a "real" loop
1149     * with branching, where the branch instruction alone would take 16+ cycles.
1150     */
1151    for (unsigned n = 0; n < bld.program->wave_size; ++n) {
1152       /* Activate the lane which has N for its source index */
1153       if (bld.program->gfx_level >= GFX10)
1154          bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), Operand::c32(n), index);
1155       else
1156          bld.vopc(aco_opcode::v_cmpx_eq_u32, clobber_vcc, Definition(exec, bld.lm), Operand::c32(n),
1157                   index);
1158       /* Read the data from lane N */
1159       bld.readlane(Definition(vcc, s1), input, Operand::c32(n));
1160       /* On the active lane, move the data we read from lane N to the destination VGPR */
1161       bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
1162       /* Restore original EXEC */
1163       bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(temp_exec.physReg(), bld.lm));
1164    }
1165 
1166    adjust_bpermute_dst(bld, dst, input);
1167 }
1168 
1169 struct copy_operation {
1170    Operand op;
1171    Definition def;
1172    unsigned bytes;
1173    union {
1174       uint8_t uses[8];
1175       uint64_t is_used = 0;
1176    };
1177 };
1178 
1179 void
split_copy(lower_context * ctx,unsigned offset,Definition * def,Operand * op,const copy_operation & src,bool ignore_uses,unsigned max_size)1180 split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
1181            const copy_operation& src, bool ignore_uses, unsigned max_size)
1182 {
1183    PhysReg def_reg = src.def.physReg();
1184    PhysReg op_reg = src.op.physReg();
1185    def_reg.reg_b += offset;
1186    op_reg.reg_b += offset;
1187 
1188    /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10, and on GFX11
1189     * v_lshrrev_b64 doesn't get dual issued. */
1190    if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) &&
1191        src.def.regClass().type() == RegType::vgpr)
1192       max_size = MIN2(max_size, 4);
1193    unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
1194 
1195    /* make sure the size is a power of two and reg % bytes == 0 */
1196    unsigned bytes = 1;
1197    for (; bytes <= max_size; bytes *= 2) {
1198       unsigned next = bytes * 2u;
1199       bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
1200                           offset + next <= src.bytes && next <= max_size;
1201       if (!src.op.isConstant() && can_increase)
1202          can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
1203       for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
1204          can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
1205       if (!can_increase)
1206          break;
1207    }
1208 
1209    *def = Definition(src.def.tempId(), def_reg, src.def.regClass().resize(bytes));
1210    if (src.op.isConstant()) {
1211       assert(bytes >= 1 && bytes <= 8);
1212       uint64_t val = src.op.constantValue64() >> (offset * 8u);
1213       *op = Operand::get_const(ctx->program->gfx_level, val, bytes);
1214    } else {
1215       RegClass op_cls = src.op.regClass().resize(bytes);
1216       *op = Operand(op_reg, op_cls);
1217       op->setTemp(Temp(src.op.tempId(), op_cls));
1218    }
1219 }
1220 
1221 uint32_t
get_intersection_mask(int a_start,int a_size,int b_start,int b_size)1222 get_intersection_mask(int a_start, int a_size, int b_start, int b_size)
1223 {
1224    int intersection_start = MAX2(b_start - a_start, 0);
1225    int intersection_end = MAX2(b_start + b_size - a_start, 0);
1226    if (intersection_start >= a_size || intersection_end == 0)
1227       return 0;
1228 
1229    uint32_t mask = u_bit_consecutive(0, a_size);
1230    return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
1231 }
1232 
1233 /* src1 are bytes 0-3. dst/src0 are bytes 4-7. */
1234 void
create_bperm(Builder & bld,uint8_t swiz[4],Definition dst,Operand src1,Operand src0=Operand (v1))1235 create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1,
1236              Operand src0 = Operand(v1))
1237 {
1238    uint32_t swiz_packed =
1239       swiz[0] | ((uint32_t)swiz[1] << 8) | ((uint32_t)swiz[2] << 16) | ((uint32_t)swiz[3] << 24);
1240 
1241    dst = Definition(PhysReg(dst.physReg().reg()), v1);
1242    if (!src1.isConstant())
1243       src1 = Operand(PhysReg(src1.physReg().reg()), v1);
1244    if (src0.isUndefined())
1245       src0 = Operand(dst.physReg(), v1);
1246    else if (!src0.isConstant())
1247       src0 = Operand(PhysReg(src0.physReg().reg()), v1);
1248    bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed));
1249 }
1250 
1251 void
emit_v_mov_b16(Builder & bld,Definition dst,Operand op)1252 emit_v_mov_b16(Builder& bld, Definition dst, Operand op)
1253 {
1254    /* v_mov_b16 uses 32bit inline constants. */
1255    if (op.isConstant()) {
1256       if (!op.isLiteral() && op.physReg() >= 240) {
1257          /* v_add_f16 is smaller because it can use 16bit fp inline constants. */
1258          Instruction* instr = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero());
1259          instr->valu().opsel[3] = dst.physReg().byte() == 2;
1260          return;
1261       }
1262       op = Operand::c32((int32_t)(int16_t)op.constantValue());
1263    }
1264 
1265    Instruction* instr = bld.vop1(aco_opcode::v_mov_b16, dst, op);
1266    instr->valu().opsel[0] = op.physReg().byte() == 2;
1267    instr->valu().opsel[3] = dst.physReg().byte() == 2;
1268 }
1269 
1270 void
copy_constant(lower_context * ctx,Builder & bld,Definition dst,Operand op)1271 copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
1272 {
1273    assert(op.bytes() == dst.bytes());
1274 
1275    if (dst.regClass().type() == RegType::sgpr)
1276       return copy_constant_sgpr(bld, dst, op.constantValue64());
1277 
1278    bool dual_issue_mov = ctx->program->gfx_level >= GFX11 && ctx->program->wave_size == 64 &&
1279                          ctx->program->workgroup_size > 32;
1280    if (dst.bytes() == 4 && op.isLiteral() && !dual_issue_mov) {
1281       uint32_t imm = op.constantValue();
1282       Operand rev_op = Operand::get_const(ctx->program->gfx_level, util_bitreverse(imm), 4);
1283       if (!rev_op.isLiteral()) {
1284          bld.vop1(aco_opcode::v_bfrev_b32, dst, rev_op);
1285          return;
1286       }
1287    }
1288 
1289    if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->gfx_level >= GFX8)
1290       op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
1291 
1292    if (dst.regClass() == v2) {
1293       if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
1294          bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);
1295       } else {
1296          assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
1297          bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);
1298       }
1299    } else if (dst.regClass() == v1) {
1300       bld.vop1(aco_opcode::v_mov_b32, dst, op);
1301    } else {
1302       assert(dst.regClass() == v1b || dst.regClass() == v2b);
1303 
1304       bool use_sdwa = ctx->program->gfx_level >= GFX9 && ctx->program->gfx_level < GFX11;
1305       if (dst.regClass() == v1b && use_sdwa) {
1306          uint8_t val = op.constantValue();
1307          Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
1308          if (op32.isLiteral()) {
1309             uint32_t a = (uint32_t)int8_mul_table[val * 2];
1310             uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
1311             bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
1312                           Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
1313                           Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
1314          } else {
1315             bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
1316          }
1317       } else if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX10) {
1318          Operand fop = Operand::c32(fui(float(op.constantValue())));
1319          Operand offset = Operand::c32(dst.physReg().byte());
1320          Operand def_op(PhysReg(dst.physReg().reg()), v1);
1321          bld.vop3(aco_opcode::v_cvt_pk_u8_f32, dst, fop, offset, def_op);
1322       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1323          emit_v_mov_b16(bld, dst, op);
1324       } else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) {
1325          if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
1326             /* use v_mov_b32 to avoid possible issues with denormal flushing or
1327              * NaN. v_add_f16 is still needed for float constants. */
1328             uint32_t val32 = (int32_t)(int16_t)op.constantValue();
1329             bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));
1330          } else {
1331             bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());
1332          }
1333       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1334          op = Operand::c32(op.constantValue());
1335          Instruction* instr = bld.vop3(aco_opcode::v_add_u16_e64, dst, op, Operand::c32(0));
1336          instr->valu().opsel[3] = dst.physReg().byte() == 2;
1337       } else {
1338          uint32_t offset = dst.physReg().byte() * 8u;
1339          uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;
1340          uint32_t val = (op.constantValue() << offset) & mask;
1341          dst = Definition(PhysReg(dst.physReg().reg()), v1);
1342          Operand def_op(dst.physReg(), v1);
1343          if (val != mask)
1344             bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);
1345          if (val != 0)
1346             bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);
1347       }
1348    }
1349 }
1350 
1351 bool
do_copy(lower_context * ctx,Builder & bld,const copy_operation & copy,bool * preserve_scc,PhysReg scratch_sgpr)1352 do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
1353         PhysReg scratch_sgpr)
1354 {
1355    bool did_copy = false;
1356    for (unsigned offset = 0; offset < copy.bytes;) {
1357       if (copy.uses[offset]) {
1358          offset++;
1359          continue;
1360       }
1361 
1362       Definition def;
1363       Operand op;
1364       split_copy(ctx, offset, &def, &op, copy, false, 8);
1365 
1366       if (def.physReg() == scc) {
1367          bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());
1368          *preserve_scc = true;
1369       } else if (op.isConstant()) {
1370          copy_constant(ctx, bld, def, op);
1371       } else if (def.regClass() == v1) {
1372          bld.vop1(aco_opcode::v_mov_b32, def, op);
1373       } else if (def.regClass() == v2) {
1374          bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
1375       } else if (def.regClass() == s1) {
1376          bld.sop1(aco_opcode::s_mov_b32, def, op);
1377       } else if (def.regClass() == s2) {
1378          bld.sop1(aco_opcode::s_mov_b64, def, op);
1379       } else if (def.regClass() == v1b && ctx->program->gfx_level >= GFX11) {
1380          uint8_t swiz[] = {4, 5, 6, 7};
1381          swiz[def.physReg().byte()] = op.physReg().byte();
1382          create_bperm(bld, swiz, def, op);
1383       } else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1384          emit_v_mov_b16(bld, def, op);
1385       } else if (def.regClass().is_subdword()) {
1386          bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
1387       } else {
1388          unreachable("unsupported copy");
1389       }
1390 
1391       did_copy = true;
1392       offset += def.bytes();
1393    }
1394    return did_copy;
1395 }
1396 
1397 void
swap_subdword_gfx11(Builder & bld,Definition def,Operand op)1398 swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
1399 {
1400    if (def.physReg().reg() == op.physReg().reg()) {
1401       assert(def.bytes() != 2); /* handled by caller */
1402       uint8_t swiz[] = {4, 5, 6, 7};
1403       std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]);
1404       create_bperm(bld, swiz, def, Operand::zero());
1405       return;
1406    }
1407 
1408    if (def.bytes() == 2) {
1409       Operand def_as_op = Operand(def.physReg(), def.regClass());
1410       Definition op_as_def = Definition(op.physReg(), op.regClass());
1411       /* v_swap_b16 is not offically supported as VOP3, so it can't be used with v128-255.
1412        * Tests show that VOP3 appears to work correctly, but according to AMD that should
1413        * not be relied on.
1414        */
1415       if (def.physReg() < (256 + 128) && op.physReg() < (256 + 128)) {
1416          Instruction* instr = bld.vop1(aco_opcode::v_swap_b16, def, op_as_def, op, def_as_op);
1417          instr->valu().opsel[0] = op.physReg().byte();
1418          instr->valu().opsel[3] = def.physReg().byte();
1419       } else {
1420          Instruction* instr = bld.vop3(aco_opcode::v_xor_b16, def, op, def_as_op);
1421          instr->valu().opsel[0] = op.physReg().byte();
1422          instr->valu().opsel[1] = def_as_op.physReg().byte();
1423          instr->valu().opsel[3] = def.physReg().byte();
1424          instr = bld.vop3(aco_opcode::v_xor_b16, op_as_def, op, def_as_op);
1425          instr->valu().opsel[0] = op.physReg().byte();
1426          instr->valu().opsel[1] = def_as_op.physReg().byte();
1427          instr->valu().opsel[3] = op_as_def.physReg().byte();
1428          instr = bld.vop3(aco_opcode::v_xor_b16, def, op, def_as_op);
1429          instr->valu().opsel[0] = op.physReg().byte();
1430          instr->valu().opsel[1] = def_as_op.physReg().byte();
1431          instr->valu().opsel[3] = def.physReg().byte();
1432       }
1433    } else {
1434       PhysReg op_half = op.physReg();
1435       op_half.reg_b &= ~1;
1436 
1437       PhysReg def_other_half = def.physReg();
1438       def_other_half.reg_b &= ~1;
1439       def_other_half.reg_b ^= 2;
1440 
1441       /* We can only swap individual bytes within a single VGPR, so temporarily move both bytes
1442        * into the same VGPR.
1443        */
1444       swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1445       swap_subdword_gfx11(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b));
1446       swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1447    }
1448 }
1449 
1450 void
do_swap(lower_context * ctx,Builder & bld,const copy_operation & copy,bool preserve_scc,Pseudo_instruction * pi)1451 do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,
1452         Pseudo_instruction* pi)
1453 {
1454    unsigned offset = 0;
1455 
1456    if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
1457        (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
1458       /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte
1459        * swap */
1460       PhysReg op = copy.op.physReg();
1461       PhysReg def = copy.def.physReg();
1462       op.reg_b &= ~0x3;
1463       def.reg_b &= ~0x3;
1464 
1465       copy_operation tmp;
1466       tmp.op = Operand(op, v1);
1467       tmp.def = Definition(def, v1);
1468       tmp.bytes = 4;
1469       memset(tmp.uses, 1, 4);
1470       do_swap(ctx, bld, tmp, preserve_scc, pi);
1471 
1472       op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1473       def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1474       tmp.op = Operand(op, v1b);
1475       tmp.def = Definition(def, v1b);
1476       tmp.bytes = 1;
1477       tmp.uses[0] = 1;
1478       do_swap(ctx, bld, tmp, preserve_scc, pi);
1479 
1480       offset = copy.bytes;
1481    }
1482 
1483    for (; offset < copy.bytes;) {
1484       Definition def;
1485       Operand op;
1486       unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
1487       split_copy(ctx, offset, &def, &op, copy, true, max_size);
1488 
1489       assert(op.regClass() == def.regClass());
1490       Operand def_as_op = Operand(def.physReg(), def.regClass());
1491       Definition op_as_def = Definition(op.physReg(), op.regClass());
1492       if (ctx->program->gfx_level >= GFX9 && def.regClass() == v1) {
1493          bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1494       } else if (def.regClass() == v1) {
1495          assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
1496          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1497          bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1498          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1499       } else if (op.physReg() == scc || def.physReg() == scc) {
1500          /* we need to swap scc and another sgpr */
1501          assert(!preserve_scc);
1502 
1503          PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
1504 
1505          bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1506          bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),
1507                   Operand::zero());
1508          bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
1509       } else if (def.regClass() == s1) {
1510          if (preserve_scc) {
1511             bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
1512             bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
1513             bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
1514          } else {
1515             bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1516             bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
1517             bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1518          }
1519       } else if (def.regClass() == s2) {
1520          if (preserve_scc)
1521             bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1522          bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1523          bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
1524          bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1525          if (preserve_scc)
1526             bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
1527                      Operand::zero());
1528       } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
1529          bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
1530                   Operand::c32(2u));
1531       } else {
1532          assert(def.regClass().is_subdword());
1533          if (ctx->program->gfx_level >= GFX11) {
1534             swap_subdword_gfx11(bld, def, op);
1535          } else {
1536             bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1537             bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
1538             bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1539          }
1540       }
1541 
1542       offset += def.bytes();
1543    }
1544 
1545    if (ctx->program->gfx_level <= GFX7)
1546       return;
1547 
1548    /* fixup in case we swapped bytes we shouldn't have */
1549    copy_operation tmp_copy = copy;
1550    tmp_copy.op.setFixed(copy.def.physReg());
1551    tmp_copy.def.setFixed(copy.op.physReg());
1552    do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
1553 }
1554 
1555 void
do_pack_2x16(lower_context * ctx,Builder & bld,Definition def,Operand lo,Operand hi)1556 do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
1557 {
1558    assert(ctx->program->gfx_level >= GFX8);
1559 
1560    if (lo.isConstant() && hi.isConstant()) {
1561       copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
1562       return;
1563    }
1564 
1565    bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
1566                        (ctx->program->gfx_level >= GFX10 ||
1567                         (ctx->program->gfx_level >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));
1568 
1569    if (can_use_pack) {
1570       Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
1571       /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
1572       instr->valu().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
1573       return;
1574    }
1575 
1576    /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
1577    if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
1578        (!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
1579                                                     ctx->program->gfx_level >= GFX10)))) {
1580       if (hi.isConstant())
1581          bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
1582                   Operand::c32(2u));
1583       else
1584          bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1585       return;
1586    }
1587 
1588    Definition def_lo = Definition(def.physReg(), v2b);
1589    Definition def_hi = Definition(def.physReg().advance(2), v2b);
1590 
1591    if (lo.isConstant()) {
1592       /* move hi and zero low bits */
1593       if (hi.physReg().byte() == 0)
1594          bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1595       else
1596          bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
1597       if (lo.constantValue())
1598          bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
1599                   Operand(def.physReg(), v1));
1600       return;
1601    }
1602    if (hi.isConstant()) {
1603       /* move lo and zero high bits */
1604       if (lo.physReg().byte() == 2)
1605          bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1606       else if (ctx->program->gfx_level >= GFX11)
1607          bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
1608       else
1609          bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
1610       if (hi.constantValue())
1611          bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
1612                   Operand(def.physReg(), v1));
1613       return;
1614    }
1615 
1616    if (lo.physReg().reg() == def.physReg().reg()) {
1617       /* lo is in the high bits of def */
1618       assert(lo.physReg().byte() == 2);
1619       bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1620       lo.setFixed(def.physReg());
1621    } else if (hi.physReg() == def.physReg()) {
1622       /* hi is in the low bits of def */
1623       assert(hi.physReg().byte() == 0);
1624       bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1625       hi.setFixed(def.physReg().advance(2));
1626    } else if (ctx->program->gfx_level >= GFX8) {
1627       /* Either lo or hi can be placed with just a v_mov. SDWA is not needed, because
1628        * op.physReg().byte()==def.physReg().byte() and the other half will be overwritten.
1629        */
1630       assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
1631       Operand& op = lo.physReg().byte() == 0 ? lo : hi;
1632       PhysReg reg = def.physReg().advance(op.physReg().byte());
1633       bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
1634       op.setFixed(reg);
1635    }
1636 
1637    /* either hi or lo are already placed correctly */
1638    if (ctx->program->gfx_level >= GFX11) {
1639       if (lo.physReg().reg() == def.physReg().reg())
1640          emit_v_mov_b16(bld, def_hi, hi);
1641       else
1642          emit_v_mov_b16(bld, def_lo, lo);
1643    } else {
1644       if (lo.physReg().reg() == def.physReg().reg())
1645          bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
1646       else
1647          bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
1648    }
1649 }
1650 
1651 void
try_coalesce_copies(lower_context * ctx,std::map<PhysReg,copy_operation> & copy_map,copy_operation & copy)1652 try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,
1653                     copy_operation& copy)
1654 {
1655    // TODO try more relaxed alignment for subdword copies
1656    unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);
1657    unsigned next_op_align = next_def_align;
1658    if (copy.def.regClass().type() == RegType::vgpr)
1659       next_def_align = MIN2(next_def_align, 4);
1660    if (copy.op.regClass().type() == RegType::vgpr)
1661       next_op_align = MIN2(next_op_align, 4);
1662 
1663    if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||
1664        (!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))
1665       return;
1666 
1667    auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));
1668    if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||
1669        copy.op.isConstant() != other->second.op.isConstant())
1670       return;
1671 
1672    /* don't create 64-bit copies before GFX10 */
1673    if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
1674        ctx->program->gfx_level < GFX10)
1675       return;
1676 
1677    unsigned new_size = copy.bytes + other->second.bytes;
1678    if (copy.op.isConstant()) {
1679       uint64_t val =
1680          copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));
1681       if (!util_is_power_of_two_or_zero(new_size))
1682          return;
1683       if (!Operand::is_constant_representable(val, new_size, true,
1684                                               copy.def.regClass().type() == RegType::vgpr))
1685          return;
1686       copy.op = Operand::get_const(ctx->program->gfx_level, val, new_size);
1687    } else {
1688       if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))
1689          return;
1690       copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size));
1691    }
1692 
1693    copy.bytes = new_size;
1694    copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes));
1695    copy_map.erase(other);
1696 }
1697 
1698 void
handle_operands(std::map<PhysReg,copy_operation> & copy_map,lower_context * ctx,amd_gfx_level gfx_level,Pseudo_instruction * pi)1699 handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
1700                 amd_gfx_level gfx_level, Pseudo_instruction* pi)
1701 {
1702    Builder bld(ctx->program, &ctx->instructions);
1703    unsigned num_instructions_before = ctx->instructions.size();
1704    aco_ptr<Instruction> mov;
1705    bool writes_scc = false;
1706 
1707    /* count the number of uses for each dst reg */
1708    for (auto it = copy_map.begin(); it != copy_map.end();) {
1709 
1710       if (it->second.def.physReg() == scc)
1711          writes_scc = true;
1712 
1713       assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));
1714 
1715       /* if src and dst reg are the same, remove operation */
1716       if (it->first == it->second.op.physReg()) {
1717          it = copy_map.erase(it);
1718          continue;
1719       }
1720 
1721       /* split large copies */
1722       if (it->second.bytes > 8) {
1723          assert(!it->second.op.isConstant());
1724          assert(!it->second.def.regClass().is_subdword());
1725          RegClass rc = it->second.def.regClass().resize(it->second.def.bytes() - 8);
1726          Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
1727          rc = it->second.op.regClass().resize(it->second.op.bytes() - 8);
1728          Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
1729          copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
1730          copy_map[hi_def.physReg()] = copy;
1731          assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
1732          it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().resize(8));
1733          it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().resize(8));
1734          it->second.bytes = 8;
1735       }
1736 
1737       try_coalesce_copies(ctx, copy_map, it->second);
1738 
1739       /* check if the definition reg is used by another copy operation */
1740       for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1741          if (copy.second.op.isConstant())
1742             continue;
1743          for (uint16_t i = 0; i < it->second.bytes; i++) {
1744             /* distance might underflow */
1745             unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
1746             if (distance < copy.second.bytes)
1747                it->second.uses[i] += 1;
1748          }
1749       }
1750 
1751       ++it;
1752    }
1753 
1754    /* first, handle paths in the location transfer graph */
1755    bool preserve_scc = pi->tmp_in_scc && !writes_scc;
1756    bool skip_partial_copies = true;
1757    for (auto it = copy_map.begin();;) {
1758       if (copy_map.empty()) {
1759          ctx->program->statistics[aco_statistic_copies] +=
1760             ctx->instructions.size() - num_instructions_before;
1761          return;
1762       }
1763       if (it == copy_map.end()) {
1764          if (!skip_partial_copies)
1765             break;
1766          skip_partial_copies = false;
1767          it = copy_map.begin();
1768       }
1769 
1770       /* check if we can pack one register at once */
1771       if (it->first.byte() == 0 && it->second.bytes == 2) {
1772          PhysReg reg_hi = it->first.advance(2);
1773          std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1774          if (other != copy_map.end() && other->second.bytes == 2) {
1775             /* check if the target register is otherwise unused */
1776             bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&
1777                                                      other->second.op.physReg() == it->first);
1778             bool unused_hi = !other->second.is_used ||
1779                              (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);
1780             if (unused_lo && unused_hi) {
1781                Operand lo = it->second.op;
1782                Operand hi = other->second.op;
1783                do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
1784                copy_map.erase(it);
1785                copy_map.erase(other);
1786 
1787                for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {
1788                   for (uint16_t i = 0; i < other2.second.bytes; i++) {
1789                      /* distance might underflow */
1790                      unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;
1791                      unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;
1792                      if (distance_lo < 2 || distance_hi < 2)
1793                         other2.second.uses[i] -= 1;
1794                   }
1795                }
1796                it = copy_map.begin();
1797                continue;
1798             }
1799          }
1800       }
1801 
1802       /* optimize constant copies to aligned sgpr pair that's otherwise unused. */
1803       if (it->first <= exec && (it->first % 2) == 0 && it->second.bytes == 4 &&
1804           it->second.op.isConstant() && !it->second.is_used) {
1805          PhysReg reg_hi = it->first.advance(4);
1806          std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1807          if (other != copy_map.end() && other->second.bytes == 4 && other->second.op.isConstant() &&
1808              !other->second.is_used) {
1809             uint64_t constant =
1810                it->second.op.constantValue64() | (other->second.op.constantValue64() << 32);
1811             copy_constant_sgpr(bld, Definition(it->first, s2), constant);
1812             copy_map.erase(it);
1813             copy_map.erase(other);
1814             it = copy_map.begin();
1815             continue;
1816          }
1817       }
1818 
1819       /* find portions where the target reg is not used as operand for any other copy */
1820       if (it->second.is_used) {
1821          if (it->second.op.isConstant() || skip_partial_copies) {
1822             /* we have to skip constants until is_used=0.
1823              * we also skip partial copies at the beginning to help coalescing */
1824             ++it;
1825             continue;
1826          }
1827 
1828          unsigned has_zero_use_bytes = 0;
1829          for (unsigned i = 0; i < it->second.bytes; i++)
1830             has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1831 
1832          if (has_zero_use_bytes) {
1833             /* Skipping partial copying and doing a v_swap_b32 and then fixup
1834              * copies is usually beneficial for sub-dword copies, but if doing
1835              * a partial copy allows further copies, it should be done instead. */
1836             bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1837             for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1838                if (partial_copy)
1839                   break;
1840                for (uint16_t i = 0; i < copy.second.bytes; i++) {
1841                   /* distance might underflow */
1842                   unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1843                   if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1844                       !it->second.uses[distance])
1845                      partial_copy = true;
1846                }
1847             }
1848 
1849             if (!partial_copy) {
1850                ++it;
1851                continue;
1852             }
1853          } else {
1854             /* full target reg is used: register swapping needed */
1855             ++it;
1856             continue;
1857          }
1858       }
1859 
1860       bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
1861       skip_partial_copies = did_copy;
1862       std::pair<PhysReg, copy_operation> copy = *it;
1863 
1864       if (it->second.is_used == 0) {
1865          /* the target reg is not used as operand for any other copy, so we
1866           * copied to all of it */
1867          copy_map.erase(it);
1868          it = copy_map.begin();
1869       } else {
1870          /* we only performed some portions of this copy, so split it to only
1871           * leave the portions that still need to be done */
1872          copy_operation original = it->second; /* the map insertion below can overwrite this */
1873          copy_map.erase(it);
1874          for (unsigned offset = 0; offset < original.bytes;) {
1875             if (original.uses[offset] == 0) {
1876                offset++;
1877                continue;
1878             }
1879             Definition def;
1880             Operand op;
1881             split_copy(ctx, offset, &def, &op, original, false, 8);
1882 
1883             copy_operation new_copy = {op, def, def.bytes()};
1884             for (unsigned i = 0; i < new_copy.bytes; i++)
1885                new_copy.uses[i] = original.uses[i + offset];
1886             copy_map[def.physReg()] = new_copy;
1887 
1888             offset += def.bytes();
1889          }
1890 
1891          it = copy_map.begin();
1892       }
1893 
1894       /* Reduce the number of uses of the operand reg by one. Do this after
1895        * splitting the copy or removing it in case the copy writes to it's own
1896        * operand (for example, v[7:8] = v[8:9]) */
1897       if (did_copy && !copy.second.op.isConstant()) {
1898          for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
1899             for (uint16_t i = 0; i < other.second.bytes; i++) {
1900                /* distance might underflow */
1901                unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
1902                if (distance < copy.second.bytes && !copy.second.uses[distance])
1903                   other.second.uses[i] -= 1;
1904             }
1905          }
1906       }
1907    }
1908 
1909    /* all target regs are needed as operand somewhere which means, all entries are part of a cycle */
1910    unsigned largest = 0;
1911    for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
1912       largest = MAX2(largest, op.second.bytes);
1913 
1914    while (!copy_map.empty()) {
1915 
1916       /* Perform larger swaps first, because larger swaps swaps can make other
1917        * swaps unnecessary. */
1918       auto it = copy_map.begin();
1919       for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
1920          if (it2->second.bytes > it->second.bytes) {
1921             it = it2;
1922             if (it->second.bytes == largest)
1923                break;
1924          }
1925       }
1926 
1927       /* should already be done */
1928       assert(!it->second.op.isConstant());
1929 
1930       assert(it->second.op.isFixed());
1931       assert(it->second.def.regClass() == it->second.op.regClass());
1932 
1933       if (it->first == it->second.op.physReg()) {
1934          copy_map.erase(it);
1935          continue;
1936       }
1937 
1938       if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
1939          assert(!(it->second.def.physReg() == pi->scratch_sgpr));
1940 
1941       /* to resolve the cycle, we have to swap the src reg with the dst reg */
1942       copy_operation swap = it->second;
1943 
1944       /* if this is self-intersecting, we have to split it because
1945        * self-intersecting swaps don't make sense */
1946       PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
1947       if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
1948          unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);
1949 
1950          copy_operation remaining;
1951          src.reg_b += offset;
1952          dst.reg_b += offset;
1953          remaining.bytes = swap.bytes - offset;
1954          memcpy(remaining.uses, swap.uses + offset, remaining.bytes);
1955          remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes));
1956          remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes));
1957          copy_map[dst] = remaining;
1958 
1959          memset(swap.uses + offset, 0, swap.bytes - offset);
1960          swap.bytes = offset;
1961       }
1962 
1963       /* GFX6-7 can only swap full registers */
1964       assert (ctx->program->gfx_level > GFX7 || (swap.bytes % 4) == 0);
1965 
1966       do_swap(ctx, bld, swap, preserve_scc, pi);
1967 
1968       /* remove from map */
1969       copy_map.erase(it);
1970 
1971       /* change the operand reg of the target's uses and split uses if needed */
1972       uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
1973       for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {
1974          if (target->second.op.physReg() == swap.def.physReg() &&
1975              swap.bytes == target->second.bytes) {
1976             target->second.op.setFixed(swap.op.physReg());
1977             break;
1978          }
1979 
1980          uint32_t imask =
1981             get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
1982                                   target->second.op.physReg().reg_b, target->second.bytes);
1983 
1984          if (!imask)
1985             continue;
1986 
1987          int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
1988 
1989          /* split and update the middle (the portion that reads the swap's
1990           * definition) to read the swap's operand instead */
1991          int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
1992          int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
1993          int before_bytes = MAX2(-offset, 0);
1994          int after_bytes = MAX2(target_op_end - swap_def_end, 0);
1995          int middle_bytes = target->second.bytes - before_bytes - after_bytes;
1996 
1997          if (after_bytes) {
1998             unsigned after_offset = before_bytes + middle_bytes;
1999             assert(after_offset > 0);
2000             copy_operation copy;
2001             copy.bytes = after_bytes;
2002             memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
2003             RegClass rc = target->second.op.regClass().resize(after_bytes);
2004             copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
2005             copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
2006             copy_map[copy.def.physReg()] = copy;
2007          }
2008 
2009          if (middle_bytes) {
2010             copy_operation copy;
2011             copy.bytes = middle_bytes;
2012             memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
2013             RegClass rc = target->second.op.regClass().resize(middle_bytes);
2014             copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
2015             copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
2016             copy_map[copy.def.physReg()] = copy;
2017          }
2018 
2019          if (before_bytes) {
2020             copy_operation copy;
2021             target->second.bytes = before_bytes;
2022             RegClass rc = target->second.op.regClass().resize(before_bytes);
2023             target->second.op = Operand(target->second.op.physReg(), rc);
2024             target->second.def = Definition(target->second.def.physReg(), rc);
2025             memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
2026          }
2027 
2028          /* break early since we know each byte of the swap's definition is used
2029           * at most once */
2030          bytes_left &= ~imask;
2031          if (!bytes_left)
2032             break;
2033       }
2034    }
2035    ctx->program->statistics[aco_statistic_copies] +=
2036       ctx->instructions.size() - num_instructions_before;
2037 }
2038 
2039 void
handle_operands_linear_vgpr(std::map<PhysReg,copy_operation> & copy_map,lower_context * ctx,amd_gfx_level gfx_level,Pseudo_instruction * pi)2040 handle_operands_linear_vgpr(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
2041                             amd_gfx_level gfx_level, Pseudo_instruction* pi)
2042 {
2043    Builder bld(ctx->program, &ctx->instructions);
2044 
2045    for (auto& copy : copy_map) {
2046       copy.second.op =
2047          Operand(copy.second.op.physReg(), RegClass::get(RegType::vgpr, copy.second.op.bytes()));
2048       copy.second.def = Definition(copy.second.def.physReg(),
2049                                    RegClass::get(RegType::vgpr, copy.second.def.bytes()));
2050    }
2051 
2052    std::map<PhysReg, copy_operation> second_map(copy_map);
2053    handle_operands(second_map, ctx, gfx_level, pi);
2054 
2055    bool tmp_in_scc = pi->tmp_in_scc;
2056    if (tmp_in_scc) {
2057       bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
2058       pi->tmp_in_scc = false;
2059    }
2060    bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm));
2061 
2062    handle_operands(copy_map, ctx, gfx_level, pi);
2063 
2064    bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm));
2065    if (tmp_in_scc) {
2066       bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
2067                Operand::zero());
2068       pi->tmp_in_scc = true;
2069    }
2070 
2071    ctx->program->statistics[aco_statistic_copies] += tmp_in_scc ? 4 : 2;
2072 }
2073 
2074 void
emit_set_mode(Builder & bld,float_mode new_mode,bool set_round,bool set_denorm)2075 emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
2076 {
2077    if (bld.program->gfx_level >= GFX10) {
2078       if (set_round)
2079          bld.sopp(aco_opcode::s_round_mode, new_mode.round);
2080       if (set_denorm)
2081          bld.sopp(aco_opcode::s_denorm_mode, new_mode.denorm);
2082    } else if (set_round || set_denorm) {
2083       /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
2084       bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(new_mode.val), (7 << 11) | 1);
2085    }
2086 }
2087 
2088 void
emit_set_mode_from_block(Builder & bld,Program & program,Block * block)2089 emit_set_mode_from_block(Builder& bld, Program& program, Block* block)
2090 {
2091    float_mode initial;
2092    initial.val = program.config->float_mode;
2093 
2094    bool inital_unknown =
2095       (program.info.merged_shader_compiled_separately && program.stage.sw == SWStage::GS) ||
2096       (program.info.merged_shader_compiled_separately && program.stage.sw == SWStage::TCS);
2097    bool is_start = block->index == 0;
2098    bool set_round = is_start && (inital_unknown || block->fp_mode.round != initial.round);
2099    bool set_denorm = is_start && (inital_unknown || block->fp_mode.denorm != initial.denorm);
2100    if (block->kind & block_kind_top_level) {
2101       for (unsigned pred : block->linear_preds) {
2102          if (program.blocks[pred].fp_mode.round != block->fp_mode.round)
2103             set_round = true;
2104          if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
2105             set_denorm = true;
2106       }
2107    }
2108    /* only allow changing modes at top-level blocks so this doesn't break
2109     * the "jump over empty blocks" optimization */
2110    assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
2111    emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
2112 }
2113 
2114 void
lower_image_sample(lower_context * ctx,aco_ptr<Instruction> & instr)2115 lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
2116 {
2117    Operand linear_vgpr = instr->operands[3];
2118 
2119    unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
2120    unsigned vaddr_size = linear_vgpr.size();
2121    unsigned num_copied_vgprs = instr->operands.size() - 4;
2122    nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
2123                  ? nsa_size
2124                  : 0;
2125 
2126    Operand vaddr[16];
2127    unsigned num_vaddr = 0;
2128 
2129    if (nsa_size) {
2130       assert(num_copied_vgprs <= nsa_size);
2131       for (unsigned i = 0; i < num_copied_vgprs; i++)
2132          vaddr[num_vaddr++] = instr->operands[4 + i];
2133       for (unsigned i = num_copied_vgprs; i < std::min(vaddr_size, nsa_size); i++)
2134          vaddr[num_vaddr++] = Operand(linear_vgpr.physReg().advance(i * 4), v1);
2135       if (vaddr_size > nsa_size) {
2136          RegClass rc = RegClass::get(RegType::vgpr, (vaddr_size - nsa_size) * 4);
2137          vaddr[num_vaddr++] = Operand(PhysReg(linear_vgpr.physReg().advance(nsa_size * 4)), rc);
2138       }
2139    } else {
2140       PhysReg reg = linear_vgpr.physReg();
2141       std::map<PhysReg, copy_operation> copy_operations;
2142       for (unsigned i = 4; i < instr->operands.size(); i++) {
2143          Operand arg = instr->operands[i];
2144          Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
2145          copy_operations[def.physReg()] = {arg, def, def.bytes()};
2146          reg = reg.advance(arg.bytes());
2147       }
2148       vaddr[num_vaddr++] = linear_vgpr;
2149 
2150       Pseudo_instruction pi = {};
2151       handle_operands(copy_operations, ctx, ctx->program->gfx_level, &pi);
2152    }
2153 
2154    instr->mimg().strict_wqm = false;
2155 
2156    if ((3 + num_vaddr) > instr->operands.size()) {
2157       Instruction* new_instr =
2158          create_instruction(instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
2159       std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
2160                 new_instr->definitions.begin());
2161       new_instr->operands[0] = instr->operands[0];
2162       new_instr->operands[1] = instr->operands[1];
2163       new_instr->operands[2] = instr->operands[2];
2164       memcpy((uint8_t*)new_instr + sizeof(Instruction), (uint8_t*)instr.get() + sizeof(Instruction),
2165              sizeof(MIMG_instruction) - sizeof(Instruction));
2166       instr.reset(new_instr);
2167    } else {
2168       while (instr->operands.size() > (3 + num_vaddr))
2169          instr->operands.pop_back();
2170    }
2171    std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
2172 }
2173 
2174 } /* end namespace */
2175 
2176 void
hw_init_scratch(Builder & bld,Definition def,Operand scratch_addr,Operand scratch_offset)2177 hw_init_scratch(Builder& bld, Definition def, Operand scratch_addr, Operand scratch_offset)
2178 {
2179    /* Since we know what the high 16 bits of scratch_hi is, we can set all the high 16
2180     * bits in the same instruction that we add the carry.
2181     */
2182    Operand hi_add = Operand::c32(0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1));
2183    Operand scratch_addr_lo(scratch_addr.physReg(), s1);
2184    Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1);
2185 
2186    if (bld.program->gfx_level >= GFX10) {
2187       PhysReg scratch_lo = def.physReg();
2188       PhysReg scratch_hi = def.physReg().advance(4);
2189 
2190       bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo, s1), Definition(scc, s1),
2191                scratch_addr_lo, scratch_offset);
2192       bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi, s1), Definition(scc, s1),
2193                scratch_addr_hi, hi_add, Operand(scc, s1));
2194 
2195       /* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register
2196        * 20/21) */
2197       bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_lo, s1), (31 << 11) | 20);
2198       bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_hi, s1), (31 << 11) | 21);
2199    } else {
2200       bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1),
2201                scratch_addr_lo, scratch_offset);
2202       bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1),
2203                scratch_addr_hi, hi_add, Operand(scc, s1));
2204    }
2205 }
2206 
2207 void
lower_to_hw_instr(Program * program)2208 lower_to_hw_instr(Program* program)
2209 {
2210    gfx9_pops_done_msg_bounds pops_done_msg_bounds;
2211    if (program->has_pops_overlapped_waves_wait && program->gfx_level < GFX11) {
2212       pops_done_msg_bounds = gfx9_pops_done_msg_bounds(program);
2213    }
2214 
2215    Block* discard_exit_block = NULL;
2216    Block* discard_pops_done_and_exit_block = NULL;
2217 
2218    int end_with_regs_block_index = -1;
2219 
2220    bool should_dealloc_vgprs = dealloc_vgprs(program);
2221 
2222    for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
2223       Block* block = &program->blocks[block_idx];
2224       lower_context ctx;
2225       ctx.program = program;
2226       ctx.block = block;
2227       ctx.instructions.reserve(block->instructions.size());
2228       Builder bld(program, &ctx.instructions);
2229 
2230       emit_set_mode_from_block(bld, *program, block);
2231 
2232       for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {
2233          aco_ptr<Instruction>& instr = block->instructions[instr_idx];
2234 
2235          /* Send the ordered section done message from the middle of the block if needed (if the
2236           * ordered section is ended by an instruction inside this block).
2237           * Also make sure the done message is sent if it's needed in case early exit happens for
2238           * any reason.
2239           */
2240          if ((block_idx == pops_done_msg_bounds.end_block_idx() &&
2241               instr_idx == pops_done_msg_bounds.instr_after_end_idx()) ||
2242              (instr->opcode == aco_opcode::s_endpgm &&
2243               pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx))) {
2244             bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
2245          }
2246 
2247          aco_ptr<Instruction> mov;
2248          if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) {
2249             Pseudo_instruction* pi = &instr->pseudo();
2250 
2251             switch (instr->opcode) {
2252             case aco_opcode::p_extract_vector: {
2253                PhysReg reg = instr->operands[0].physReg();
2254                Definition& def = instr->definitions[0];
2255                reg.reg_b += instr->operands[1].constantValue() * def.bytes();
2256 
2257                if (reg == def.physReg())
2258                   break;
2259 
2260                RegClass op_rc = def.regClass().is_subdword()
2261                                    ? def.regClass()
2262                                    : RegClass(instr->operands[0].getTemp().type(), def.size());
2263                std::map<PhysReg, copy_operation> copy_operations;
2264                copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
2265                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2266                break;
2267             }
2268             case aco_opcode::p_create_vector:
2269             case aco_opcode::p_start_linear_vgpr: {
2270                if (instr->operands.empty())
2271                   break;
2272 
2273                std::map<PhysReg, copy_operation> copy_operations;
2274                PhysReg reg = instr->definitions[0].physReg();
2275 
2276                for (const Operand& op : instr->operands) {
2277                   RegClass rc = RegClass::get(instr->definitions[0].regClass().type(), op.bytes());
2278                   if (op.isConstant()) {
2279                      const Definition def = Definition(reg, rc);
2280                      copy_operations[reg] = {op, def, op.bytes()};
2281                      reg.reg_b += op.bytes();
2282                      continue;
2283                   }
2284                   if (op.isUndefined()) {
2285                      // TODO: coalesce subdword copies if dst byte is 0
2286                      reg.reg_b += op.bytes();
2287                      continue;
2288                   }
2289 
2290                   RegClass rc_def = op.regClass().is_subdword() ? op.regClass() : rc;
2291                   const Definition def = Definition(reg, rc_def);
2292                   copy_operations[def.physReg()] = {op, def, op.bytes()};
2293                   reg.reg_b += op.bytes();
2294                }
2295                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2296                break;
2297             }
2298             case aco_opcode::p_split_vector: {
2299                std::map<PhysReg, copy_operation> copy_operations;
2300                PhysReg reg = instr->operands[0].physReg();
2301 
2302                for (const Definition& def : instr->definitions) {
2303                   RegClass rc_op = def.regClass().is_subdword()
2304                                       ? def.regClass()
2305                                       : instr->operands[0].getTemp().regClass().resize(def.bytes());
2306                   const Operand op = Operand(reg, rc_op);
2307                   copy_operations[def.physReg()] = {op, def, def.bytes()};
2308                   reg.reg_b += def.bytes();
2309                }
2310                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2311                break;
2312             }
2313             case aco_opcode::p_parallelcopy: {
2314                std::map<PhysReg, copy_operation> copy_operations;
2315                bool linear_vgpr = false;
2316                bool non_linear_vgpr = false;
2317                for (unsigned j = 0; j < instr->operands.size(); j++) {
2318                   assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
2319                   copy_operations[instr->definitions[j].physReg()] = {
2320                      instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
2321                   linear_vgpr |= instr->definitions[j].regClass().is_linear_vgpr();
2322                   non_linear_vgpr |= !instr->definitions[j].regClass().is_linear_vgpr();
2323                }
2324                assert(!linear_vgpr || !non_linear_vgpr);
2325                if (linear_vgpr)
2326                   handle_operands_linear_vgpr(copy_operations, &ctx, program->gfx_level, pi);
2327                else
2328                   handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2329                break;
2330             }
2331             case aco_opcode::p_exit_early_if: {
2332                /* don't bother with an early exit near the end of the program */
2333                if ((block->instructions.size() - 1 - instr_idx) <= 4 &&
2334                    block->instructions.back()->opcode == aco_opcode::s_endpgm) {
2335                   unsigned null_exp_dest =
2336                      program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
2337                   bool ignore_early_exit = true;
2338 
2339                   for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {
2340                      const aco_ptr<Instruction>& instr2 = block->instructions[k];
2341                      if (instr2->opcode == aco_opcode::s_endpgm ||
2342                          instr2->opcode == aco_opcode::p_logical_end)
2343                         continue;
2344                      else if (instr2->opcode == aco_opcode::exp &&
2345                               instr2->exp().dest == null_exp_dest &&
2346                               instr2->exp().enabled_mask == 0)
2347                         continue;
2348                      else if (instr2->opcode == aco_opcode::p_parallelcopy &&
2349                               instr2->definitions[0].isFixed() &&
2350                               instr2->definitions[0].physReg() == exec)
2351                         continue;
2352 
2353                      ignore_early_exit = false;
2354                   }
2355 
2356                   if (ignore_early_exit)
2357                      break;
2358                }
2359 
2360                const bool discard_sends_pops_done =
2361                   pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx);
2362 
2363                Block* discard_block =
2364                   discard_sends_pops_done ? discard_pops_done_and_exit_block : discard_exit_block;
2365                if (!discard_block) {
2366                   discard_block = program->create_and_insert_block();
2367                   discard_block->kind = block_kind_discard_early_exit;
2368                   if (discard_sends_pops_done) {
2369                      discard_pops_done_and_exit_block = discard_block;
2370                   } else {
2371                      discard_exit_block = discard_block;
2372                   }
2373                   block = &program->blocks[block_idx];
2374 
2375                   bld.reset(discard_block);
2376                   if (program->has_pops_overlapped_waves_wait &&
2377                       (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
2378                      /* If this discard early exit potentially exits the POPS ordered section, do
2379                       * the waitcnt necessary before resuming overlapping waves as the normal
2380                       * waitcnt insertion doesn't work in a discard early exit block.
2381                       */
2382                      if (program->gfx_level >= GFX10)
2383                         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
2384                      wait_imm pops_exit_wait_imm;
2385                      pops_exit_wait_imm.vm = 0;
2386                      if (program->has_smem_buffer_or_global_loads)
2387                         pops_exit_wait_imm.lgkm = 0;
2388                      bld.sopp(aco_opcode::s_waitcnt, pops_exit_wait_imm.pack(program->gfx_level));
2389                   }
2390                   if (discard_sends_pops_done)
2391                      bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
2392                   unsigned target = V_008DFC_SQ_EXP_NULL;
2393                   if (program->gfx_level >= GFX11)
2394                      target =
2395                         program->has_color_exports ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_MRTZ;
2396                   if (program->stage == fragment_fs)
2397                      bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
2398                              target, false, true, true);
2399                   if (should_dealloc_vgprs) {
2400                      bld.sopp(aco_opcode::s_nop, 0);
2401                      bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
2402                   }
2403                   bld.sopp(aco_opcode::s_endpgm);
2404 
2405                   bld.reset(&ctx.instructions);
2406                }
2407 
2408                assert(instr->operands[0].physReg() == scc);
2409                bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);
2410 
2411                discard_block->linear_preds.push_back(block->index);
2412                block->linear_succs.push_back(discard_block->index);
2413                break;
2414             }
2415             case aco_opcode::p_spill: {
2416                assert(instr->operands[0].regClass() == v1.as_linear());
2417                for (unsigned i = 0; i < instr->operands[2].size(); i++) {
2418                   Operand src =
2419                      instr->operands[2].isConstant()
2420                         ? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))
2421                         : Operand(PhysReg{instr->operands[2].physReg() + i}, s1);
2422                   bld.writelane(bld.def(v1, instr->operands[0].physReg()), src,
2423                                 Operand::c32(instr->operands[1].constantValue() + i),
2424                                 instr->operands[0]);
2425                }
2426                break;
2427             }
2428             case aco_opcode::p_reload: {
2429                assert(instr->operands[0].regClass() == v1.as_linear());
2430                for (unsigned i = 0; i < instr->definitions[0].size(); i++)
2431                   bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2432                                instr->operands[0],
2433                                Operand::c32(instr->operands[1].constantValue() + i));
2434                break;
2435             }
2436             case aco_opcode::p_as_uniform: {
2437                if (instr->operands[0].isConstant() ||
2438                    instr->operands[0].regClass().type() == RegType::sgpr) {
2439                   std::map<PhysReg, copy_operation> copy_operations;
2440                   copy_operations[instr->definitions[0].physReg()] = {
2441                      instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
2442                   handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2443                } else {
2444                   assert(instr->operands[0].regClass().type() == RegType::vgpr);
2445                   assert(instr->definitions[0].regClass().type() == RegType::sgpr);
2446                   assert(instr->operands[0].size() == instr->definitions[0].size());
2447                   for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
2448                      bld.vop1(aco_opcode::v_readfirstlane_b32,
2449                               bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2450                               Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
2451                   }
2452                }
2453                break;
2454             }
2455             case aco_opcode::p_pops_gfx9_add_exiting_wave_id: {
2456                bld.sop2(aco_opcode::s_add_i32, instr->definitions[0], instr->definitions[1],
2457                         Operand(pops_exiting_wave_id, s1), instr->operands[0]);
2458                break;
2459             }
2460             case aco_opcode::p_bpermute_readlane: {
2461                emit_bpermute_readlane(bld, instr);
2462                break;
2463             }
2464             case aco_opcode::p_bpermute_shared_vgpr: {
2465                emit_bpermute_shared_vgpr(bld, instr);
2466                break;
2467             }
2468             case aco_opcode::p_bpermute_permlane: {
2469                emit_bpermute_permlane(bld, instr);
2470                break;
2471             }
2472             case aco_opcode::p_constaddr: {
2473                unsigned id = instr->definitions[0].tempId();
2474                PhysReg reg = instr->definitions[0].physReg();
2475                bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
2476                if (ctx.program->gfx_level >= GFX12)
2477                   bld.sop1(aco_opcode::s_sext_i32_i16, Definition(reg.advance(4), s1), Operand(reg.advance(4), s1));
2478                bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2479                         Operand(reg, s1), instr->operands[0], Operand::c32(id));
2480                /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2481                break;
2482             }
2483             case aco_opcode::p_resume_shader_address: {
2484                /* Find index of resume block. */
2485                unsigned resume_idx = instr->operands[0].constantValue();
2486                unsigned resume_block_idx = 0;
2487                for (Block& resume_block : program->blocks) {
2488                   if (resume_block.kind & block_kind_resume) {
2489                      if (resume_idx == 0) {
2490                         resume_block_idx = resume_block.index;
2491                         break;
2492                      }
2493                      resume_idx--;
2494                   }
2495                }
2496                assert(resume_block_idx != 0);
2497                unsigned id = instr->definitions[0].tempId();
2498                PhysReg reg = instr->definitions[0].physReg();
2499                bld.sop1(aco_opcode::p_resumeaddr_getpc, instr->definitions[0], Operand::c32(id));
2500                if (ctx.program->gfx_level >= GFX12)
2501                   bld.sop1(aco_opcode::s_sext_i32_i16, Definition(reg.advance(4), s1), Operand(reg.advance(4), s1));
2502                bld.sop2(aco_opcode::p_resumeaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2503                         Operand(reg, s1), Operand::c32(resume_block_idx), Operand::c32(id));
2504                /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2505                break;
2506             }
2507             case aco_opcode::p_extract: {
2508                assert(instr->operands[1].isConstant());
2509                assert(instr->operands[2].isConstant());
2510                assert(instr->operands[3].isConstant());
2511                if (instr->definitions[0].regClass() == s1)
2512                   assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2513                Definition dst = instr->definitions[0];
2514                Operand op = instr->operands[0];
2515                unsigned bits = instr->operands[2].constantValue();
2516                unsigned index = instr->operands[1].constantValue();
2517                unsigned offset = index * bits;
2518                bool signext = !instr->operands[3].constantEquals(0);
2519 
2520                if (dst.regClass() == s1) {
2521                   if (offset == 0 && signext && (bits == 8 || bits == 16)) {
2522                      bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
2523                               dst, op);
2524                   } else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
2525                      bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
2526                   } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16 &&
2527                              !signext) {
2528                      bld.sop2(aco_opcode::s_pack_hh_b32_b16, dst, op, Operand::zero());
2529                   } else if (offset == (32 - bits)) {
2530                      bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
2531                               bld.def(s1, scc), op, Operand::c32(offset));
2532                   } else {
2533                      bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
2534                               bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
2535                   }
2536                } else if (dst.regClass() == v1 && op.physReg().byte() == 0) {
2537                   assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
2538                   if (offset == (32 - bits) && op.regClass() != s1) {
2539                      bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
2540                               Operand::c32(offset), op);
2541                   } else if (offset == 0 && bits == 16 && ctx.program->gfx_level >= GFX11) {
2542                      bld.vop1(signext ? aco_opcode::v_cvt_i32_i16 : aco_opcode::v_cvt_u32_u16, dst,
2543                               op);
2544                   } else {
2545                      bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,
2546                               Operand::c32(offset), Operand::c32(bits));
2547                   }
2548                } else {
2549                   assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b ||
2550                          op.regClass() == v1b);
2551                   if (ctx.program->gfx_level >= GFX11) {
2552                      unsigned op_vgpr_byte = op.physReg().byte() + offset / 8;
2553                      unsigned sign_byte = op_vgpr_byte + bits / 8 - 1;
2554 
2555                      uint8_t swiz[4] = {4, 5, 6, 7};
2556                      swiz[dst.physReg().byte()] = op_vgpr_byte;
2557                      if (bits == 16)
2558                         swiz[dst.physReg().byte() + 1] = op_vgpr_byte + 1;
2559                      for (unsigned i = bits / 8; i < dst.bytes(); i++) {
2560                         uint8_t ext = bperm_0;
2561                         if (signext) {
2562                            if (sign_byte == 1)
2563                               ext = bperm_b1_sign;
2564                            else if (sign_byte == 3)
2565                               ext = bperm_b3_sign;
2566                            else /* replicate so sign-extension can be done later */
2567                               ext = sign_byte;
2568                         }
2569                         swiz[dst.physReg().byte() + i] = ext;
2570                      }
2571                      create_bperm(bld, swiz, dst, op);
2572 
2573                      if (signext && sign_byte != 3 && sign_byte != 1) {
2574                         assert(bits == 8);
2575                         assert(dst.regClass() == v2b || dst.regClass() == v1);
2576                         uint8_t ext_swiz[4] = {4, 5, 6, 7};
2577                         uint8_t ext = dst.physReg().byte() == 2 ? bperm_b7_sign : bperm_b5_sign;
2578                         memset(ext_swiz + dst.physReg().byte() + 1, ext, dst.bytes() - 1);
2579                         create_bperm(bld, ext_swiz, dst, Operand::zero());
2580                      }
2581                   } else {
2582                      SDWA_instruction& sdwa = bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa();
2583                      sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext);
2584                   }
2585                }
2586                break;
2587             }
2588             case aco_opcode::p_insert: {
2589                assert(instr->operands[1].isConstant());
2590                assert(instr->operands[2].isConstant());
2591                if (instr->definitions[0].regClass() == s1)
2592                   assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2593                Definition dst = instr->definitions[0];
2594                Operand op = instr->operands[0];
2595                unsigned bits = instr->operands[2].constantValue();
2596                unsigned index = instr->operands[1].constantValue();
2597                unsigned offset = index * bits;
2598 
2599                bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11;
2600                if (dst.regClass() == s1) {
2601                   if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
2602                      bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
2603                   } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16) {
2604                      bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, Operand::zero(), op);
2605                   } else if (offset == (32 - bits)) {
2606                      bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,
2607                               Operand::c32(offset));
2608                   } else if (offset == 0) {
2609                      bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2610                               Operand::c32(bits << 16));
2611                   } else {
2612                      bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2613                               Operand::c32(bits << 16));
2614                      bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc),
2615                               Operand(dst.physReg(), s1), Operand::c32(offset));
2616                   }
2617                } else if (dst.regClass() == v1 || !has_sdwa) {
2618                   if (offset == (dst.bytes() * 8u - bits) && dst.regClass() == v1) {
2619                      bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
2620                   } else if (offset == 0 && dst.regClass() == v1) {
2621                      bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2622                   } else if (has_sdwa && (op.regClass() != s1 || program->gfx_level >= GFX9)) {
2623                      bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().dst_sel =
2624                         SubdwordSel(bits / 8, offset / 8, false);
2625                   } else if (program->gfx_level >= GFX11) {
2626                      uint8_t swiz[] = {4, 5, 6, 7};
2627                      for (unsigned i = 0; i < dst.bytes(); i++)
2628                         swiz[dst.physReg().byte() + i] = bperm_0;
2629                      for (unsigned i = 0; i < bits / 8; i++)
2630                         swiz[dst.physReg().byte() + i + offset / 8] = op.physReg().byte() + i;
2631                      create_bperm(bld, swiz, dst, op);
2632                   } else {
2633                      bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2634                      bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),
2635                               Operand(dst.physReg(), v1));
2636                   }
2637                } else {
2638                   assert(dst.regClass() == v2b);
2639                   if (!offset) {
2640                      bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().sel[0] =
2641                         SubdwordSel::ubyte;
2642                   } else if (program->gfx_level >= GFX9) {
2643                      bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op)
2644                         ->sdwa()
2645                         .sel[1] = SubdwordSel::ubyte;
2646                   } else {
2647                      assert(offset == 8);
2648                      Definition dst_hi = Definition(dst.physReg().advance(1), v1b);
2649                      bld.vop1_sdwa(aco_opcode::v_mov_b32, dst_hi, op)->sdwa().sel[0] =
2650                         SubdwordSel::ubyte;
2651                      uint32_t c = ~(BITFIELD_MASK(offset) << (dst.physReg().byte() * 8));
2652                      bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(c),
2653                               Operand(PhysReg(op.physReg().reg()), v1));
2654                   }
2655                }
2656                break;
2657             }
2658             case aco_opcode::p_init_scratch: {
2659                assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3);
2660                if (!program->config->scratch_bytes_per_wave)
2661                   break;
2662 
2663                Operand scratch_addr = instr->operands[0];
2664                if (scratch_addr.isUndefined()) {
2665                   PhysReg reg = instr->definitions[0].physReg();
2666                   bld.sop1(aco_opcode::p_load_symbol, Definition(reg, s1),
2667                            Operand::c32(aco_symbol_scratch_addr_lo));
2668                   bld.sop1(aco_opcode::p_load_symbol, Definition(reg.advance(4), s1),
2669                            Operand::c32(aco_symbol_scratch_addr_hi));
2670                   scratch_addr.setFixed(reg);
2671                } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
2672                   bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr,
2673                            Operand::zero());
2674                   scratch_addr.setFixed(instr->definitions[0].physReg());
2675                }
2676 
2677                hw_init_scratch(bld, instr->definitions[0], scratch_addr, instr->operands[1]);
2678                break;
2679             }
2680             case aco_opcode::p_jump_to_epilog: {
2681                if (pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx)) {
2682                   bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
2683                }
2684                bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
2685                break;
2686             }
2687             case aco_opcode::p_interp_gfx11: {
2688                assert(instr->definitions[0].regClass() == v1 ||
2689                       instr->definitions[0].regClass() == v2b);
2690                assert(instr->operands[0].regClass() == v1.as_linear());
2691                assert(instr->operands[1].isConstant());
2692                assert(instr->operands[2].isConstant());
2693                assert(instr->operands.back().physReg() == m0);
2694                Definition dst = instr->definitions[0];
2695                PhysReg lin_vgpr = instr->operands[0].physReg();
2696                unsigned attribute = instr->operands[1].constantValue();
2697                unsigned component = instr->operands[2].constantValue();
2698                uint16_t dpp_ctrl = 0;
2699                bool high_16bits = false;
2700                Operand coord1, coord2;
2701                if (instr->operands.size() == 7) {
2702                   assert(instr->operands[3].isConstant());
2703                   high_16bits = instr->operands[3].constantValue();
2704                   assert(instr->operands[4].regClass() == v1);
2705                   assert(instr->operands[5].regClass() == v1);
2706                   coord1 = instr->operands[4];
2707                   coord2 = instr->operands[5];
2708                } else {
2709                   assert(instr->operands[3].isConstant());
2710                   dpp_ctrl = instr->operands[3].constantValue();
2711                }
2712 
2713                bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
2714                           attribute, component);
2715 
2716                Operand p(lin_vgpr, v1);
2717                Operand dst_op(dst.physReg(), v1);
2718                if (instr->operands.size() == 5) {
2719                   bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
2720                } else if (dst.regClass() == v2b) {
2721                   bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
2722                                     coord1, p, high_16bits ? 0x5 : 0);
2723                   bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
2724                                     coord2, dst_op, high_16bits ? 0x1 : 0);
2725                } else {
2726                   bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p, coord1,
2727                                     p);
2728                   bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
2729                                     dst_op);
2730                }
2731                break;
2732             }
2733             case aco_opcode::p_dual_src_export_gfx11: {
2734                PhysReg dst0 = instr->definitions[0].physReg();
2735                PhysReg dst1 = instr->definitions[1].physReg();
2736                Definition exec_tmp = instr->definitions[2];
2737                Definition not_vcc_tmp = instr->definitions[3];
2738                Definition clobber_vcc = instr->definitions[4];
2739                Definition clobber_scc = instr->definitions[5];
2740 
2741                assert(exec_tmp.regClass() == bld.lm);
2742                assert(not_vcc_tmp.regClass() == bld.lm);
2743                assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
2744                assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
2745 
2746                bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm),
2747                         Operand(exec, bld.lm));
2748                bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
2749                         Operand(exec, bld.lm));
2750 
2751                uint8_t enabled_channels = 0;
2752                Operand mrt0[4], mrt1[4];
2753 
2754                copy_constant_sgpr(bld, clobber_vcc, 0x5555'5555'5555'5555ull);
2755 
2756                Operand src_even = Operand(clobber_vcc.physReg(), bld.lm);
2757 
2758                bld.sop1(Builder::s_not, not_vcc_tmp, clobber_scc, src_even);
2759 
2760                Operand src_odd = Operand(not_vcc_tmp.physReg(), bld.lm);
2761 
2762                for (unsigned i = 0; i < 4; i++) {
2763                   if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
2764                      mrt0[i] = instr->operands[i];
2765                      mrt1[i] = instr->operands[i + 4];
2766                      continue;
2767                   }
2768 
2769                   Operand src0 = instr->operands[i];
2770                   Operand src1 = instr->operands[i + 4];
2771 
2772                   /*      | even lanes | odd lanes
2773                    * mrt0 | src0 even  | src1 even
2774                    * mrt1 | src0 odd   | src1 odd
2775                    */
2776                   bld.vop2_dpp(aco_opcode::v_cndmask_b32, Definition(dst0, v1), src1, src0,
2777                                src_even, dpp_row_xmask(1));
2778                   bld.vop2_e64_dpp(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src0, src1,
2779                                    src_odd, dpp_row_xmask(1));
2780 
2781                   mrt0[i] = Operand(dst0, v1);
2782                   mrt1[i] = Operand(dst1, v1);
2783 
2784                   enabled_channels |= 1 << i;
2785 
2786                   dst0 = dst0.advance(4);
2787                   dst1 = dst1.advance(4);
2788                }
2789 
2790                bld.sop1(Builder::s_mov, Definition(exec, bld.lm),
2791                         Operand(exec_tmp.physReg(), bld.lm));
2792 
2793                /* Force export all channels when everything is undefined. */
2794                if (!enabled_channels)
2795                   enabled_channels = 0xf;
2796 
2797                bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels,
2798                        V_008DFC_SQ_EXP_MRT + 21, false);
2799                bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels,
2800                        V_008DFC_SQ_EXP_MRT + 22, false);
2801                break;
2802             }
2803             case aco_opcode::p_end_with_regs: {
2804                end_with_regs_block_index = block->index;
2805                break;
2806             }
2807             case aco_opcode::p_shader_cycles_hi_lo_hi: {
2808                unsigned shader_cycles_lo = 29;
2809                unsigned shader_cycles_hi = 30;
2810                bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[0],
2811                         ((32 - 1) << 11) | shader_cycles_hi);
2812                bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[1],
2813                         ((32 - 1) << 11) | shader_cycles_lo);
2814                bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[2],
2815                         ((32 - 1) << 11) | shader_cycles_hi);
2816                break;
2817             }
2818             default: break;
2819             }
2820          } else if (instr->isBranch()) {
2821             Pseudo_branch_instruction* branch = &instr->branch();
2822             const uint32_t target = branch->target[0];
2823             const bool uniform_branch = !((branch->opcode == aco_opcode::p_cbranch_z ||
2824                                            branch->opcode == aco_opcode::p_cbranch_nz) &&
2825                                           branch->operands[0].physReg() == exec);
2826 
2827             if (branch->never_taken) {
2828                assert(!uniform_branch);
2829                continue;
2830             }
2831 
2832             /* Check if the branch instruction can be removed.
2833              * This is beneficial when executing the next block with an empty exec mask
2834              * is faster than the branch instruction itself.
2835              *
2836              * Override this judgement when:
2837              * - The application prefers to remove control flow
2838              * - The compiler stack knows that it's a divergent branch always taken
2839              */
2840             const bool prefer_remove = branch->rarely_taken;
2841             bool can_remove = block->index < target;
2842             unsigned num_scalar = 0;
2843             unsigned num_vector = 0;
2844 
2845             /* Check the instructions between branch and target */
2846             for (unsigned i = block->index + 1; i < branch->target[0]; i++) {
2847                /* Uniform conditional branches must not be ignored if they
2848                 * are about to jump over actual instructions */
2849                if (uniform_branch && !program->blocks[i].instructions.empty())
2850                   can_remove = false;
2851 
2852                if (!can_remove)
2853                   break;
2854 
2855                for (aco_ptr<Instruction>& inst : program->blocks[i].instructions) {
2856                   if (inst->isSOPP()) {
2857                      if (instr_info.classes[(int)inst->opcode] == instr_class::branch) {
2858                         /* Discard early exits and loop breaks and continues should work fine with
2859                          * an empty exec mask.
2860                          */
2861                         bool is_break_continue =
2862                            program->blocks[i].kind & (block_kind_break | block_kind_continue);
2863                         bool discard_early_exit =
2864                            program->blocks[inst->salu().imm].kind & block_kind_discard_early_exit;
2865                         if ((inst->opcode != aco_opcode::s_cbranch_scc0 &&
2866                              inst->opcode != aco_opcode::s_cbranch_scc1) ||
2867                             (!discard_early_exit && !is_break_continue))
2868                            can_remove = false;
2869                      } else {
2870                         can_remove = false;
2871                      }
2872                   } else if (inst->isSALU()) {
2873                      num_scalar++;
2874                   } else if (inst->isVALU() || inst->isVINTRP()) {
2875                      if (instr->opcode == aco_opcode::v_writelane_b32 ||
2876                          instr->opcode == aco_opcode::v_writelane_b32_e64) {
2877                         /* writelane ignores exec, writing inactive lanes results in UB. */
2878                         can_remove = false;
2879                      }
2880                      num_vector++;
2881                      /* VALU which writes SGPRs are always executed on GFX10+ */
2882                      if (ctx.program->gfx_level >= GFX10) {
2883                         for (Definition& def : inst->definitions) {
2884                            if (def.regClass().type() == RegType::sgpr)
2885                               num_scalar++;
2886                         }
2887                      }
2888                   } else if (inst->isEXP() || inst->isSMEM() || inst->isBarrier()) {
2889                      /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs),
2890                       * SMEM might be an invalid access, and barriers are probably expensive. */
2891                      can_remove = false;
2892                   } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
2893                              inst->isLDSDIR()) {
2894                      // TODO: GFX6-9 can use vskip
2895                      can_remove = prefer_remove;
2896                   } else {
2897                      can_remove = false;
2898                      assert(false && "Pseudo instructions should be lowered by this point.");
2899                   }
2900 
2901                   if (!prefer_remove) {
2902                      /* Under these conditions, we shouldn't remove the branch.
2903                       * Don't care about the estimated cycles when the shader prefers flattening.
2904                       */
2905                      unsigned est_cycles;
2906                      if (ctx.program->gfx_level >= GFX10)
2907                         est_cycles = num_scalar * 2 + num_vector;
2908                      else
2909                         est_cycles = num_scalar * 4 + num_vector * 4;
2910 
2911                      if (est_cycles > 16)
2912                         can_remove = false;
2913                   }
2914 
2915                   if (!can_remove)
2916                      break;
2917                }
2918             }
2919 
2920             if (can_remove)
2921                continue;
2922 
2923             /* emit branch instruction */
2924             switch (instr->opcode) {
2925             case aco_opcode::p_branch:
2926                assert(block->linear_succs[0] == target);
2927                bld.sopp(aco_opcode::s_branch, branch->definitions[0], target);
2928                break;
2929             case aco_opcode::p_cbranch_nz:
2930                assert(block->linear_succs[1] == target);
2931                if (branch->operands[0].physReg() == exec)
2932                   bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target);
2933                else if (branch->operands[0].physReg() == vcc)
2934                   bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target);
2935                else {
2936                   assert(branch->operands[0].physReg() == scc);
2937                   bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target);
2938                }
2939                break;
2940             case aco_opcode::p_cbranch_z:
2941                assert(block->linear_succs[1] == target);
2942                if (branch->operands[0].physReg() == exec)
2943                   bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target);
2944                else if (branch->operands[0].physReg() == vcc)
2945                   bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target);
2946                else {
2947                   assert(branch->operands[0].physReg() == scc);
2948                   bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target);
2949                }
2950                break;
2951             default: unreachable("Unknown Pseudo branch instruction!");
2952             }
2953 
2954          } else if (instr->isReduction()) {
2955             Pseudo_reduction_instruction& reduce = instr->reduction();
2956             emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
2957                            reduce.operands[1].physReg(),    // tmp
2958                            reduce.definitions[1].physReg(), // stmp
2959                            reduce.operands[2].physReg(),    // vtmp
2960                            reduce.definitions[2].physReg(), // sitmp
2961                            reduce.operands[0], reduce.definitions[0]);
2962          } else if (instr->isBarrier()) {
2963             Pseudo_barrier_instruction& barrier = instr->barrier();
2964 
2965             /* Anything larger than a workgroup isn't possible. Anything
2966              * smaller requires no instructions and this pseudo instruction
2967              * would only be included to control optimizations. */
2968             bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
2969                                   program->workgroup_size > program->wave_size;
2970 
2971             bld.insert(std::move(instr));
2972             if (emit_s_barrier && ctx.program->gfx_level >= GFX12) {
2973                bld.sop1(aco_opcode::s_barrier_signal, Operand::c32(-1));
2974                bld.sopp(aco_opcode::s_barrier_wait, UINT16_MAX);
2975             } else if (emit_s_barrier) {
2976                bld.sopp(aco_opcode::s_barrier);
2977             }
2978          } else if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtne ||
2979                     instr->opcode == aco_opcode::p_s_cvt_f16_f32_rtne) {
2980             float_mode new_mode = block->fp_mode;
2981             new_mode.round16_64 = fp_round_ne;
2982             bool set_round = new_mode.round != block->fp_mode.round;
2983 
2984             emit_set_mode(bld, new_mode, set_round, false);
2985 
2986             if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtne)
2987                instr->opcode = aco_opcode::v_cvt_f16_f32;
2988             else
2989                instr->opcode = aco_opcode::s_cvt_f16_f32;
2990             ctx.instructions.emplace_back(std::move(instr));
2991 
2992             emit_set_mode(bld, block->fp_mode, set_round, false);
2993          } else if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32) {
2994             Definition def = instr->definitions[0];
2995             VALU_instruction& valu =
2996                bld.vop3(aco_opcode::v_cvt_pk_u8_f32, def, instr->operands[0],
2997                         Operand::c32(def.physReg().byte()), Operand(def.physReg(), v1))
2998                   ->valu();
2999             valu.abs = instr->valu().abs;
3000             valu.neg = instr->valu().neg;
3001          } else if (instr->isMIMG() && instr->mimg().strict_wqm) {
3002             lower_image_sample(&ctx, instr);
3003             ctx.instructions.emplace_back(std::move(instr));
3004          } else {
3005             ctx.instructions.emplace_back(std::move(instr));
3006          }
3007       }
3008 
3009       /* Send the ordered section done message from this block if it's needed in this block, but
3010        * instr_after_end_idx() points beyond the end of its instructions. This may commonly happen
3011        * if the common post-dominator of multiple end locations turns out to be an empty block.
3012        */
3013       if (block_idx == pops_done_msg_bounds.end_block_idx() &&
3014           pops_done_msg_bounds.instr_after_end_idx() >= block->instructions.size()) {
3015          bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
3016       }
3017 
3018       block->instructions = std::move(ctx.instructions);
3019    }
3020 
3021    /* If block with p_end_with_regs is not the last block (i.e. p_exit_early_if may append exit
3022     * block at last), create an exit block for it to branch to.
3023     */
3024    int last_block_index = program->blocks.size() - 1;
3025    if (end_with_regs_block_index >= 0 && end_with_regs_block_index != last_block_index) {
3026       Block* exit_block = program->create_and_insert_block();
3027       Block* end_with_regs_block = &program->blocks[end_with_regs_block_index];
3028       exit_block->linear_preds.push_back(end_with_regs_block->index);
3029       end_with_regs_block->linear_succs.push_back(exit_block->index);
3030 
3031       Builder bld(program, end_with_regs_block);
3032       bld.sopp(aco_opcode::s_branch, exit_block->index);
3033 
3034       /* For insert waitcnt pass to add waitcnt in exit block, otherwise waitcnt will be added
3035        * after the s_branch which won't be executed.
3036        */
3037       end_with_regs_block->kind &= ~block_kind_end_with_regs;
3038       exit_block->kind |= block_kind_end_with_regs;
3039    }
3040 
3041    program->progress = CompilationProgress::after_lower_to_hw;
3042 }
3043 
3044 } // namespace aco
3045