1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "aco_builder.h"
8 #include "aco_ir.h"
9
10 #include "common/sid.h"
11
12 #include <map>
13 #include <vector>
14
15 namespace aco {
16
17 namespace {
18
19 struct lower_context {
20 Program* program;
21 Block* block;
22 std::vector<aco_ptr<Instruction>> instructions;
23 };
24
/* Class for determining where s_sendmsg(MSG_ORDERED_PS_DONE) must be issued in a Primitive
 * Ordered Pixel Shader on GFX9-10.3.
 *
 * MSG_ORDERED_PS_DONE must be sent once after the ordered section is done along all execution
 * paths from the POPS packer ID hardware register setting to s_endpgm. It is, however, also okay
 * to send it if the packer ID is not going to be set at all by the wave, so some conservativeness
 * is fine.
 *
 * For simplicity, the message is sent from top-level blocks, since dominance and post-dominance
 * checks for any location in the shader are trivial there. Also for simplicity, it is sent
 * regardless of whether the POPS packer ID hardware register may already have been set up.
 *
 * Note that there can be multiple interlock end instructions in the shader.
 * SPV_EXT_fragment_shader_interlock requires OpEndInvocationInterlockEXT to be executed exactly
 * once by the invocation. However, there may be, for instance, multiple ordered sections, and
 * which one will be executed may depend on divergent control flow (some lanes may execute one
 * ordered section, other lanes may execute another). MSG_ORDERED_PS_DONE, however, is sent via a
 * scalar instruction, so it must be ensured that the message is sent after the last ordered
 * section in the entire wave.
 */
44 class gfx9_pops_done_msg_bounds {
45 public:
46 explicit gfx9_pops_done_msg_bounds() = default;
47
explicit gfx9_pops_done_msg_bounds(const Program* const program)
49 {
/* Find the top-level location after the last ordered section end pseudo-instruction in the
 * program.
 * Consider `p_pops_gfx9_overlapped_wave_wait_done` a boundary too: for safety, make sure the
 * message isn't sent while any wait hasn't fully completed yet (in case a begin-end-begin
 * situation somehow occurs, since the location of `p_pops_gfx9_ordered_section_done` is
 * controlled by the application). This assumes that the waits are the only things that need the
 * packer hardware register to be set at some point during or before them, and that it won't be
 * set anymore after the last wait.
 */
59 int last_top_level_block_idx = -1;
60 for (int block_idx = (int)program->blocks.size() - 1; block_idx >= 0; block_idx--) {
61 const Block& block = program->blocks[block_idx];
62 if (block.kind & block_kind_top_level) {
63 last_top_level_block_idx = block_idx;
64 }
65 for (size_t instr_idx = block.instructions.size() - 1; instr_idx + size_t(1) > 0;
66 instr_idx--) {
67 const aco_opcode opcode = block.instructions[instr_idx]->opcode;
68 if (opcode == aco_opcode::p_pops_gfx9_ordered_section_done ||
69 opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done) {
70 end_block_idx_ = last_top_level_block_idx;
71 /* The same block if it's already a top-level block, or the beginning of the next
72 * top-level block.
73 */
74 instr_after_end_idx_ = block_idx == end_block_idx_ ? instr_idx + 1 : 0;
75 break;
76 }
77 }
78 if (end_block_idx_ != -1) {
79 break;
80 }
81 }
82 }
83
84 /* If this is not -1, during the normal execution flow (not early exiting), MSG_ORDERED_PS_DONE
85 * must be sent in this block.
86 */
int end_block_idx() const { return end_block_idx_; }
88
/* If end_block_idx() is an existing block, during the normal execution flow (not early exiting),
 * MSG_ORDERED_PS_DONE must be sent before this instruction in the block end_block_idx().
 * If this index is out of bounds of the instructions in the end block, the message must be sent
 * at the end of that block.
 */
size_t instr_after_end_idx() const { return instr_after_end_idx_; }
95
96 /* Whether an instruction doing early exit (such as discard) needs to send MSG_ORDERED_PS_DONE
97 * before actually ending the program.
98 */
bool early_exit_needs_done_msg(const int block_idx, const size_t instr_idx) const
100 {
101 return block_idx <= end_block_idx_ &&
102 (block_idx != end_block_idx_ || instr_idx < instr_after_end_idx_);
103 }
104
105 private:
/* Initialize to an empty range for which the "is inside" comparisons fail for any block. */
109 int end_block_idx_ = -1;
110 size_t instr_after_end_idx_ = 0;
111 };
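/* A minimal usage sketch (illustrative only; the real query sites are in the lowering loop later
 * in this file, and `send_done_msg()` / `is_early_exit()` are just placeholders for emitting
 * s_sendmsg(MSG_ORDERED_PS_DONE) and detecting early-exit instructions):
 *
 *   gfx9_pops_done_msg_bounds bounds(program);
 *   // while lowering instruction `instr_idx` of block `block_idx`:
 *   if (block_idx == bounds.end_block_idx() && instr_idx == bounds.instr_after_end_idx())
 *      send_done_msg();
 *   if (is_early_exit(instr) && bounds.early_exit_needs_done_msg(block_idx, instr_idx))
 *      send_done_msg();
 */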
112
113 void
copy_constant_sgpr(Builder& bld, Definition dst, uint64_t constant)
115 {
116 if (dst.regClass() == s1) {
117 uint32_t imm = constant;
118 Operand op = Operand::get_const(bld.program->gfx_level, imm, 4);
119 if (op.isLiteral()) {
120 if (imm >= 0xffff8000 || imm <= 0x7fff) {
121 bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
122 return;
123 }
124
125 Operand rev_op = Operand::get_const(bld.program->gfx_level, util_bitreverse(imm), 4);
126 if (!rev_op.isLiteral()) {
127 bld.sop1(aco_opcode::s_brev_b32, dst, rev_op);
128 return;
129 }
130
131 unsigned start = (ffs(imm) - 1) & 0x1f;
132 unsigned size = util_bitcount(imm) & 0x1f;
133 if (BITFIELD_RANGE(start, size) == imm) {
134 bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));
135 return;
136 }
137
138 if (bld.program->gfx_level >= GFX9) {
139 Operand op_lo = Operand::c32(int32_t(int16_t(imm)));
140 Operand op_hi = Operand::c32(int32_t(int16_t(imm >> 16)));
141 if (!op_lo.isLiteral() && !op_hi.isLiteral()) {
142 bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op_lo, op_hi);
143 return;
144 }
145 }
146 }
147
148 bld.sop1(aco_opcode::s_mov_b32, dst, op);
149 return;
150 }
151
152 assert(dst.regClass() == s2);
153
154 bool can_use_mov = Operand::is_constant_representable(constant, 8, true, false);
155 if (can_use_mov && !Operand::c64(constant).isLiteral()) {
156 bld.sop1(aco_opcode::s_mov_b64, dst, Operand::c64(constant));
157 return;
158 }
159
160 unsigned start = (ffsll(constant) - 1) & 0x3f;
161 unsigned size = util_bitcount64(constant) & 0x3f;
162 if (BITFIELD64_RANGE(start, size) == constant) {
163 bld.sop2(aco_opcode::s_bfm_b64, dst, Operand::c32(size), Operand::c32(start));
164 return;
165 }
166
167 uint64_t rev = ((uint64_t)util_bitreverse(constant) << 32) | util_bitreverse(constant >> 32);
168 if (Operand::is_constant_representable(rev, 8, true, false)) {
169 bld.sop1(aco_opcode::s_brev_b64, dst, Operand::c64(rev));
170 return;
171 }
172
173 if (can_use_mov) {
174 bld.sop1(aco_opcode::s_mov_b64, dst, Operand::c64(constant));
175 return;
176 }
177
178 uint32_t derep = 0;
179 bool can_use_rep = bld.program->gfx_level >= GFX9;
180 for (unsigned i = 0; can_use_rep && i < 32; i++) {
181 uint32_t lo = (constant >> (i * 2)) & 0x1;
182 uint32_t hi = (constant >> ((i * 2) + 1)) & 0x1;
183 can_use_rep &= lo == hi;
184 derep |= lo << i;
185 }
186 if (can_use_rep) {
187 bld.sop1(aco_opcode::s_bitreplicate_b64_b32, dst, Operand::c32(derep));
188 return;
189 }
190
191 copy_constant_sgpr(bld, Definition(dst.physReg(), s1), (uint32_t)constant);
192 copy_constant_sgpr(bld, Definition(dst.physReg().advance(4), s1), constant >> 32);
193 }
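/* A few illustrative examples of the selection above:
 *   0xffffff00 -> s_movk_i32 dst, 0xff00          (sign-extended 16-bit immediate)
 *   0xc0000000 -> s_brev_b32 dst, 3               (bit-reverse of an inline constant)
 *   0x000ffff0 -> s_bfm_b32  dst, 16, 4           (contiguous bitfield: size 16, start 4)
 *   0x00010001 -> s_pack_ll_b32_b16 dst, 1, 1     (GFX9+, both halves are inline constants)
 * Anything else falls back to s_mov_b32 with a literal; 64-bit constants try the analogous b64
 * forms before splitting into two 32-bit copies.
 */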
194
195 /* used by handle_operands() indirectly through Builder::copy */
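/* Each byte value v maps to the pair (int8_mul_table[2*v], int8_mul_table[2*v+1]): two factors
 * whose product equals v modulo 256 and which both fit into hardware inline constants after
 * sign-extension. copy_constant() below uses this to materialize an 8-bit constant with a single
 * SDWA v_mul_u32_u24 instead of a 32-bit literal.
 */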
196 uint8_t int8_mul_table[512] = {
197 0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9,
198 1, 10, 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19,
199 1, 20, 1, 21, 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29,
200 1, 30, 1, 31, 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39,
201 1, 40, 1, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49,
202 1, 50, 1, 51, 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59,
203 1, 60, 1, 61, 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23,
204 2, 35, 11, 53, 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243,
205 2, 40, 3, 27, 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23,
206 2, 45, 7, 13, 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33,
207 2, 50, 7, 51, 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27,
208 2, 55, 3, 37, 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17,
209 2, 60, 9, 241, 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43,
210 5, 26, 9, 43, 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243,
211 4, 35, 3, 47, 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45,
212 3, 50, 7, 241, 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53,
213 4, 40, 7, 23, 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13,
214 5, 34, 3, 57, 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245,
215 3, 60, 5, 241, 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63,
216 5, 38, 5, 243, 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35,
217 4, 242, 5, 245, 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19,
218 5, 42, 3, 241, 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43,
219 3, 244, 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247,
220 2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, 9, 55,
221 1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, 1, 248, 1, 249,
222 1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255};
223
224 aco_opcode
get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
226 {
/* Because some 16-bit instructions are already VOP3 on GFX10, we use the 32-bit opcodes (VOP2)
 * instead, which lets us avoid the temporary VGPR and use DPP with the arithmetic instructions.
 * This requires sign-extending the 16-bit inputs.
 */
231 switch (op) {
232 case iadd8:
233 case iadd16:
234 if (gfx_level >= GFX10) {
235 return aco_opcode::v_add_u32;
236 } else if (gfx_level >= GFX8) {
237 return aco_opcode::v_add_u16;
238 } else {
239 return aco_opcode::v_add_co_u32;
240 }
241 break;
242 case imul8:
243 case imul16:
244 if (gfx_level >= GFX10) {
245 return aco_opcode::v_mul_lo_u16_e64;
246 } else if (gfx_level >= GFX8) {
247 return aco_opcode::v_mul_lo_u16;
248 } else {
249 return aco_opcode::v_mul_u32_u24;
250 }
251 break;
252 case fadd16: return aco_opcode::v_add_f16;
253 case fmul16: return aco_opcode::v_mul_f16;
254 case imax8:
255 case imax16:
256 if (gfx_level >= GFX10) {
257 return aco_opcode::v_max_i32;
258 } else if (gfx_level >= GFX8) {
259 return aco_opcode::v_max_i16;
260 } else {
261 return aco_opcode::v_max_i32;
262 }
263 break;
264 case imin8:
265 case imin16:
266 if (gfx_level >= GFX10) {
267 return aco_opcode::v_min_i32;
268 } else if (gfx_level >= GFX8) {
269 return aco_opcode::v_min_i16;
270 } else {
271 return aco_opcode::v_min_i32;
272 }
273 break;
274 case umin8:
275 case umin16:
276 if (gfx_level >= GFX10) {
277 return aco_opcode::v_min_u32;
278 } else if (gfx_level >= GFX8) {
279 return aco_opcode::v_min_u16;
280 } else {
281 return aco_opcode::v_min_u32;
282 }
283 break;
284 case umax8:
285 case umax16:
286 if (gfx_level >= GFX10) {
287 return aco_opcode::v_max_u32;
288 } else if (gfx_level >= GFX8) {
289 return aco_opcode::v_max_u16;
290 } else {
291 return aco_opcode::v_max_u32;
292 }
293 break;
294 case fmin16: return aco_opcode::v_min_f16;
295 case fmax16: return aco_opcode::v_max_f16;
296 case iadd32: return gfx_level >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
297 case imul32: return aco_opcode::v_mul_lo_u32;
298 case fadd32: return aco_opcode::v_add_f32;
299 case fmul32: return aco_opcode::v_mul_f32;
300 case imax32: return aco_opcode::v_max_i32;
301 case imin32: return aco_opcode::v_min_i32;
302 case umin32: return aco_opcode::v_min_u32;
303 case umax32: return aco_opcode::v_max_u32;
304 case fmin32: return aco_opcode::v_min_f32;
305 case fmax32: return aco_opcode::v_max_f32;
306 case iand8:
307 case iand16:
308 case iand32: return aco_opcode::v_and_b32;
309 case ixor8:
310 case ixor16:
311 case ixor32: return aco_opcode::v_xor_b32;
312 case ior8:
313 case ior16:
314 case ior32: return aco_opcode::v_or_b32;
315 case iadd64: return aco_opcode::num_opcodes;
316 case imul64: return aco_opcode::num_opcodes;
317 case fadd64: return aco_opcode::v_add_f64_e64;
318 case fmul64: return aco_opcode::v_mul_f64_e64;
319 case imin64: return aco_opcode::num_opcodes;
320 case imax64: return aco_opcode::num_opcodes;
321 case umin64: return aco_opcode::num_opcodes;
322 case umax64: return aco_opcode::num_opcodes;
323 case fmin64: return aco_opcode::v_min_f64_e64;
324 case fmax64: return aco_opcode::v_max_f64_e64;
325 case iand64: return aco_opcode::num_opcodes;
326 case ior64: return aco_opcode::num_opcodes;
327 case ixor64: return aco_opcode::num_opcodes;
328 default: return aco_opcode::num_opcodes;
329 }
330 }
331
332 bool
is_vop3_reduce_opcode(aco_opcode opcode)
334 {
335 /* 64-bit reductions are VOP3. */
336 if (opcode == aco_opcode::num_opcodes)
337 return true;
338
339 return instr_info.format[(int)opcode] == Format::VOP3;
340 }
341
342 void
emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
344 {
345 Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
346 if (instr->definitions.size() >= 2) {
347 assert(instr->definitions[1].regClass() == bld.lm);
348 instr->definitions[1].setFixed(vcc);
349 }
350 }
351
352 void
emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
354 PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,
355 unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)
356 {
357 Builder bld(ctx->program, &ctx->instructions);
358 Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
359 Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};
360 Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};
361 Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
362 Operand src1_64 = Operand(src1_reg, v2);
363 Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};
364 Operand vtmp_op64 = Operand(vtmp_reg, v2);
365 if (op == iadd64) {
366 if (ctx->program->gfx_level >= GFX10) {
367 if (identity)
368 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
369 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
370 bound_ctrl);
371 bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
372 } else {
373 bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
374 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
375 }
376 bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
377 Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
378 } else if (op == iand64) {
379 bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
380 bound_ctrl);
381 bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
382 bound_ctrl);
383 } else if (op == ior64) {
384 bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
385 bound_ctrl);
386 bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
387 bound_ctrl);
388 } else if (op == ixor64) {
389 bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
390 bound_ctrl);
391 bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
392 bound_ctrl);
393 } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
394 aco_opcode cmp = aco_opcode::num_opcodes;
395 switch (op) {
396 case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
397 case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
398 case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
399 case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
400 default: break;
401 }
402
403 if (identity) {
404 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
405 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
406 }
407 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
408 bound_ctrl);
409 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,
410 bound_ctrl);
411
412 bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
413 bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
414 bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
415 } else if (op == imul64) {
416 /* t4 = dpp(x_hi)
417 * t1 = umul_lo(t4, y_lo)
418 * t3 = dpp(x_lo)
419 * t0 = umul_lo(t3, y_hi)
420 * t2 = iadd(t0, t1)
421 * t5 = umul_hi(t3, y_lo)
422 * res_hi = iadd(t2, t5)
423 * res_lo = umul_lo(t3, y_lo)
424 * Requires that res_hi != src0[0] and res_hi != src1[0]
425 * and that vtmp[0] != res_hi.
426 */
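/* This is the usual 64x64 -> 64-bit decomposition, with x = dpp(src0) and y = src1:
 *   res_lo = (x_lo * y_lo) mod 2^32
 *   res_hi = (umul_hi(x_lo, y_lo) + x_lo * y_hi + x_hi * y_lo) mod 2^32
 * The x_hi * y_hi term only contributes to bits [64:127] and is dropped.
 */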
427 if (identity)
428 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
429 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,
430 bound_ctrl);
431 bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
432 if (identity)
433 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
434 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
435 bound_ctrl);
436 bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
437 emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
438 if (identity)
439 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
440 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
441 bound_ctrl);
442 bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
443 emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
444 if (identity)
445 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
446 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
447 bound_ctrl);
448 bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
449 }
450 }
451
452 void
emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
454 ReduceOp op)
455 {
456 Builder bld(ctx->program, &ctx->instructions);
457 Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
458 RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
459 Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};
460 Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
461 Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
462 Operand src1_64 = Operand(src1_reg, v2);
463
464 if (src0_rc == s1 &&
465 (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
466 assert(vtmp.reg() != 0);
467 bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
468 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
469 src0_reg = vtmp;
470 src0[0] = Operand(vtmp, v1);
471 src0[1] = Operand(PhysReg{vtmp + 1}, v1);
472 src0_64 = Operand(vtmp, v2);
473 } else if (src0_rc == s1 && op == iadd64) {
474 assert(vtmp.reg() != 0);
475 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
476 src0[1] = Operand(PhysReg{vtmp + 1}, v1);
477 }
478
479 if (op == iadd64) {
480 if (ctx->program->gfx_level >= GFX10) {
481 bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
482 } else {
483 bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
484 }
485 bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
486 Operand(vcc, bld.lm));
487 } else if (op == iand64) {
488 bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
489 bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
490 } else if (op == ior64) {
491 bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
492 bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
493 } else if (op == ixor64) {
494 bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
495 bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
496 } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
497 aco_opcode cmp = aco_opcode::num_opcodes;
498 switch (op) {
499 case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
500 case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
501 case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
502 case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
503 default: break;
504 }
505
506 bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
507 bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
508 bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
509 } else if (op == imul64) {
510 if (src1_reg == dst_reg) {
511 /* it's fine if src0==dst but not if src1==dst */
512 std::swap(src0_reg, src1_reg);
513 std::swap(src0[0], src1[0]);
514 std::swap(src0[1], src1[1]);
515 std::swap(src0_64, src1_64);
516 }
517 assert(!(src0_reg == src1_reg));
518 /* t1 = umul_lo(x_hi, y_lo)
519 * t0 = umul_lo(x_lo, y_hi)
520 * t2 = iadd(t0, t1)
521 * t5 = umul_hi(x_lo, y_lo)
522 * res_hi = iadd(t2, t5)
523 * res_lo = umul_lo(x_lo, y_lo)
524 * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
525 */
526 Definition tmp0_def(PhysReg{src0_reg + 1}, v1);
527 Definition tmp1_def(PhysReg{src1_reg + 1}, v1);
528 Operand tmp0_op = src0[1];
529 Operand tmp1_op = src1[1];
530 bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
531 bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
532 emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
533 bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
534 emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
535 bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
536 }
537 }
538
539 void
emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
541 ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,
542 bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */
543 {
544 Builder bld(ctx->program, &ctx->instructions);
545 RegClass rc = RegClass(RegType::vgpr, size);
546 Definition dst(dst_reg, rc);
547 Operand src0(src0_reg, rc);
548 Operand src1(src1_reg, rc);
549
550 aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
551 bool vop3 = is_vop3_reduce_opcode(opcode);
552
553 if (!vop3) {
554 if (opcode == aco_opcode::v_add_co_u32)
555 bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask,
556 bound_ctrl);
557 else
558 bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
559 return;
560 }
561
562 if (opcode == aco_opcode::num_opcodes) {
563 emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,
564 bound_ctrl, identity);
565 return;
566 }
567
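/* The VOP3 reduce opcodes are not emitted with a DPP modifier here; instead, src0 is moved
 * through vtmp with a DPP v_mov_b32 and the reduction is then done as a plain VOP3 op. */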
568 if (identity)
569 bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
570 if (identity && size >= 2)
571 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);
572
573 for (unsigned i = 0; i < size; i++)
574 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
575 Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
576
577 bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
578 }
579
580 void
emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
582 ReduceOp op, unsigned size)
583 {
584 Builder bld(ctx->program, &ctx->instructions);
585 RegClass rc = RegClass(RegType::vgpr, size);
586 Definition dst(dst_reg, rc);
587 Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
588 Operand src1(src1_reg, rc);
589
590 aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
591 bool vop3 = is_vop3_reduce_opcode(opcode);
592
593 if (opcode == aco_opcode::num_opcodes) {
594 emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
595 return;
596 }
597
598 if (vop3) {
599 bld.vop3(opcode, dst, src0, src1);
600 } else if (opcode == aco_opcode::v_add_co_u32) {
601 bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
602 } else {
603 bld.vop2(opcode, dst, src0, src1);
604 }
605 }
606
607 void
emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,
609 unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
610 {
611 Builder bld(ctx->program, &ctx->instructions);
612 for (unsigned i = 0; i < size; i++) {
613 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),
614 Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
615 }
616 }
617
618 void
emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
620 {
621 for (unsigned i = 0; i < size; i++) {
622 bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),
623 Operand(PhysReg{src + i}, v1), ds_pattern);
624 }
625 }
626
627 void
emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,
629 PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
630 {
631 assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
632 assert(cluster_size <= ctx->program->wave_size);
633
634 Builder bld(ctx->program, &ctx->instructions);
635
636 Operand identity[2];
637 identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));
638 identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));
639 Operand vcndmask_identity[2] = {identity[0], identity[1]};
640
641 /* First, copy the source to tmp and set inactive lanes to the identity */
642 bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),
643 Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));
644
645 /* On GFX10+ v_writelane_b32/v_cndmask_b32_e64 can take a literal */
646 if (ctx->program->gfx_level < GFX10) {
647 for (unsigned i = 0; i < src.size(); i++) {
648 /* p_exclusive_scan uses identity for v_writelane_b32 */
649 if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
650 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);
651 identity[i] = Operand(PhysReg{sitmp + i}, s1);
652
653 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
654 vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
655 } else if (identity[i].isLiteral()) {
656 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
657 vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
658 }
659 }
660 }
661
662 for (unsigned i = 0; i < src.size(); i++) {
663 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
664 vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
665 Operand(stmp, bld.lm));
666 }
667
668 if (reduce_op == iadd8 || reduce_op == imul8 || reduce_op == imax8 || reduce_op == imin8 ||
669 reduce_op == umin8 || reduce_op == umax8 || reduce_op == ixor8 || reduce_op == ior8 ||
670 reduce_op == iand8) {
671 if (ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11) {
672 aco_ptr<Instruction> sdwa{
673 create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
674 sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
675 sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
676 bool sext = reduce_op == imin8 || reduce_op == imax8;
677 sdwa->sdwa().sel[0] = SubdwordSel(1, 0, sext);
678 sdwa->sdwa().dst_sel = SubdwordSel::dword;
679 bld.insert(std::move(sdwa));
680 } else {
681 aco_opcode opcode;
682
683 if (reduce_op == imin8 || reduce_op == imax8)
684 opcode = aco_opcode::v_bfe_i32;
685 else
686 opcode = aco_opcode::v_bfe_u32;
687
688 bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
689 Operand::c32(8u));
690 }
691 } else if (reduce_op == iadd16 || reduce_op == imul16 || reduce_op == imax16 ||
692 reduce_op == imin16 || reduce_op == umin16 || reduce_op == umax16 ||
693 reduce_op == ixor16 || reduce_op == ior16 || reduce_op == iand16 ||
694 reduce_op == fadd16 || reduce_op == fmul16 || reduce_op == fmin16 ||
695 reduce_op == fmax16) {
696 bool is_add_cmp = reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
697 reduce_op == umin16 || reduce_op == umax16;
698 if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level < GFX11 && is_add_cmp) {
699 aco_ptr<Instruction> sdwa{
700 create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
701 sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
702 sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
703 bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
704 sdwa->sdwa().sel[0] = SubdwordSel(2, 0, sext);
705 sdwa->sdwa().dst_sel = SubdwordSel::dword;
706 bld.insert(std::move(sdwa));
707 } else if (ctx->program->gfx_level <= GFX7 ||
708 (ctx->program->gfx_level >= GFX11 && is_add_cmp)) {
709 aco_opcode opcode;
710
711 if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
712 opcode = aco_opcode::v_bfe_i32;
713 else
714 opcode = aco_opcode::v_bfe_u32;
715
716 bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
717 Operand::c32(16u));
718 }
719 }
720
721 bool reduction_needs_last_op = false;
722 switch (op) {
723 case aco_opcode::p_reduce:
724 if (cluster_size == 1)
725 break;
726
727 if (ctx->program->gfx_level <= GFX7) {
728 reduction_needs_last_op = true;
729 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
730 if (cluster_size == 2)
731 break;
732 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
733 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
734 if (cluster_size == 4)
735 break;
736 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
737 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
738 if (cluster_size == 8)
739 break;
740 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
741 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
742 if (cluster_size == 16)
743 break;
744 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
745 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
746 if (cluster_size == 32)
747 break;
748 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
749 for (unsigned i = 0; i < src.size(); i++)
750 bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
751 Operand::zero());
752 // TODO: it would be more effective to do the last reduction step on SALU
753 emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
754 reduction_needs_last_op = false;
755 break;
756 }
757
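/* GFX8+: reduce with a ladder of DPP ops. Each step combines a lane with its partner at
 * distance 1, 2, 4 and 8 within a row (quad swaps, half-row mirror, row mirror), doubling the
 * reduced cluster size every step; cluster sizes above 16 then need data exchange across rows
 * (row_bcast15/31, or v_permlanex16_b32 on GFX10+). */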
758 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,
759 0xf, false);
760 if (cluster_size == 2)
761 break;
762 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,
763 0xf, false);
764 if (cluster_size == 4)
765 break;
766 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,
767 false);
768 if (cluster_size == 8)
769 break;
770 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
771 if (cluster_size == 16)
772 break;
773
774 if (ctx->program->gfx_level >= GFX10) {
775 /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
776 for (unsigned i = 0; i < src.size(); i++)
777 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
778 Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());
779
780 if (cluster_size == 32) {
781 reduction_needs_last_op = true;
782 break;
783 }
784
785 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
786 for (unsigned i = 0; i < src.size(); i++)
787 bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
788 Operand::zero());
789 // TODO: it would be more effective to do the last reduction step on SALU
790 emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
791 break;
792 }
793
794 if (cluster_size == 32) {
795 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
796 reduction_needs_last_op = true;
797 break;
798 }
799 assert(cluster_size == 64);
800 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
801 false);
802 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
803 false);
804 break;
805 case aco_opcode::p_exclusive_scan:
806 if (ctx->program->gfx_level >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
807 /* shift rows right */
808 emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);
809
810 /* fill in the gaps in rows 1 and 3 */
811 copy_constant_sgpr(bld, Definition(exec, bld.lm), 0x0001'0000'0001'0000ull);
812 for (unsigned i = 0; i < src.size(); i++) {
813 Instruction* perm =
814 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
815 Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
816 Operand::c32(0xffffffffu))
817 .instr;
818 perm->valu().opsel = 1; /* FI (Fetch Inactive) */
819 }
820 copy_constant_sgpr(bld, Definition(exec, bld.lm), UINT64_MAX);
821
822 if (ctx->program->wave_size == 64) {
823 /* fill in the gap in row 2 */
824 for (unsigned i = 0; i < src.size(); i++) {
825 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
826 Operand::c32(31u));
827 bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
828 Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
829 }
830 }
831 std::swap(tmp, vtmp);
832 } else if (ctx->program->gfx_level >= GFX8) {
833 emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
834 } else {
835 // TODO: use LDS on CS with a single write and shifted read
836 /* wavefront shift_right by 1 on SI/CI */
837 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
838 emit_ds_swizzle(bld, tmp, tmp, src.size(),
839 ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
840 copy_constant_sgpr(bld, Definition(exec, s2), 0x1010'1010'1010'1010ull);
841 for (unsigned i = 0; i < src.size(); i++)
842 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
843 Operand(PhysReg{tmp + i}, v1));
844
845 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
846 emit_ds_swizzle(bld, tmp, tmp, src.size(),
847 ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
848 copy_constant_sgpr(bld, Definition(exec, s2), 0x0100'0100'0100'0100ull);
849 for (unsigned i = 0; i < src.size(); i++)
850 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
851 Operand(PhysReg{tmp + i}, v1));
852
853 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
854 emit_ds_swizzle(bld, tmp, tmp, src.size(),
855 ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
856 copy_constant_sgpr(bld, Definition(exec, s2), 0x0001'0000'0001'0000ull);
857 for (unsigned i = 0; i < src.size(); i++)
858 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
859 Operand(PhysReg{tmp + i}, v1));
860
861 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
862 for (unsigned i = 0; i < src.size(); i++) {
863 bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),
864 Operand(PhysReg{vtmp + i}, v1));
865 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
866 Operand::zero());
867 bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
868 Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
869 identity[i] = Operand::zero(); /* prevent further uses of identity */
870 }
871 std::swap(tmp, vtmp);
872 }
873
874 for (unsigned i = 0; i < src.size(); i++) {
875 if (!identity[i].isConstant() ||
876 identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
877 if (ctx->program->gfx_level < GFX10)
878 assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
879 identity[i].physReg() == PhysReg{sitmp + i});
880 bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),
881 Operand(PhysReg{tmp + i}, v1));
882 }
883 }
884 FALLTHROUGH;
885 case aco_opcode::p_inclusive_scan:
886 assert(cluster_size == ctx->program->wave_size);
887 if (ctx->program->gfx_level <= GFX7) {
888 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
889 copy_constant_sgpr(bld, Definition(exec, s2), 0xaaaa'aaaa'aaaa'aaaaull);
890 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
891
892 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
893 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
894 copy_constant_sgpr(bld, Definition(exec, s2), 0xcccc'cccc'cccc'ccccull);
895 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
896
897 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
898 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
899 copy_constant_sgpr(bld, Definition(exec, s2), 0xf0f0'f0f0'f0f0'f0f0ull);
900 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
901
902 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
903 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
904 copy_constant_sgpr(bld, Definition(exec, s2), 0xff00'ff00'ff00'ff00ull);
905 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
906
907 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
908 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
909 copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'0000'ffff'0000ull);
910 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
911
912 for (unsigned i = 0; i < src.size(); i++)
913 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
914 Operand::c32(31u));
915 copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
916 emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
917 break;
918 }
919
920 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,
921 identity);
922 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,
923 identity);
924 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,
925 identity);
926 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,
927 identity);
928 if (ctx->program->gfx_level >= GFX10) {
929 copy_constant_sgpr(bld, Definition(exec, bld.lm), 0xffff'0000'ffff'0000ull);
930 for (unsigned i = 0; i < src.size(); i++) {
931 Instruction* perm =
932 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
933 Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
934 Operand::c32(0xffffffffu))
935 .instr;
936 perm->valu().opsel = 1; /* FI (Fetch Inactive) */
937 }
938 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
939
940 if (ctx->program->wave_size == 64) {
941 copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
942 for (unsigned i = 0; i < src.size(); i++)
943 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
944 Operand::c32(31u));
945 emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
946 }
947 } else {
948 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
949 false, identity);
950 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
951 false, identity);
952 }
953 break;
954 default: unreachable("Invalid reduction mode");
955 }
956
957 if (op == aco_opcode::p_reduce) {
958 if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
959 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
960 emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
961 return;
962 }
963
964 if (reduction_needs_last_op)
965 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
966 }
967
968 /* restore exec */
969 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
970
971 if (dst.regClass().type() == RegType::sgpr) {
972 for (unsigned k = 0; k < src.size(); k++) {
973 bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),
974 Operand::c32(ctx->program->wave_size - 1));
975 }
976 } else if (dst.physReg() != tmp) {
977 for (unsigned k = 0; k < src.size(); k++) {
978 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
979 Operand(PhysReg{tmp + k}, v1));
980 }
981 }
982 }
983
984 void
adjust_bpermute_dst(Builder& bld, Definition dst, Operand input_data)
986 {
/* RA assumes that the result is always in the low part of the register, so we have to shift
 * if it's not there already.
 */
990 if (input_data.physReg().byte()) {
991 unsigned right_shift = input_data.physReg().byte() * 8;
992 bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
993 Operand(dst.physReg(), dst.regClass()));
994 }
995 }
996
997 void
emit_bpermute_permlane(Builder& bld, aco_ptr<Instruction>& instr)
999 {
1000 /* Emulates proper bpermute on GFX11 in wave64 mode.
1001 *
* Similar to emit_bpermute_shared_vgpr (the GFX10 wave64 path), but uses the new
* v_permlane64_b32 instruction to swap data between the lo and hi halves.
1004 */
1005
1006 assert(bld.program->gfx_level >= GFX11);
1007 assert(bld.program->wave_size == 64);
1008
1009 Definition dst = instr->definitions[0];
1010 Definition tmp_exec = instr->definitions[1];
1011 Definition clobber_scc = instr->definitions[2];
1012 Operand tmp_op = instr->operands[0];
1013 Operand index_x4 = instr->operands[1];
1014 Operand input_data = instr->operands[2];
1015 Operand same_half = instr->operands[3];
1016
1017 assert(dst.regClass() == v1);
1018 assert(tmp_exec.regClass() == bld.lm);
1019 assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
1020 assert(same_half.regClass() == bld.lm);
1021 assert(tmp_op.regClass() == v1.as_linear());
1022 assert(index_x4.regClass() == v1);
1023 assert(input_data.regClass().type() == RegType::vgpr);
1024 assert(input_data.bytes() <= 4);
1025
1026 Definition tmp_def(tmp_op.physReg(), tmp_op.regClass());
1027
1028 /* Permute the input within the same half-wave. */
1029 bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
1030
1031 /* Save EXEC and enable all lanes. */
1032 bld.sop1(aco_opcode::s_or_saveexec_b64, tmp_exec, clobber_scc, Definition(exec, s2),
1033 Operand::c32(-1u), Operand(exec, s2));
1034
1035 /* Copy input data from other half to current half's linear VGPR. */
1036 bld.vop1(aco_opcode::v_permlane64_b32, tmp_def, input_data);
1037
1038 /* Permute the input from the other half-wave, write to linear VGPR. */
1039 bld.ds(aco_opcode::ds_bpermute_b32, tmp_def, index_x4, tmp_op);
1040
1041 /* Restore saved EXEC. */
1042 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
1043
1044 /* Select correct permute result. */
1045 bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, tmp_op, Operand(dst.physReg(), dst.regClass()),
1046 same_half);
1047
1048 adjust_bpermute_dst(bld, dst, input_data);
1049 }
1050
1051 void
emit_bpermute_shared_vgpr(Builder& bld, aco_ptr<Instruction>& instr)
1053 {
1054 /* Emulates proper bpermute on GFX10 in wave64 mode.
1055 *
1056 * This is necessary because on GFX10 the bpermute instruction only works
1057 * on half waves (you can think of it as having a cluster size of 32), so we
1058 * manually swap the data between the two halves using two shared VGPRs.
1059 */
1060
1061 assert(bld.program->gfx_level >= GFX10 && bld.program->gfx_level <= GFX10_3);
1062 assert(bld.program->wave_size == 64);
1063
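/* The shared VGPRs start right after the wave's private VGPRs; adding 256 converts the VGPR
 * index into ACO's flat PhysReg numbering, in which VGPRs occupy registers 256-511. */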
1064 unsigned shared_vgpr_reg_0 = align(bld.program->config->num_vgprs, 4) + 256;
1065 Definition dst = instr->definitions[0];
1066 Definition tmp_exec = instr->definitions[1];
1067 Definition clobber_scc = instr->definitions[2];
1068 Operand index_x4 = instr->operands[0];
1069 Operand input_data = instr->operands[1];
1070 Operand same_half = instr->operands[2];
1071
1072 assert(dst.regClass() == v1);
1073 assert(tmp_exec.regClass() == bld.lm);
1074 assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
1075 assert(same_half.regClass() == bld.lm);
1076 assert(index_x4.regClass() == v1);
1077 assert(input_data.regClass().type() == RegType::vgpr);
1078 assert(input_data.bytes() <= 4);
1079 assert(dst.physReg() != index_x4.physReg());
1080 assert(dst.physReg() != input_data.physReg());
1081 assert(tmp_exec.physReg() != same_half.physReg());
1082
1083 PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
1084 PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);
1085
1086 /* Permute the input within the same half-wave */
1087 bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
1088
1089 /* HI: Copy data from high lanes 32-63 to shared vgpr */
1090 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,
1091 dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
1092 /* Save EXEC */
1093 bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
1094 /* Set EXEC to enable LO lanes only */
1095 copy_constant_sgpr(bld, Definition(exec, s2), 0x0000'0000'ffff'ffffull);
1096 /* LO: Copy data from low lanes 0-31 to shared vgpr */
1097 bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
1098 /* LO: bpermute shared vgpr (high lanes' data) */
1099 bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,
1100 Operand(shared_vgpr_hi, v1));
1101 /* Set EXEC to enable HI lanes only */
1102 copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
1103 /* HI: bpermute shared vgpr (low lanes' data) */
1104 bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,
1105 Operand(shared_vgpr_lo, v1));
1106
1107 /* Only enable lanes which use the other half's data */
1108 bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,
1109 Operand(tmp_exec.physReg(), s2), same_half);
1110 /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
1111 bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),
1112 0x3, 0xf, false);
1113 /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
1114 bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),
1115 0xc, 0xf, false);
1116
1117 /* Restore saved EXEC */
1118 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
1119
1120 adjust_bpermute_dst(bld, dst, input_data);
1121 }
1122
1123 void
emit_bpermute_readlane(Builder& bld, aco_ptr<Instruction>& instr)
1125 {
1126 /* Emulates bpermute using readlane instructions */
1127
1128 Operand index = instr->operands[0];
1129 Operand input = instr->operands[1];
1130 Definition dst = instr->definitions[0];
1131 Definition temp_exec = instr->definitions[1];
1132 Definition clobber_vcc = instr->definitions[2];
1133
1134 assert(dst.regClass() == v1);
1135 assert(temp_exec.regClass() == bld.lm);
1136 assert(clobber_vcc.regClass() == bld.lm);
1137 assert(clobber_vcc.physReg() == vcc);
1138 assert(index.regClass() == v1);
1139 assert(index.physReg() != dst.physReg());
1140 assert(input.regClass().type() == RegType::vgpr);
1141 assert(input.bytes() <= 4);
1142 assert(input.physReg() != dst.physReg());
1143
1144 /* Save original EXEC */
1145 bld.sop1(Builder::s_mov, temp_exec, Operand(exec, bld.lm));
1146
/* An "unrolled loop" that is executed for each lane.
 * This takes only a few instructions per lane, as opposed to a "real" loop with branching,
 * where the branch instruction alone would take 16+ cycles.
 */
1151 for (unsigned n = 0; n < bld.program->wave_size; ++n) {
1152 /* Activate the lane which has N for its source index */
1153 if (bld.program->gfx_level >= GFX10)
1154 bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), Operand::c32(n), index);
1155 else
1156 bld.vopc(aco_opcode::v_cmpx_eq_u32, clobber_vcc, Definition(exec, bld.lm), Operand::c32(n),
1157 index);
1158 /* Read the data from lane N */
1159 bld.readlane(Definition(vcc, s1), input, Operand::c32(n));
1160 /* On the active lane, move the data we read from lane N to the destination VGPR */
1161 bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
1162 /* Restore original EXEC */
1163 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(temp_exec.physReg(), bld.lm));
1164 }
1165
1166 adjust_bpermute_dst(bld, dst, input);
1167 }
1168
1169 struct copy_operation {
1170 Operand op;
1171 Definition def;
1172 unsigned bytes;
1173 union {
1174 uint8_t uses[8];
1175 uint64_t is_used = 0;
1176 };
1177 };
1178
1179 void
split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
1181 const copy_operation& src, bool ignore_uses, unsigned max_size)
1182 {
1183 PhysReg def_reg = src.def.physReg();
1184 PhysReg op_reg = src.op.physReg();
1185 def_reg.reg_b += offset;
1186 op_reg.reg_b += offset;
1187
1188 /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10, and on GFX11
1189 * v_lshrrev_b64 doesn't get dual issued. */
1190 if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) &&
1191 src.def.regClass().type() == RegType::vgpr)
1192 max_size = MIN2(max_size, 4);
1193 unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
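/* For instance, an 8-byte VGPR copy is split into two dword copies here on GFX9 (and again on
 * GFX11+), while on GFX10 it can stay a single 64-bit copy done with v_lshrrev_b64. */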
1194
1195 /* make sure the size is a power of two and reg % bytes == 0 */
1196 unsigned bytes = 1;
1197 for (; bytes <= max_size; bytes *= 2) {
1198 unsigned next = bytes * 2u;
1199 bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
1200 offset + next <= src.bytes && next <= max_size;
1201 if (!src.op.isConstant() && can_increase)
1202 can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
1203 for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
1204 can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
1205 if (!can_increase)
1206 break;
1207 }
1208
1209 *def = Definition(src.def.tempId(), def_reg, src.def.regClass().resize(bytes));
1210 if (src.op.isConstant()) {
1211 assert(bytes >= 1 && bytes <= 8);
1212 uint64_t val = src.op.constantValue64() >> (offset * 8u);
1213 *op = Operand::get_const(ctx->program->gfx_level, val, bytes);
1214 } else {
1215 RegClass op_cls = src.op.regClass().resize(bytes);
1216 *op = Operand(op_reg, op_cls);
1217 op->setTemp(Temp(src.op.tempId(), op_cls));
1218 }
1219 }
1220
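/* Returns a mask with one bit per byte of region a, set for the bytes of a that overlap region b.
 * For example, a_start=0, a_size=4, b_start=2, b_size=4 yields 0b1100: only the two highest
 * bytes of a are covered by b. */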
1221 uint32_t
get_intersection_mask(int a_start, int a_size, int b_start, int b_size)
1223 {
1224 int intersection_start = MAX2(b_start - a_start, 0);
1225 int intersection_end = MAX2(b_start + b_size - a_start, 0);
1226 if (intersection_start >= a_size || intersection_end == 0)
1227 return 0;
1228
1229 uint32_t mask = u_bit_consecutive(0, a_size);
1230 return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
1231 }
1232
/* src1 provides bytes 0-3; dst/src0 provide bytes 4-7. */
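/* For example, swiz = {4, 5, 1, 7} keeps bytes 0, 1 and 3 of the destination (taken from src0,
 * which defaults to the old dst value) and overwrites byte 2 with byte 1 of src1. */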
1234 void
create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1,
1236 Operand src0 = Operand(v1))
1237 {
1238 uint32_t swiz_packed =
1239 swiz[0] | ((uint32_t)swiz[1] << 8) | ((uint32_t)swiz[2] << 16) | ((uint32_t)swiz[3] << 24);
1240
1241 dst = Definition(PhysReg(dst.physReg().reg()), v1);
1242 if (!src1.isConstant())
1243 src1 = Operand(PhysReg(src1.physReg().reg()), v1);
1244 if (src0.isUndefined())
1245 src0 = Operand(dst.physReg(), v1);
1246 else if (!src0.isConstant())
1247 src0 = Operand(PhysReg(src0.physReg().reg()), v1);
1248 bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed));
1249 }
1250
1251 void
emit_v_mov_b16(Builder& bld, Definition dst, Operand op)
1253 {
/* v_mov_b16 uses 32-bit inline constants. */
1255 if (op.isConstant()) {
1256 if (!op.isLiteral() && op.physReg() >= 240) {
1257 /* v_add_f16 is smaller because it can use 16bit fp inline constants. */
1258 Instruction* instr = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero());
1259 instr->valu().opsel[3] = dst.physReg().byte() == 2;
1260 return;
1261 }
1262 op = Operand::c32((int32_t)(int16_t)op.constantValue());
1263 }
1264
1265 Instruction* instr = bld.vop1(aco_opcode::v_mov_b16, dst, op);
1266 instr->valu().opsel[0] = op.physReg().byte() == 2;
1267 instr->valu().opsel[3] = dst.physReg().byte() == 2;
1268 }
1269
1270 void
copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
1272 {
1273 assert(op.bytes() == dst.bytes());
1274
1275 if (dst.regClass().type() == RegType::sgpr)
1276 return copy_constant_sgpr(bld, dst, op.constantValue64());
1277
1278 bool dual_issue_mov = ctx->program->gfx_level >= GFX11 && ctx->program->wave_size == 64 &&
1279 ctx->program->workgroup_size > 32;
1280 if (dst.bytes() == 4 && op.isLiteral() && !dual_issue_mov) {
1281 uint32_t imm = op.constantValue();
1282 Operand rev_op = Operand::get_const(ctx->program->gfx_level, util_bitreverse(imm), 4);
1283 if (!rev_op.isLiteral()) {
1284 bld.vop1(aco_opcode::v_bfrev_b32, dst, rev_op);
1285 return;
1286 }
1287 }
1288
1289 if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->gfx_level >= GFX8)
1290 op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
1291
1292 if (dst.regClass() == v2) {
1293 if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
1294 bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);
1295 } else {
1296 assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
1297 bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);
1298 }
1299 } else if (dst.regClass() == v1) {
1300 bld.vop1(aco_opcode::v_mov_b32, dst, op);
1301 } else {
1302 assert(dst.regClass() == v1b || dst.regClass() == v2b);
1303
1304 bool use_sdwa = ctx->program->gfx_level >= GFX9 && ctx->program->gfx_level < GFX11;
1305 if (dst.regClass() == v1b && use_sdwa) {
1306 uint8_t val = op.constantValue();
1307 Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
1308 if (op32.isLiteral()) {
1309 uint32_t a = (uint32_t)int8_mul_table[val * 2];
1310 uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
1311 bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
1312 Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
1313 Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
1314 } else {
1315 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
1316 }
1317 } else if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX10) {
1318 Operand fop = Operand::c32(fui(float(op.constantValue())));
1319 Operand offset = Operand::c32(dst.physReg().byte());
1320 Operand def_op(PhysReg(dst.physReg().reg()), v1);
1321 bld.vop3(aco_opcode::v_cvt_pk_u8_f32, dst, fop, offset, def_op);
1322 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1323 emit_v_mov_b16(bld, dst, op);
1324 } else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) {
1325 if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
1326 /* use v_mov_b32 to avoid possible issues with denormal flushing or
1327 * NaN. v_add_f16 is still needed for float constants. */
1328 uint32_t val32 = (int32_t)(int16_t)op.constantValue();
1329 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));
1330 } else {
1331 bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());
1332 }
1333 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1334 op = Operand::c32(op.constantValue());
1335 Instruction* instr = bld.vop3(aco_opcode::v_add_u16_e64, dst, op, Operand::c32(0));
1336 instr->valu().opsel[3] = dst.physReg().byte() == 2;
1337 } else {
1338 uint32_t offset = dst.physReg().byte() * 8u;
1339 uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;
1340 uint32_t val = (op.constantValue() << offset) & mask;
1341 dst = Definition(PhysReg(dst.physReg().reg()), v1);
1342 Operand def_op(dst.physReg(), v1);
1343 if (val != mask)
1344 bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);
1345 if (val != 0)
1346 bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);
1347 }
1348 }
1349 }
1350
1351 bool
1352 do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
1353 PhysReg scratch_sgpr)
1354 {
1355 bool did_copy = false;
1356 for (unsigned offset = 0; offset < copy.bytes;) {
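/* Bytes whose destination is still read by another pending copy are skipped here;
 * they get copied later once that reader is done, or are resolved by a swap. */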
1357 if (copy.uses[offset]) {
1358 offset++;
1359 continue;
1360 }
1361
1362 Definition def;
1363 Operand op;
1364 split_copy(ctx, offset, &def, &op, copy, false, 8);
1365
1366 if (def.physReg() == scc) {
1367 bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());
1368 *preserve_scc = true;
1369 } else if (op.isConstant()) {
1370 copy_constant(ctx, bld, def, op);
1371 } else if (def.regClass() == v1) {
1372 bld.vop1(aco_opcode::v_mov_b32, def, op);
1373 } else if (def.regClass() == v2) {
1374 bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
1375 } else if (def.regClass() == s1) {
1376 bld.sop1(aco_opcode::s_mov_b32, def, op);
1377 } else if (def.regClass() == s2) {
1378 bld.sop1(aco_opcode::s_mov_b64, def, op);
1379 } else if (def.regClass() == v1b && ctx->program->gfx_level >= GFX11) {
1380 uint8_t swiz[] = {4, 5, 6, 7};
1381 swiz[def.physReg().byte()] = op.physReg().byte();
1382 create_bperm(bld, swiz, def, op);
1383 } else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1384 emit_v_mov_b16(bld, def, op);
1385 } else if (def.regClass().is_subdword()) {
1386 bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
1387 } else {
1388 unreachable("unsupported copy");
1389 }
1390
1391 did_copy = true;
1392 offset += def.bytes();
1393 }
1394 return did_copy;
1395 }
1396
1397 void
1398 swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
1399 {
1400 if (def.physReg().reg() == op.physReg().reg()) {
1401 assert(def.bytes() != 2); /* handled by caller */
1402 uint8_t swiz[] = {4, 5, 6, 7};
1403 std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]);
1404 create_bperm(bld, swiz, def, Operand::zero());
1405 return;
1406 }
1407
1408 if (def.bytes() == 2) {
1409 Operand def_as_op = Operand(def.physReg(), def.regClass());
1410 Definition op_as_def = Definition(op.physReg(), op.regClass());
1411 /* v_swap_b16 is not officially supported as VOP3, so it can't be used with v128-255.
1412 * Tests show that VOP3 appears to work correctly, but according to AMD that should
1413 * not be relied on.
1414 */
1415 if (def.physReg() < (256 + 128) && op.physReg() < (256 + 128)) {
1416 Instruction* instr = bld.vop1(aco_opcode::v_swap_b16, def, op_as_def, op, def_as_op);
1417 instr->valu().opsel[0] = op.physReg().byte();
1418 instr->valu().opsel[3] = def.physReg().byte();
1419 } else {
1420 Instruction* instr = bld.vop3(aco_opcode::v_xor_b16, def, op, def_as_op);
1421 instr->valu().opsel[0] = op.physReg().byte();
1422 instr->valu().opsel[1] = def_as_op.physReg().byte();
1423 instr->valu().opsel[3] = def.physReg().byte();
1424 instr = bld.vop3(aco_opcode::v_xor_b16, op_as_def, op, def_as_op);
1425 instr->valu().opsel[0] = op.physReg().byte();
1426 instr->valu().opsel[1] = def_as_op.physReg().byte();
1427 instr->valu().opsel[3] = op_as_def.physReg().byte();
1428 instr = bld.vop3(aco_opcode::v_xor_b16, def, op, def_as_op);
1429 instr->valu().opsel[0] = op.physReg().byte();
1430 instr->valu().opsel[1] = def_as_op.physReg().byte();
1431 instr->valu().opsel[3] = def.physReg().byte();
1432 }
1433 } else {
1434 PhysReg op_half = op.physReg();
1435 op_half.reg_b &= ~1;
1436
1437 PhysReg def_other_half = def.physReg();
1438 def_other_half.reg_b &= ~1;
1439 def_other_half.reg_b ^= 2;
1440
1441 /* We can only swap individual bytes within a single VGPR, so temporarily move both bytes
1442 * into the same VGPR.
1443 */
1444 swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1445 swap_subdword_gfx11(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b));
1446 swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1447 }
1448 }
1449
1450 void
1451 do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,
1452 Pseudo_instruction* pi)
1453 {
1454 unsigned offset = 0;
1455
1456 if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
1457 (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
1458 /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte
1459 * swap */
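/* E.g. when the copy covers bytes 0-2, the dword swap also swapped byte 3, so swap
 * byte 3 back; when it covers bytes 1-3, byte 0 is the one that gets swapped back. */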
1460 PhysReg op = copy.op.physReg();
1461 PhysReg def = copy.def.physReg();
1462 op.reg_b &= ~0x3;
1463 def.reg_b &= ~0x3;
1464
1465 copy_operation tmp;
1466 tmp.op = Operand(op, v1);
1467 tmp.def = Definition(def, v1);
1468 tmp.bytes = 4;
1469 memset(tmp.uses, 1, 4);
1470 do_swap(ctx, bld, tmp, preserve_scc, pi);
1471
1472 op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1473 def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1474 tmp.op = Operand(op, v1b);
1475 tmp.def = Definition(def, v1b);
1476 tmp.bytes = 1;
1477 tmp.uses[0] = 1;
1478 do_swap(ctx, bld, tmp, preserve_scc, pi);
1479
1480 offset = copy.bytes;
1481 }
1482
1483 for (; offset < copy.bytes;) {
1484 Definition def;
1485 Operand op;
1486 unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
1487 split_copy(ctx, offset, &def, &op, copy, true, max_size);
1488
1489 assert(op.regClass() == def.regClass());
1490 Operand def_as_op = Operand(def.physReg(), def.regClass());
1491 Definition op_as_def = Definition(op.physReg(), op.regClass());
1492 if (ctx->program->gfx_level >= GFX9 && def.regClass() == v1) {
1493 bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1494 } else if (def.regClass() == v1) {
1495 assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
1496 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1497 bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1498 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1499 } else if (op.physReg() == scc || def.physReg() == scc) {
1500 /* we need to swap scc and another sgpr */
1501 assert(!preserve_scc);
1502
1503 PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
1504
1505 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1506 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),
1507 Operand::zero());
1508 bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
1509 } else if (def.regClass() == s1) {
1510 if (preserve_scc) {
1511 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
1512 bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
1513 bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
1514 } else {
1515 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1516 bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
1517 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1518 }
1519 } else if (def.regClass() == s2) {
1520 if (preserve_scc)
1521 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1522 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1523 bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
1524 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1525 if (preserve_scc)
1526 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
1527 Operand::zero());
1528 } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
1529 bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
1530 Operand::c32(2u));
1531 } else {
1532 assert(def.regClass().is_subdword());
1533 if (ctx->program->gfx_level >= GFX11) {
1534 swap_subdword_gfx11(bld, def, op);
1535 } else {
1536 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1537 bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
1538 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1539 }
1540 }
1541
1542 offset += def.bytes();
1543 }
1544
1545 if (ctx->program->gfx_level <= GFX7)
1546 return;
1547
1548 /* fixup in case we swapped bytes we shouldn't have */
1549 copy_operation tmp_copy = copy;
1550 tmp_copy.op.setFixed(copy.def.physReg());
1551 tmp_copy.def.setFixed(copy.op.physReg());
1552 do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
1553 }
1554
1555 void
1556 do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
1557 {
1558 assert(ctx->program->gfx_level >= GFX8);
1559
1560 if (lo.isConstant() && hi.isConstant()) {
1561 copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
1562 return;
1563 }
1564
1565 bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
1566 (ctx->program->gfx_level >= GFX10 ||
1567 (ctx->program->gfx_level >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));
1568
1569 if (can_use_pack) {
1570 Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
1571 /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
1572 instr->valu().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
1573 return;
1574 }
1575
1576 /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
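/* v_alignbyte_b32 returns ({src0,src1} >> 8 * src2)[31:0]: with src0=hi, src1=lo and a
 * byte shift of 2 this is (lo >> 16) | (hi << 16), which places lo's high half in the
 * low half of def and hi's low half above it. */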
1577 if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
1578 (!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
1579 ctx->program->gfx_level >= GFX10)))) {
1580 if (hi.isConstant())
1581 bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
1582 Operand::c32(2u));
1583 else
1584 bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1585 return;
1586 }
1587
1588 Definition def_lo = Definition(def.physReg(), v2b);
1589 Definition def_hi = Definition(def.physReg().advance(2), v2b);
1590
1591 if (lo.isConstant()) {
1592 /* move hi and zero low bits */
1593 if (hi.physReg().byte() == 0)
1594 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1595 else
1596 bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
1597 if (lo.constantValue())
1598 bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
1599 Operand(def.physReg(), v1));
1600 return;
1601 }
1602 if (hi.isConstant()) {
1603 /* move lo and zero high bits */
1604 if (lo.physReg().byte() == 2)
1605 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1606 else if (ctx->program->gfx_level >= GFX11)
1607 bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
1608 else
1609 bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
1610 if (hi.constantValue())
1611 bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
1612 Operand(def.physReg(), v1));
1613 return;
1614 }
1615
1616 if (lo.physReg().reg() == def.physReg().reg()) {
1617 /* lo is in the high bits of def */
1618 assert(lo.physReg().byte() == 2);
1619 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1620 lo.setFixed(def.physReg());
1621 } else if (hi.physReg() == def.physReg()) {
1622 /* hi is in the low bits of def */
1623 assert(hi.physReg().byte() == 0);
1624 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1625 hi.setFixed(def.physReg().advance(2));
1626 } else if (ctx->program->gfx_level >= GFX8) {
1627 /* Either lo or hi can be placed with just a v_mov. SDWA is not needed, because
1628 * op.physReg().byte()==def.physReg().byte() and the other half will be overwritten.
1629 */
1630 assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
1631 Operand& op = lo.physReg().byte() == 0 ? lo : hi;
1632 PhysReg reg = def.physReg().advance(op.physReg().byte());
1633 bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
1634 op.setFixed(reg);
1635 }
1636
1637 /* either hi or lo are already placed correctly */
1638 if (ctx->program->gfx_level >= GFX11) {
1639 if (lo.physReg().reg() == def.physReg().reg())
1640 emit_v_mov_b16(bld, def_hi, hi);
1641 else
1642 emit_v_mov_b16(bld, def_lo, lo);
1643 } else {
1644 if (lo.physReg().reg() == def.physReg().reg())
1645 bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
1646 else
1647 bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
1648 }
1649 }
1650
1651 void
1652 try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,
1653 copy_operation& copy)
1654 {
1655 // TODO try more relaxed alignment for subdword copies
1656 unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);
1657 unsigned next_op_align = next_def_align;
1658 if (copy.def.regClass().type() == RegType::vgpr)
1659 next_def_align = MIN2(next_def_align, 4);
1660 if (copy.op.regClass().type() == RegType::vgpr)
1661 next_op_align = MIN2(next_op_align, 4);
1662
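/* A merged copy could grow to the next power-of-two size, so require the definition
 * (and a register operand) to be aligned to that size; VGPR copies never need more
 * than dword alignment. */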
1663 if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||
1664 (!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))
1665 return;
1666
1667 auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));
1668 if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||
1669 copy.op.isConstant() != other->second.op.isConstant())
1670 return;
1671
1672 /* don't create 64-bit copies before GFX10 */
1673 if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
1674 ctx->program->gfx_level < GFX10)
1675 return;
1676
1677 unsigned new_size = copy.bytes + other->second.bytes;
1678 if (copy.op.isConstant()) {
1679 uint64_t val =
1680 copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));
1681 if (!util_is_power_of_two_or_zero(new_size))
1682 return;
1683 if (!Operand::is_constant_representable(val, new_size, true,
1684 copy.def.regClass().type() == RegType::vgpr))
1685 return;
1686 copy.op = Operand::get_const(ctx->program->gfx_level, val, new_size);
1687 } else {
1688 if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))
1689 return;
1690 copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size));
1691 }
1692
1693 copy.bytes = new_size;
1694 copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes));
1695 copy_map.erase(other);
1696 }
1697
1698 void
1699 handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
1700 amd_gfx_level gfx_level, Pseudo_instruction* pi)
1701 {
1702 Builder bld(ctx->program, &ctx->instructions);
1703 unsigned num_instructions_before = ctx->instructions.size();
1704 aco_ptr<Instruction> mov;
1705 bool writes_scc = false;
1706
1707 /* count the number of uses for each dst reg */
1708 for (auto it = copy_map.begin(); it != copy_map.end();) {
1709
1710 if (it->second.def.physReg() == scc)
1711 writes_scc = true;
1712
1713 assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));
1714
1715 /* if src and dst reg are the same, remove operation */
1716 if (it->first == it->second.op.physReg()) {
1717 it = copy_map.erase(it);
1718 continue;
1719 }
1720
1721 /* split large copies */
1722 if (it->second.bytes > 8) {
1723 assert(!it->second.op.isConstant());
1724 assert(!it->second.def.regClass().is_subdword());
1725 RegClass rc = it->second.def.regClass().resize(it->second.def.bytes() - 8);
1726 Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
1727 rc = it->second.op.regClass().resize(it->second.op.bytes() - 8);
1728 Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
1729 copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
1730 copy_map[hi_def.physReg()] = copy;
1731 assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
1732 it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().resize(8));
1733 it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().resize(8));
1734 it->second.bytes = 8;
1735 }
1736
1737 try_coalesce_copies(ctx, copy_map, it->second);
1738
1739 /* check if the definition reg is used by another copy operation */
1740 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1741 if (copy.second.op.isConstant())
1742 continue;
1743 for (uint16_t i = 0; i < it->second.bytes; i++) {
1744 /* distance might underflow */
1745 unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
1746 if (distance < copy.second.bytes)
1747 it->second.uses[i] += 1;
1748 }
1749 }
1750
1751 ++it;
1752 }
1753
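/* The entries form a location transfer graph (each maps a destination register to the
 * copy writing it). Copies whose destination bytes aren't read by any other pending
 * copy can be emitted directly; whatever remains afterwards consists only of cycles,
 * which are broken up with swaps further below. */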
1754 /* first, handle paths in the location transfer graph */
1755 bool preserve_scc = pi->tmp_in_scc && !writes_scc;
1756 bool skip_partial_copies = true;
1757 for (auto it = copy_map.begin();;) {
1758 if (copy_map.empty()) {
1759 ctx->program->statistics[aco_statistic_copies] +=
1760 ctx->instructions.size() - num_instructions_before;
1761 return;
1762 }
1763 if (it == copy_map.end()) {
1764 if (!skip_partial_copies)
1765 break;
1766 skip_partial_copies = false;
1767 it = copy_map.begin();
1768 }
1769
1770 /* check if we can pack one register at once */
1771 if (it->first.byte() == 0 && it->second.bytes == 2) {
1772 PhysReg reg_hi = it->first.advance(2);
1773 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1774 if (other != copy_map.end() && other->second.bytes == 2) {
1775 /* check if the target register is otherwise unused */
1776 bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&
1777 other->second.op.physReg() == it->first);
1778 bool unused_hi = !other->second.is_used ||
1779 (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);
1780 if (unused_lo && unused_hi) {
1781 Operand lo = it->second.op;
1782 Operand hi = other->second.op;
1783 do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
1784 copy_map.erase(it);
1785 copy_map.erase(other);
1786
1787 for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {
1788 for (uint16_t i = 0; i < other2.second.bytes; i++) {
1789 /* distance might underflow */
1790 unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;
1791 unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;
1792 if (distance_lo < 2 || distance_hi < 2)
1793 other2.second.uses[i] -= 1;
1794 }
1795 }
1796 it = copy_map.begin();
1797 continue;
1798 }
1799 }
1800 }
1801
1802 /* optimize constant copies to an aligned sgpr pair that's otherwise unused. */
1803 if (it->first <= exec && (it->first % 2) == 0 && it->second.bytes == 4 &&
1804 it->second.op.isConstant() && !it->second.is_used) {
1805 PhysReg reg_hi = it->first.advance(4);
1806 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1807 if (other != copy_map.end() && other->second.bytes == 4 && other->second.op.isConstant() &&
1808 !other->second.is_used) {
1809 uint64_t constant =
1810 it->second.op.constantValue64() | (other->second.op.constantValue64() << 32);
1811 copy_constant_sgpr(bld, Definition(it->first, s2), constant);
1812 copy_map.erase(it);
1813 copy_map.erase(other);
1814 it = copy_map.begin();
1815 continue;
1816 }
1817 }
1818
1819 /* find portions where the target reg is not used as operand for any other copy */
1820 if (it->second.is_used) {
1821 if (it->second.op.isConstant() || skip_partial_copies) {
1822 /* we have to skip constants until is_used=0.
1823 * we also skip partial copies at the beginning to help coalescing */
1824 ++it;
1825 continue;
1826 }
1827
1828 unsigned has_zero_use_bytes = 0;
1829 for (unsigned i = 0; i < it->second.bytes; i++)
1830 has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1831
1832 if (has_zero_use_bytes) {
1833 /* For sub-dword copies it is usually better to skip the partial copy and
1834 * resolve the cycle with a v_swap_b32 plus fixup copies, but if a partial
1835 * copy would unblock further copies, do the partial copy instead. */
1836 bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1837 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1838 if (partial_copy)
1839 break;
1840 for (uint16_t i = 0; i < copy.second.bytes; i++) {
1841 /* distance might underflow */
1842 unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1843 if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1844 !it->second.uses[distance])
1845 partial_copy = true;
1846 }
1847 }
1848
1849 if (!partial_copy) {
1850 ++it;
1851 continue;
1852 }
1853 } else {
1854 /* full target reg is used: register swapping needed */
1855 ++it;
1856 continue;
1857 }
1858 }
1859
1860 bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
1861 skip_partial_copies = did_copy;
1862 std::pair<PhysReg, copy_operation> copy = *it;
1863
1864 if (it->second.is_used == 0) {
1865 /* the target reg is not used as operand for any other copy, so we
1866 * copied to all of it */
1867 copy_map.erase(it);
1868 it = copy_map.begin();
1869 } else {
1870 /* we only performed some portions of this copy, so split it to only
1871 * leave the portions that still need to be done */
1872 copy_operation original = it->second; /* the map insertion below can overwrite this */
1873 copy_map.erase(it);
1874 for (unsigned offset = 0; offset < original.bytes;) {
1875 if (original.uses[offset] == 0) {
1876 offset++;
1877 continue;
1878 }
1879 Definition def;
1880 Operand op;
1881 split_copy(ctx, offset, &def, &op, original, false, 8);
1882
1883 copy_operation new_copy = {op, def, def.bytes()};
1884 for (unsigned i = 0; i < new_copy.bytes; i++)
1885 new_copy.uses[i] = original.uses[i + offset];
1886 copy_map[def.physReg()] = new_copy;
1887
1888 offset += def.bytes();
1889 }
1890
1891 it = copy_map.begin();
1892 }
1893
1894 /* Reduce the number of uses of the operand reg by one. Do this after
1895 * splitting the copy or removing it in case the copy writes to its own
1896 * operand (for example, v[7:8] = v[8:9]) */
1897 if (did_copy && !copy.second.op.isConstant()) {
1898 for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
1899 for (uint16_t i = 0; i < other.second.bytes; i++) {
1900 /* distance might underflow */
1901 unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
1902 if (distance < copy.second.bytes && !copy.second.uses[distance])
1903 other.second.uses[i] -= 1;
1904 }
1905 }
1906 }
1907 }
1908
1909 /* all target regs are needed as an operand somewhere, which means all entries are part of a cycle */
1910 unsigned largest = 0;
1911 for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
1912 largest = MAX2(largest, op.second.bytes);
1913
1914 while (!copy_map.empty()) {
1915
1916 /* Perform larger swaps first, because larger swaps can make other
1917 * swaps unnecessary. */
1918 auto it = copy_map.begin();
1919 for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
1920 if (it2->second.bytes > it->second.bytes) {
1921 it = it2;
1922 if (it->second.bytes == largest)
1923 break;
1924 }
1925 }
1926
1927 /* should already be done */
1928 assert(!it->second.op.isConstant());
1929
1930 assert(it->second.op.isFixed());
1931 assert(it->second.def.regClass() == it->second.op.regClass());
1932
1933 if (it->first == it->second.op.physReg()) {
1934 copy_map.erase(it);
1935 continue;
1936 }
1937
1938 if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
1939 assert(!(it->second.def.physReg() == pi->scratch_sgpr));
1940
1941 /* to resolve the cycle, we have to swap the src reg with the dst reg */
1942 copy_operation swap = it->second;
1943
1944 /* if this is self-intersecting, we have to split it because
1945 * self-intersecting swaps don't make sense */
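/* e.g. an 8-byte copy like v[0:1] := v[1:2] that ended up in a cycle: only the first
 * `offset` bytes are swapped here, the rest is re-added to the map as its own copy */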
1946 PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
1947 if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
1948 unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);
1949
1950 copy_operation remaining;
1951 src.reg_b += offset;
1952 dst.reg_b += offset;
1953 remaining.bytes = swap.bytes - offset;
1954 memcpy(remaining.uses, swap.uses + offset, remaining.bytes);
1955 remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes));
1956 remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes));
1957 copy_map[dst] = remaining;
1958
1959 memset(swap.uses + offset, 0, swap.bytes - offset);
1960 swap.bytes = offset;
1961 }
1962
1963 /* GFX6-7 can only swap full registers */
1964 assert(ctx->program->gfx_level > GFX7 || (swap.bytes % 4) == 0);
1965
1966 do_swap(ctx, bld, swap, preserve_scc, pi);
1967
1968 /* remove from map */
1969 copy_map.erase(it);
1970
1971 /* change the operand reg of the target's uses and split uses if needed */
1972 uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
1973 for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {
1974 if (target->second.op.physReg() == swap.def.physReg() &&
1975 swap.bytes == target->second.bytes) {
1976 target->second.op.setFixed(swap.op.physReg());
1977 break;
1978 }
1979
1980 uint32_t imask =
1981 get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
1982 target->second.op.physReg().reg_b, target->second.bytes);
1983
1984 if (!imask)
1985 continue;
1986
1987 int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
1988
1989 /* split and update the middle (the portion that reads the swap's
1990 * definition) to read the swap's operand instead */
1991 int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
1992 int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
1993 int before_bytes = MAX2(-offset, 0);
1994 int after_bytes = MAX2(target_op_end - swap_def_end, 0);
1995 int middle_bytes = target->second.bytes - before_bytes - after_bytes;
1996
1997 if (after_bytes) {
1998 unsigned after_offset = before_bytes + middle_bytes;
1999 assert(after_offset > 0);
2000 copy_operation copy;
2001 copy.bytes = after_bytes;
2002 memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
2003 RegClass rc = target->second.op.regClass().resize(after_bytes);
2004 copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
2005 copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
2006 copy_map[copy.def.physReg()] = copy;
2007 }
2008
2009 if (middle_bytes) {
2010 copy_operation copy;
2011 copy.bytes = middle_bytes;
2012 memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
2013 RegClass rc = target->second.op.regClass().resize(middle_bytes);
2014 copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
2015 copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
2016 copy_map[copy.def.physReg()] = copy;
2017 }
2018
2019 if (before_bytes) {
2020 copy_operation copy;
2021 target->second.bytes = before_bytes;
2022 RegClass rc = target->second.op.regClass().resize(before_bytes);
2023 target->second.op = Operand(target->second.op.physReg(), rc);
2024 target->second.def = Definition(target->second.def.physReg(), rc);
2025 memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
2026 }
2027
2028 /* break early since we know each byte of the swap's definition is used
2029 * at most once */
2030 bytes_left &= ~imask;
2031 if (!bytes_left)
2032 break;
2033 }
2034 }
2035 ctx->program->statistics[aco_statistic_copies] +=
2036 ctx->instructions.size() - num_instructions_before;
2037 }
2038
2039 void
2040 handle_operands_linear_vgpr(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
2041 amd_gfx_level gfx_level, Pseudo_instruction* pi)
2042 {
2043 Builder bld(ctx->program, &ctx->instructions);
2044
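/* Linear VGPRs must also hold correct values in lanes that are currently inactive, so
 * the copies are performed twice: once with the current exec mask and once with exec
 * inverted, together covering every lane. */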
2045 for (auto& copy : copy_map) {
2046 copy.second.op =
2047 Operand(copy.second.op.physReg(), RegClass::get(RegType::vgpr, copy.second.op.bytes()));
2048 copy.second.def = Definition(copy.second.def.physReg(),
2049 RegClass::get(RegType::vgpr, copy.second.def.bytes()));
2050 }
2051
2052 std::map<PhysReg, copy_operation> second_map(copy_map);
2053 handle_operands(second_map, ctx, gfx_level, pi);
2054
2055 bool tmp_in_scc = pi->tmp_in_scc;
2056 if (tmp_in_scc) {
2057 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
2058 pi->tmp_in_scc = false;
2059 }
2060 bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm));
2061
2062 handle_operands(copy_map, ctx, gfx_level, pi);
2063
2064 bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm));
2065 if (tmp_in_scc) {
2066 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
2067 Operand::zero());
2068 pi->tmp_in_scc = true;
2069 }
2070
2071 ctx->program->statistics[aco_statistic_copies] += tmp_in_scc ? 4 : 2;
2072 }
2073
2074 void
2075 emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
2076 {
2077 if (bld.program->gfx_level >= GFX10) {
2078 if (set_round)
2079 bld.sopp(aco_opcode::s_round_mode, new_mode.round);
2080 if (set_denorm)
2081 bld.sopp(aco_opcode::s_denorm_mode, new_mode.denorm);
2082 } else if (set_round || set_denorm) {
2083 /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
2084 bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(new_mode.val), (7 << 11) | 1);
2085 }
2086 }
2087
2088 void
2089 emit_set_mode_from_block(Builder& bld, Program& program, Block* block)
2090 {
2091 float_mode initial;
2092 initial.val = program.config->float_mode;
2093
2094 bool initial_unknown =
2095 (program.info.merged_shader_compiled_separately && program.stage.sw == SWStage::GS) ||
2096 (program.info.merged_shader_compiled_separately && program.stage.sw == SWStage::TCS);
2097 bool is_start = block->index == 0;
2098 bool set_round = is_start && (initial_unknown || block->fp_mode.round != initial.round);
2099 bool set_denorm = is_start && (initial_unknown || block->fp_mode.denorm != initial.denorm);
2100 if (block->kind & block_kind_top_level) {
2101 for (unsigned pred : block->linear_preds) {
2102 if (program.blocks[pred].fp_mode.round != block->fp_mode.round)
2103 set_round = true;
2104 if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
2105 set_denorm = true;
2106 }
2107 }
2108 /* only allow changing modes at top-level blocks so this doesn't break
2109 * the "jump over empty blocks" optimization */
2110 assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
2111 emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
2112 }
2113
2114 void
2115 lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
2116 {
2117 Operand linear_vgpr = instr->operands[3];
2118
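/* With NSA (non-sequential addressing) the address components may stay in scattered
 * VGPRs, up to the device's per-instruction limit; without it, every component has to
 * be copied into the contiguous linear VGPR reserved for the address. */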
2119 unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
2120 unsigned vaddr_size = linear_vgpr.size();
2121 unsigned num_copied_vgprs = instr->operands.size() - 4;
2122 nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
2123 ? nsa_size
2124 : 0;
2125
2126 Operand vaddr[16];
2127 unsigned num_vaddr = 0;
2128
2129 if (nsa_size) {
2130 assert(num_copied_vgprs <= nsa_size);
2131 for (unsigned i = 0; i < num_copied_vgprs; i++)
2132 vaddr[num_vaddr++] = instr->operands[4 + i];
2133 for (unsigned i = num_copied_vgprs; i < std::min(vaddr_size, nsa_size); i++)
2134 vaddr[num_vaddr++] = Operand(linear_vgpr.physReg().advance(i * 4), v1);
2135 if (vaddr_size > nsa_size) {
2136 RegClass rc = RegClass::get(RegType::vgpr, (vaddr_size - nsa_size) * 4);
2137 vaddr[num_vaddr++] = Operand(PhysReg(linear_vgpr.physReg().advance(nsa_size * 4)), rc);
2138 }
2139 } else {
2140 PhysReg reg = linear_vgpr.physReg();
2141 std::map<PhysReg, copy_operation> copy_operations;
2142 for (unsigned i = 4; i < instr->operands.size(); i++) {
2143 Operand arg = instr->operands[i];
2144 Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
2145 copy_operations[def.physReg()] = {arg, def, def.bytes()};
2146 reg = reg.advance(arg.bytes());
2147 }
2148 vaddr[num_vaddr++] = linear_vgpr;
2149
2150 Pseudo_instruction pi = {};
2151 handle_operands(copy_operations, ctx, ctx->program->gfx_level, &pi);
2152 }
2153
2154 instr->mimg().strict_wqm = false;
2155
2156 if ((3 + num_vaddr) > instr->operands.size()) {
2157 Instruction* new_instr =
2158 create_instruction(instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
2159 std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
2160 new_instr->definitions.begin());
2161 new_instr->operands[0] = instr->operands[0];
2162 new_instr->operands[1] = instr->operands[1];
2163 new_instr->operands[2] = instr->operands[2];
2164 memcpy((uint8_t*)new_instr + sizeof(Instruction), (uint8_t*)instr.get() + sizeof(Instruction),
2165 sizeof(MIMG_instruction) - sizeof(Instruction));
2166 instr.reset(new_instr);
2167 } else {
2168 while (instr->operands.size() > (3 + num_vaddr))
2169 instr->operands.pop_back();
2170 }
2171 std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
2172 }
2173
2174 } /* end namespace */
2175
2176 void
2177 hw_init_scratch(Builder& bld, Definition def, Operand scratch_addr, Operand scratch_offset)
2178 {
2179 /* Since we know what the high 16 bits of scratch_hi are, we can set all of them
2180 * in the same instruction that adds the carry.
2181 */
2182 Operand hi_add = Operand::c32(0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1));
2183 Operand scratch_addr_lo(scratch_addr.physReg(), s1);
2184 Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1);
2185
2186 if (bld.program->gfx_level >= GFX10) {
2187 PhysReg scratch_lo = def.physReg();
2188 PhysReg scratch_hi = def.physReg().advance(4);
2189
2190 bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo, s1), Definition(scc, s1),
2191 scratch_addr_lo, scratch_offset);
2192 bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi, s1), Definition(scc, s1),
2193 scratch_addr_hi, hi_add, Operand(scc, s1));
2194
2195 /* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register
2196 * 20/21) */
2197 bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_lo, s1), (31 << 11) | 20);
2198 bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_hi, s1), (31 << 11) | 21);
2199 } else {
2200 bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1),
2201 scratch_addr_lo, scratch_offset);
2202 bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1),
2203 scratch_addr_hi, hi_add, Operand(scc, s1));
2204 }
2205 }
2206
2207 void
2208 lower_to_hw_instr(Program* program)
2209 {
2210 gfx9_pops_done_msg_bounds pops_done_msg_bounds;
2211 if (program->has_pops_overlapped_waves_wait && program->gfx_level < GFX11) {
2212 pops_done_msg_bounds = gfx9_pops_done_msg_bounds(program);
2213 }
2214
2215 Block* discard_exit_block = NULL;
2216 Block* discard_pops_done_and_exit_block = NULL;
2217
2218 int end_with_regs_block_index = -1;
2219
2220 bool should_dealloc_vgprs = dealloc_vgprs(program);
2221
2222 for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
2223 Block* block = &program->blocks[block_idx];
2224 lower_context ctx;
2225 ctx.program = program;
2226 ctx.block = block;
2227 ctx.instructions.reserve(block->instructions.size());
2228 Builder bld(program, &ctx.instructions);
2229
2230 emit_set_mode_from_block(bld, *program, block);
2231
2232 for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {
2233 aco_ptr<Instruction>& instr = block->instructions[instr_idx];
2234
2235 /* Send the ordered section done message from the middle of the block if needed (if the
2236 * ordered section is ended by an instruction inside this block).
2237 * Also make sure the done message is sent if it's needed in case early exit happens for
2238 * any reason.
2239 */
2240 if ((block_idx == pops_done_msg_bounds.end_block_idx() &&
2241 instr_idx == pops_done_msg_bounds.instr_after_end_idx()) ||
2242 (instr->opcode == aco_opcode::s_endpgm &&
2243 pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx))) {
2244 bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
2245 }
2246
2247 aco_ptr<Instruction> mov;
2248 if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) {
2249 Pseudo_instruction* pi = &instr->pseudo();
2250
2251 switch (instr->opcode) {
2252 case aco_opcode::p_extract_vector: {
2253 PhysReg reg = instr->operands[0].physReg();
2254 Definition& def = instr->definitions[0];
2255 reg.reg_b += instr->operands[1].constantValue() * def.bytes();
2256
2257 if (reg == def.physReg())
2258 break;
2259
2260 RegClass op_rc = def.regClass().is_subdword()
2261 ? def.regClass()
2262 : RegClass(instr->operands[0].getTemp().type(), def.size());
2263 std::map<PhysReg, copy_operation> copy_operations;
2264 copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
2265 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2266 break;
2267 }
2268 case aco_opcode::p_create_vector:
2269 case aco_opcode::p_start_linear_vgpr: {
2270 if (instr->operands.empty())
2271 break;
2272
2273 std::map<PhysReg, copy_operation> copy_operations;
2274 PhysReg reg = instr->definitions[0].physReg();
2275
2276 for (const Operand& op : instr->operands) {
2277 RegClass rc = RegClass::get(instr->definitions[0].regClass().type(), op.bytes());
2278 if (op.isConstant()) {
2279 const Definition def = Definition(reg, rc);
2280 copy_operations[reg] = {op, def, op.bytes()};
2281 reg.reg_b += op.bytes();
2282 continue;
2283 }
2284 if (op.isUndefined()) {
2285 // TODO: coalesce subdword copies if dst byte is 0
2286 reg.reg_b += op.bytes();
2287 continue;
2288 }
2289
2290 RegClass rc_def = op.regClass().is_subdword() ? op.regClass() : rc;
2291 const Definition def = Definition(reg, rc_def);
2292 copy_operations[def.physReg()] = {op, def, op.bytes()};
2293 reg.reg_b += op.bytes();
2294 }
2295 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2296 break;
2297 }
2298 case aco_opcode::p_split_vector: {
2299 std::map<PhysReg, copy_operation> copy_operations;
2300 PhysReg reg = instr->operands[0].physReg();
2301
2302 for (const Definition& def : instr->definitions) {
2303 RegClass rc_op = def.regClass().is_subdword()
2304 ? def.regClass()
2305 : instr->operands[0].getTemp().regClass().resize(def.bytes());
2306 const Operand op = Operand(reg, rc_op);
2307 copy_operations[def.physReg()] = {op, def, def.bytes()};
2308 reg.reg_b += def.bytes();
2309 }
2310 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2311 break;
2312 }
2313 case aco_opcode::p_parallelcopy: {
2314 std::map<PhysReg, copy_operation> copy_operations;
2315 bool linear_vgpr = false;
2316 bool non_linear_vgpr = false;
2317 for (unsigned j = 0; j < instr->operands.size(); j++) {
2318 assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
2319 copy_operations[instr->definitions[j].physReg()] = {
2320 instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
2321 linear_vgpr |= instr->definitions[j].regClass().is_linear_vgpr();
2322 non_linear_vgpr |= !instr->definitions[j].regClass().is_linear_vgpr();
2323 }
2324 assert(!linear_vgpr || !non_linear_vgpr);
2325 if (linear_vgpr)
2326 handle_operands_linear_vgpr(copy_operations, &ctx, program->gfx_level, pi);
2327 else
2328 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2329 break;
2330 }
2331 case aco_opcode::p_exit_early_if: {
2332 /* don't bother with an early exit near the end of the program */
2333 if ((block->instructions.size() - 1 - instr_idx) <= 4 &&
2334 block->instructions.back()->opcode == aco_opcode::s_endpgm) {
2335 unsigned null_exp_dest =
2336 program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
2337 bool ignore_early_exit = true;
2338
2339 for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {
2340 const aco_ptr<Instruction>& instr2 = block->instructions[k];
2341 if (instr2->opcode == aco_opcode::s_endpgm ||
2342 instr2->opcode == aco_opcode::p_logical_end)
2343 continue;
2344 else if (instr2->opcode == aco_opcode::exp &&
2345 instr2->exp().dest == null_exp_dest &&
2346 instr2->exp().enabled_mask == 0)
2347 continue;
2348 else if (instr2->opcode == aco_opcode::p_parallelcopy &&
2349 instr2->definitions[0].isFixed() &&
2350 instr2->definitions[0].physReg() == exec)
2351 continue;
2352
2353 ignore_early_exit = false;
2354 }
2355
2356 if (ignore_early_exit)
2357 break;
2358 }
2359
2360 const bool discard_sends_pops_done =
2361 pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx);
2362
2363 Block* discard_block =
2364 discard_sends_pops_done ? discard_pops_done_and_exit_block : discard_exit_block;
2365 if (!discard_block) {
2366 discard_block = program->create_and_insert_block();
2367 discard_block->kind = block_kind_discard_early_exit;
2368 if (discard_sends_pops_done) {
2369 discard_pops_done_and_exit_block = discard_block;
2370 } else {
2371 discard_exit_block = discard_block;
2372 }
2373 block = &program->blocks[block_idx];
2374
2375 bld.reset(discard_block);
2376 if (program->has_pops_overlapped_waves_wait &&
2377 (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
2378 /* If this discard early exit potentially exits the POPS ordered section, do
2379 * the waitcnt necessary before resuming overlapping waves as the normal
2380 * waitcnt insertion doesn't work in a discard early exit block.
2381 */
2382 if (program->gfx_level >= GFX10)
2383 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
2384 wait_imm pops_exit_wait_imm;
2385 pops_exit_wait_imm.vm = 0;
2386 if (program->has_smem_buffer_or_global_loads)
2387 pops_exit_wait_imm.lgkm = 0;
2388 bld.sopp(aco_opcode::s_waitcnt, pops_exit_wait_imm.pack(program->gfx_level));
2389 }
2390 if (discard_sends_pops_done)
2391 bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
2392 unsigned target = V_008DFC_SQ_EXP_NULL;
2393 if (program->gfx_level >= GFX11)
2394 target =
2395 program->has_color_exports ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_MRTZ;
2396 if (program->stage == fragment_fs)
2397 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
2398 target, false, true, true);
2399 if (should_dealloc_vgprs) {
2400 bld.sopp(aco_opcode::s_nop, 0);
2401 bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
2402 }
2403 bld.sopp(aco_opcode::s_endpgm);
2404
2405 bld.reset(&ctx.instructions);
2406 }
2407
2408 assert(instr->operands[0].physReg() == scc);
2409 bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);
2410
2411 discard_block->linear_preds.push_back(block->index);
2412 block->linear_succs.push_back(discard_block->index);
2413 break;
2414 }
2415 case aco_opcode::p_spill: {
2416 assert(instr->operands[0].regClass() == v1.as_linear());
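/* write each dword of the spilled SGPR (or constant) into consecutive lanes of the
 * linear VGPR, starting at the lane index given by operand 1 */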
2417 for (unsigned i = 0; i < instr->operands[2].size(); i++) {
2418 Operand src =
2419 instr->operands[2].isConstant()
2420 ? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))
2421 : Operand(PhysReg{instr->operands[2].physReg() + i}, s1);
2422 bld.writelane(bld.def(v1, instr->operands[0].physReg()), src,
2423 Operand::c32(instr->operands[1].constantValue() + i),
2424 instr->operands[0]);
2425 }
2426 break;
2427 }
2428 case aco_opcode::p_reload: {
2429 assert(instr->operands[0].regClass() == v1.as_linear());
2430 for (unsigned i = 0; i < instr->definitions[0].size(); i++)
2431 bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2432 instr->operands[0],
2433 Operand::c32(instr->operands[1].constantValue() + i));
2434 break;
2435 }
2436 case aco_opcode::p_as_uniform: {
2437 if (instr->operands[0].isConstant() ||
2438 instr->operands[0].regClass().type() == RegType::sgpr) {
2439 std::map<PhysReg, copy_operation> copy_operations;
2440 copy_operations[instr->definitions[0].physReg()] = {
2441 instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
2442 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2443 } else {
2444 assert(instr->operands[0].regClass().type() == RegType::vgpr);
2445 assert(instr->definitions[0].regClass().type() == RegType::sgpr);
2446 assert(instr->operands[0].size() == instr->definitions[0].size());
2447 for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
2448 bld.vop1(aco_opcode::v_readfirstlane_b32,
2449 bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2450 Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
2451 }
2452 }
2453 break;
2454 }
2455 case aco_opcode::p_pops_gfx9_add_exiting_wave_id: {
2456 bld.sop2(aco_opcode::s_add_i32, instr->definitions[0], instr->definitions[1],
2457 Operand(pops_exiting_wave_id, s1), instr->operands[0]);
2458 break;
2459 }
2460 case aco_opcode::p_bpermute_readlane: {
2461 emit_bpermute_readlane(bld, instr);
2462 break;
2463 }
2464 case aco_opcode::p_bpermute_shared_vgpr: {
2465 emit_bpermute_shared_vgpr(bld, instr);
2466 break;
2467 }
2468 case aco_opcode::p_bpermute_permlane: {
2469 emit_bpermute_permlane(bld, instr);
2470 break;
2471 }
2472 case aco_opcode::p_constaddr: {
2473 unsigned id = instr->definitions[0].tempId();
2474 PhysReg reg = instr->definitions[0].physReg();
2475 bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
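/* On GFX12, additionally sign-extend the high dword of the PC from bit 47
 * (s_sext_i32_i16 on reg+4), presumably so the address is in canonical 64-bit form. */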
2476 if (ctx.program->gfx_level >= GFX12)
2477 bld.sop1(aco_opcode::s_sext_i32_i16, Definition(reg.advance(4), s1), Operand(reg.advance(4), s1));
2478 bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2479 Operand(reg, s1), instr->operands[0], Operand::c32(id));
2480 /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2481 break;
2482 }
2483 case aco_opcode::p_resume_shader_address: {
2484 /* Find index of resume block. */
2485 unsigned resume_idx = instr->operands[0].constantValue();
2486 unsigned resume_block_idx = 0;
2487 for (Block& resume_block : program->blocks) {
2488 if (resume_block.kind & block_kind_resume) {
2489 if (resume_idx == 0) {
2490 resume_block_idx = resume_block.index;
2491 break;
2492 }
2493 resume_idx--;
2494 }
2495 }
2496 assert(resume_block_idx != 0);
2497 unsigned id = instr->definitions[0].tempId();
2498 PhysReg reg = instr->definitions[0].physReg();
2499 bld.sop1(aco_opcode::p_resumeaddr_getpc, instr->definitions[0], Operand::c32(id));
2500 if (ctx.program->gfx_level >= GFX12)
2501 bld.sop1(aco_opcode::s_sext_i32_i16, Definition(reg.advance(4), s1), Operand(reg.advance(4), s1));
2502 bld.sop2(aco_opcode::p_resumeaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2503 Operand(reg, s1), Operand::c32(resume_block_idx), Operand::c32(id));
2504 /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2505 break;
2506 }
2507 case aco_opcode::p_extract: {
2508 assert(instr->operands[1].isConstant());
2509 assert(instr->operands[2].isConstant());
2510 assert(instr->operands[3].isConstant());
2511 if (instr->definitions[0].regClass() == s1)
2512 assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2513 Definition dst = instr->definitions[0];
2514 Operand op = instr->operands[0];
2515 unsigned bits = instr->operands[2].constantValue();
2516 unsigned index = instr->operands[1].constantValue();
2517 unsigned offset = index * bits;
2518 bool signext = !instr->operands[3].constantEquals(0);
2519
2520 if (dst.regClass() == s1) {
2521 if (offset == 0 && signext && (bits == 8 || bits == 16)) {
2522 bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
2523 dst, op);
2524 } else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
2525 bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
2526 } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16 &&
2527 !signext) {
2528 bld.sop2(aco_opcode::s_pack_hh_b32_b16, dst, op, Operand::zero());
2529 } else if (offset == (32 - bits)) {
2530 bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
2531 bld.def(s1, scc), op, Operand::c32(offset));
2532 } else {
2533 bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
2534 bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
2535 }
2536 } else if (dst.regClass() == v1 && op.physReg().byte() == 0) {
2537 assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
2538 if (offset == (32 - bits) && op.regClass() != s1) {
2539 bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
2540 Operand::c32(offset), op);
2541 } else if (offset == 0 && bits == 16 && ctx.program->gfx_level >= GFX11) {
2542 bld.vop1(signext ? aco_opcode::v_cvt_i32_i16 : aco_opcode::v_cvt_u32_u16, dst,
2543 op);
2544 } else {
2545 bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,
2546 Operand::c32(offset), Operand::c32(bits));
2547 }
2548 } else {
2549 assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b ||
2550 op.regClass() == v1b);
2551 if (ctx.program->gfx_level >= GFX11) {
2552 unsigned op_vgpr_byte = op.physReg().byte() + offset / 8;
2553 unsigned sign_byte = op_vgpr_byte + bits / 8 - 1;
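/* sign_byte is the source byte that holds the sign bit of the extracted field */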
2554
2555 uint8_t swiz[4] = {4, 5, 6, 7};
2556 swiz[dst.physReg().byte()] = op_vgpr_byte;
2557 if (bits == 16)
2558 swiz[dst.physReg().byte() + 1] = op_vgpr_byte + 1;
2559 for (unsigned i = bits / 8; i < dst.bytes(); i++) {
2560 uint8_t ext = bperm_0;
2561 if (signext) {
2562 if (sign_byte == 1)
2563 ext = bperm_b1_sign;
2564 else if (sign_byte == 3)
2565 ext = bperm_b3_sign;
2566 else /* replicate so sign-extension can be done later */
2567 ext = sign_byte;
2568 }
2569 swiz[dst.physReg().byte() + i] = ext;
2570 }
2571 create_bperm(bld, swiz, dst, op);
2572
2573 if (signext && sign_byte != 3 && sign_byte != 1) {
2574 assert(bits == 8);
2575 assert(dst.regClass() == v2b || dst.regClass() == v1);
2576 uint8_t ext_swiz[4] = {4, 5, 6, 7};
2577 uint8_t ext = dst.physReg().byte() == 2 ? bperm_b7_sign : bperm_b5_sign;
2578 memset(ext_swiz + dst.physReg().byte() + 1, ext, dst.bytes() - 1);
2579 create_bperm(bld, ext_swiz, dst, Operand::zero());
2580 }
2581 } else {
2582 SDWA_instruction& sdwa = bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa();
2583 sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext);
2584 }
2585 }
2586 break;
2587 }
2588 case aco_opcode::p_insert: {
2589 assert(instr->operands[1].isConstant());
2590 assert(instr->operands[2].isConstant());
2591 if (instr->definitions[0].regClass() == s1)
2592 assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2593 Definition dst = instr->definitions[0];
2594 Operand op = instr->operands[0];
2595 unsigned bits = instr->operands[2].constantValue();
2596 unsigned index = instr->operands[1].constantValue();
2597 unsigned offset = index * bits;
2598
2599 bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11;
2600 if (dst.regClass() == s1) {
2601 if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
2602 bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
2603 } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16) {
2604 bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, Operand::zero(), op);
2605 } else if (offset == (32 - bits)) {
2606 bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,
2607 Operand::c32(offset));
2608 } else if (offset == 0) {
2609 bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2610 Operand::c32(bits << 16));
2611 } else {
2612 bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2613 Operand::c32(bits << 16));
2614 bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc),
2615 Operand(dst.physReg(), s1), Operand::c32(offset));
2616 }
2617 } else if (dst.regClass() == v1 || !has_sdwa) {
2618 if (offset == (dst.bytes() * 8u - bits) && dst.regClass() == v1) {
2619 bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
2620 } else if (offset == 0 && dst.regClass() == v1) {
2621 bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2622 } else if (has_sdwa && (op.regClass() != s1 || program->gfx_level >= GFX9)) {
2623 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().dst_sel =
2624 SubdwordSel(bits / 8, offset / 8, false);
2625 } else if (program->gfx_level >= GFX11) {
2626 uint8_t swiz[] = {4, 5, 6, 7};
2627 for (unsigned i = 0; i < dst.bytes(); i++)
2628 swiz[dst.physReg().byte() + i] = bperm_0;
2629 for (unsigned i = 0; i < bits / 8; i++)
2630 swiz[dst.physReg().byte() + i + offset / 8] = op.physReg().byte() + i;
2631 create_bperm(bld, swiz, dst, op);
2632 } else {
2633 bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2634 bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),
2635 Operand(dst.physReg(), v1));
2636 }
2637 } else {
2638 assert(dst.regClass() == v2b);
2639 if (!offset) {
2640 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().sel[0] =
2641 SubdwordSel::ubyte;
2642 } else if (program->gfx_level >= GFX9) {
2643 bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op)
2644 ->sdwa()
2645 .sel[1] = SubdwordSel::ubyte;
2646 } else {
2647 assert(offset == 8);
2648 Definition dst_hi = Definition(dst.physReg().advance(1), v1b);
2649 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst_hi, op)->sdwa().sel[0] =
2650 SubdwordSel::ubyte;
2651 uint32_t c = ~(BITFIELD_MASK(offset) << (dst.physReg().byte() * 8));
2652 bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(c),
2653 Operand(PhysReg(op.physReg().reg()), v1));
2654 }
2655 }
2656 break;
2657 }
2658 case aco_opcode::p_init_scratch: {
2659 assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3);
2660 if (!program->config->scratch_bytes_per_wave)
2661 break;
2662
2663 Operand scratch_addr = instr->operands[0];
2664 if (scratch_addr.isUndefined()) {
2665 PhysReg reg = instr->definitions[0].physReg();
2666 bld.sop1(aco_opcode::p_load_symbol, Definition(reg, s1),
2667 Operand::c32(aco_symbol_scratch_addr_lo));
2668 bld.sop1(aco_opcode::p_load_symbol, Definition(reg.advance(4), s1),
2669 Operand::c32(aco_symbol_scratch_addr_hi));
2670 scratch_addr.setFixed(reg);
2671 } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
2672 bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr,
2673 Operand::zero());
2674 scratch_addr.setFixed(instr->definitions[0].physReg());
2675 }
2676
2677 hw_init_scratch(bld, instr->definitions[0], scratch_addr, instr->operands[1]);
2678 break;
2679 }
            case aco_opcode::p_jump_to_epilog: {
               if (pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx)) {
                  bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
               }
               bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
               break;
            }
            case aco_opcode::p_interp_gfx11: {
               assert(instr->definitions[0].regClass() == v1 ||
                      instr->definitions[0].regClass() == v2b);
               assert(instr->operands[0].regClass() == v1.as_linear());
               assert(instr->operands[1].isConstant());
               assert(instr->operands[2].isConstant());
               assert(instr->operands.back().physReg() == m0);
               Definition dst = instr->definitions[0];
               PhysReg lin_vgpr = instr->operands[0].physReg();
               unsigned attribute = instr->operands[1].constantValue();
               unsigned component = instr->operands[2].constantValue();
               uint16_t dpp_ctrl = 0;
               bool high_16bits = false;
               Operand coord1, coord2;
               if (instr->operands.size() == 7) {
                  assert(instr->operands[3].isConstant());
                  high_16bits = instr->operands[3].constantValue();
                  assert(instr->operands[4].regClass() == v1);
                  assert(instr->operands[5].regClass() == v1);
                  coord1 = instr->operands[4];
                  coord2 = instr->operands[5];
               } else {
                  assert(instr->operands[3].isConstant());
                  dpp_ctrl = instr->operands[3].constantValue();
               }

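               /* lds_param_load fetches the attribute/component parameter data into the linear
                * VGPR. With five operands the value is simply moved into place using the given
                * DPP control; otherwise it is interpolated in two steps (P10, then P2) using the
                * two coordinate operands, optionally writing the high half of a 16-bit
                * destination.
                */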
               bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
                          attribute, component);

               Operand p(lin_vgpr, v1);
               Operand dst_op(dst.physReg(), v1);
               if (instr->operands.size() == 5) {
                  bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
               } else if (dst.regClass() == v2b) {
                  bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
                                    coord1, p, high_16bits ? 0x5 : 0);
                  bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
                                    coord2, dst_op, high_16bits ? 0x1 : 0);
               } else {
                  bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p,
                                    coord1, p);
                  bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
                                    dst_op);
               }
               break;
            }
            case aco_opcode::p_dual_src_export_gfx11: {
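               /* GFX11 dual-source blending: build two exports (MRT targets 21 and 22) whose
                * channels interleave src0 and src1 across even/odd lanes (see the table below).
                * exec is switched to WQM around the swizzle because DPP does not return valid
                * data when fetching from lanes disabled in exec; the original exec mask is
                * restored before the exports.
                */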
               PhysReg dst0 = instr->definitions[0].physReg();
               PhysReg dst1 = instr->definitions[1].physReg();
               Definition exec_tmp = instr->definitions[2];
               Definition not_vcc_tmp = instr->definitions[3];
               Definition clobber_vcc = instr->definitions[4];
               Definition clobber_scc = instr->definitions[5];

               assert(exec_tmp.regClass() == bld.lm);
               assert(not_vcc_tmp.regClass() == bld.lm);
               assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
               assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);

               bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm),
                        Operand(exec, bld.lm));
               bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
                        Operand(exec, bld.lm));

               uint8_t enabled_channels = 0;
               Operand mrt0[4], mrt1[4];

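               /* 0x5555...5555 has the bit for every even-numbered lane set; the s_not below
                * derives the odd-lane mask from it.
                */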
               copy_constant_sgpr(bld, clobber_vcc, 0x5555'5555'5555'5555ull);

               Operand src_even = Operand(clobber_vcc.physReg(), bld.lm);

               bld.sop1(Builder::s_not, not_vcc_tmp, clobber_scc, src_even);

               Operand src_odd = Operand(not_vcc_tmp.physReg(), bld.lm);

               for (unsigned i = 0; i < 4; i++) {
                  if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
                     mrt0[i] = instr->operands[i];
                     mrt1[i] = instr->operands[i + 4];
                     continue;
                  }

                  Operand src0 = instr->operands[i];
                  Operand src1 = instr->operands[i + 4];

                  /*      | even lanes | odd lanes
                   * mrt0 | src0 even  | src1 even
                   * mrt1 | src0 odd   | src1 odd
                   */
                  bld.vop2_dpp(aco_opcode::v_cndmask_b32, Definition(dst0, v1), src1, src0,
                               src_even, dpp_row_xmask(1));
                  bld.vop2_e64_dpp(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src0, src1,
                                   src_odd, dpp_row_xmask(1));

                  mrt0[i] = Operand(dst0, v1);
                  mrt1[i] = Operand(dst1, v1);

                  enabled_channels |= 1 << i;

                  dst0 = dst0.advance(4);
                  dst1 = dst1.advance(4);
               }

               bld.sop1(Builder::s_mov, Definition(exec, bld.lm),
                        Operand(exec_tmp.physReg(), bld.lm));

               /* Force export all channels when everything is undefined. */
               if (!enabled_channels)
                  enabled_channels = 0xf;

               bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels,
                       V_008DFC_SQ_EXP_MRT + 21, false);
               bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels,
                       V_008DFC_SQ_EXP_MRT + 22, false);
               break;
            }
            case aco_opcode::p_end_with_regs: {
               end_with_regs_block_index = block->index;
               break;
            }
            case aco_opcode::p_shader_cycles_hi_lo_hi: {
               unsigned shader_cycles_lo = 29;
               unsigned shader_cycles_hi = 30;
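               /* s_getreg_b32 immediate layout: hwreg id in bits [5:0], offset in [10:6] and
                * (size - 1) in [15:11]; (32 - 1) << 11 with a zero offset reads the whole
                * register. Reading hi, lo, hi lets the consumer detect a rollover of the low
                * half between the reads (the two hi values will differ).
                */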
               bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[0],
                        ((32 - 1) << 11) | shader_cycles_hi);
               bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[1],
                        ((32 - 1) << 11) | shader_cycles_lo);
               bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[2],
                        ((32 - 1) << 11) | shader_cycles_hi);
               break;
            }
            default: break;
            }
         } else if (instr->isBranch()) {
            Pseudo_branch_instruction* branch = &instr->branch();
            const uint32_t target = branch->target[0];
            const bool uniform_branch = !((branch->opcode == aco_opcode::p_cbranch_z ||
                                           branch->opcode == aco_opcode::p_cbranch_nz) &&
                                          branch->operands[0].physReg() == exec);

            if (branch->never_taken) {
               assert(!uniform_branch);
               continue;
            }

            /* Check if the branch instruction can be removed.
             * This is beneficial when executing the next block with an empty exec mask
             * is faster than the branch instruction itself.
             *
             * Override this judgement when:
             * - The application prefers to remove control flow
             * - The compiler stack knows that it's a divergent branch always taken
             */
            const bool prefer_remove = branch->rarely_taken;
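            /* Only forward branches (to a later block) are candidates for removal. */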
            bool can_remove = block->index < target;
            unsigned num_scalar = 0;
            unsigned num_vector = 0;

            /* Check the instructions between branch and target */
            for (unsigned i = block->index + 1; i < branch->target[0]; i++) {
               /* Uniform conditional branches must not be ignored if they
                * are about to jump over actual instructions */
               if (uniform_branch && !program->blocks[i].instructions.empty())
                  can_remove = false;

               if (!can_remove)
                  break;

               for (aco_ptr<Instruction>& inst : program->blocks[i].instructions) {
                  if (inst->isSOPP()) {
                     if (instr_info.classes[(int)inst->opcode] == instr_class::branch) {
                        /* Discard early-exits, loop breaks and loop continues should work fine
                         * with an empty exec mask.
                         */
                        bool is_break_continue =
                           program->blocks[i].kind & (block_kind_break | block_kind_continue);
                        bool discard_early_exit =
                           program->blocks[inst->salu().imm].kind & block_kind_discard_early_exit;
                        if ((inst->opcode != aco_opcode::s_cbranch_scc0 &&
                             inst->opcode != aco_opcode::s_cbranch_scc1) ||
                            (!discard_early_exit && !is_break_continue))
                           can_remove = false;
                     } else {
                        can_remove = false;
                     }
                  } else if (inst->isSALU()) {
                     num_scalar++;
                  } else if (inst->isVALU() || inst->isVINTRP()) {
                     if (inst->opcode == aco_opcode::v_writelane_b32 ||
                         inst->opcode == aco_opcode::v_writelane_b32_e64) {
                        /* writelane ignores exec, so writing inactive lanes results in UB. */
                        can_remove = false;
                     }
                     num_vector++;
                     /* VALU instructions which write SGPRs are always executed on GFX10+ */
                     if (ctx.program->gfx_level >= GFX10) {
                        for (Definition& def : inst->definitions) {
                           if (def.regClass().type() == RegType::sgpr)
                              num_scalar++;
                        }
                     }
                  } else if (inst->isEXP() || inst->isSMEM() || inst->isBarrier()) {
                     /* Export instructions with exec=0 can hang some GFX10+ chips (unclear on
                      * older GPUs), SMEM might be an invalid access, and barriers are probably
                      * expensive. */
                     can_remove = false;
                  } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
                             inst->isLDSDIR()) {
                     // TODO: GFX6-9 can use vskip
                     can_remove = prefer_remove;
                  } else {
                     can_remove = false;
                     assert(false && "Pseudo instructions should be lowered by this point.");
                  }
                  if (!prefer_remove) {
                     /* Estimate the cost of executing the skipped instructions with an empty
                      * exec mask and keep the branch if it is too high. The estimate is skipped
                      * when the shader prefers flattened control flow.
                      */
                     unsigned est_cycles;
                     if (ctx.program->gfx_level >= GFX10)
                        est_cycles = num_scalar * 2 + num_vector;
                     else
                        est_cycles = num_scalar * 4 + num_vector * 4;

                     if (est_cycles > 16)
                        can_remove = false;
                  }

                  if (!can_remove)
                     break;
               }
            }

            if (can_remove)
               continue;

            /* emit branch instruction */
            switch (instr->opcode) {
            case aco_opcode::p_branch:
               assert(block->linear_succs[0] == target);
               bld.sopp(aco_opcode::s_branch, branch->definitions[0], target);
               break;
            case aco_opcode::p_cbranch_nz:
               assert(block->linear_succs[1] == target);
               if (branch->operands[0].physReg() == exec)
                  bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target);
               else if (branch->operands[0].physReg() == vcc)
                  bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target);
               else {
                  assert(branch->operands[0].physReg() == scc);
                  bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target);
               }
               break;
            case aco_opcode::p_cbranch_z:
               assert(block->linear_succs[1] == target);
               if (branch->operands[0].physReg() == exec)
                  bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target);
               else if (branch->operands[0].physReg() == vcc)
                  bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target);
               else {
                  assert(branch->operands[0].physReg() == scc);
                  bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target);
               }
               break;
            default: unreachable("Unknown Pseudo branch instruction!");
            }

         } else if (instr->isReduction()) {
            Pseudo_reduction_instruction& reduce = instr->reduction();
            emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
                           reduce.operands[1].physReg(),    // tmp
                           reduce.definitions[1].physReg(), // stmp
                           reduce.operands[2].physReg(),    // vtmp
                           reduce.definitions[2].physReg(), // sitmp
                           reduce.operands[0], reduce.definitions[0]);
         } else if (instr->isBarrier()) {
            Pseudo_barrier_instruction& barrier = instr->barrier();

            /* Anything larger than a workgroup isn't possible. Anything
             * smaller requires no instructions and this pseudo instruction
             * would only be included to control optimizations. */
            bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
                                  program->workgroup_size > program->wave_size;

            bld.insert(std::move(instr));
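            /* GFX12 splits the barrier into separate signal and wait instructions. */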
            if (emit_s_barrier && ctx.program->gfx_level >= GFX12) {
               bld.sop1(aco_opcode::s_barrier_signal, Operand::c32(-1));
               bld.sopp(aco_opcode::s_barrier_wait, UINT16_MAX);
            } else if (emit_s_barrier) {
               bld.sopp(aco_opcode::s_barrier);
            }
         } else if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtne ||
                    instr->opcode == aco_opcode::p_s_cvt_f16_f32_rtne) {
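            /* Temporarily switch the 16/64-bit float rounding mode to nearest-even for this
             * conversion, then restore the block's rounding mode afterwards.
             */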
            float_mode new_mode = block->fp_mode;
            new_mode.round16_64 = fp_round_ne;
            bool set_round = new_mode.round != block->fp_mode.round;

            emit_set_mode(bld, new_mode, set_round, false);

            if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtne)
               instr->opcode = aco_opcode::v_cvt_f16_f32;
            else
               instr->opcode = aco_opcode::s_cvt_f16_f32;
            ctx.instructions.emplace_back(std::move(instr));

            emit_set_mode(bld, block->fp_mode, set_round, false);
         } else if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32) {
            Definition def = instr->definitions[0];
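            /* v_cvt_pk_u8_f32 converts src0 to an unsigned byte and packs it into the byte of
             * src2 selected by src1, so pass the destination's byte offset and its current
             * contents to leave the other bytes untouched.
             */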
            VALU_instruction& valu =
               bld.vop3(aco_opcode::v_cvt_pk_u8_f32, def, instr->operands[0],
                        Operand::c32(def.physReg().byte()), Operand(def.physReg(), v1))
                  ->valu();
            valu.abs = instr->valu().abs;
            valu.neg = instr->valu().neg;
         } else if (instr->isMIMG() && instr->mimg().strict_wqm) {
            lower_image_sample(&ctx, instr);
            ctx.instructions.emplace_back(std::move(instr));
         } else {
            ctx.instructions.emplace_back(std::move(instr));
         }
      }

      /* Send the ordered section done message from this block if it's needed in this block, but
       * instr_after_end_idx() points beyond the end of its instructions. This may commonly happen
       * if the common post-dominator of multiple end locations turns out to be an empty block.
       */
      if (block_idx == pops_done_msg_bounds.end_block_idx() &&
          pops_done_msg_bounds.instr_after_end_idx() >= block->instructions.size()) {
         bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
      }

      block->instructions = std::move(ctx.instructions);
   }

   /* If the block containing p_end_with_regs is not the last block (p_exit_early_if may have
    * appended an exit block at the end), create an exit block for it to branch to.
    */
   int last_block_index = program->blocks.size() - 1;
   if (end_with_regs_block_index >= 0 && end_with_regs_block_index != last_block_index) {
      Block* exit_block = program->create_and_insert_block();
      Block* end_with_regs_block = &program->blocks[end_with_regs_block_index];
      exit_block->linear_preds.push_back(end_with_regs_block->index);
      end_with_regs_block->linear_succs.push_back(exit_block->index);

      Builder bld(program, end_with_regs_block);
      bld.sopp(aco_opcode::s_branch, exit_block->index);

      /* Move the end_with_regs kind to the exit block so that the insert-waitcnt pass adds its
       * waitcnt there; otherwise the waitcnt would be placed after the s_branch and never be
       * executed.
       */
      end_with_regs_block->kind &= ~block_kind_end_with_regs;
      exit_block->kind |= block_kind_end_with_regs;
   }

   program->progress = CompilationProgress::after_lower_to_hw;
}

} // namespace aco