1 /*
2 * Copyright © 2020 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6 #include "common/amdgfxregs.h"
7
8 #include "helpers.h"
9
10 using namespace aco;
11
12 void
create_mubuf(unsigned offset,PhysReg dst=PhysReg (256),PhysReg vaddr=PhysReg (256))13 create_mubuf(unsigned offset, PhysReg dst = PhysReg(256), PhysReg vaddr = PhysReg(256))
14 {
15 bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst, v1), Operand(PhysReg(0), s4),
16 Operand(vaddr, v1), Operand::zero(), offset, true);
17 }
18
19 void
create_mubuf_store(PhysReg src=PhysReg (256))20 create_mubuf_store(PhysReg src = PhysReg(256))
21 {
22 bld.mubuf(aco_opcode::buffer_store_dword, Operand(PhysReg(0), s4), Operand(src, v1),
23 Operand::zero(), Operand(src, v1), 0, true);
24 }
25
26 void
create_mimg(bool nsa,unsigned addrs,unsigned instr_dwords)27 create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords)
28 {
29 aco_ptr<Instruction> mimg{
30 create_instruction(aco_opcode::image_sample, Format::MIMG, 3 + addrs, 1)};
31 mimg->definitions[0] = Definition(PhysReg(256), v1);
32 mimg->operands[0] = Operand(PhysReg(0), s8);
33 mimg->operands[1] = Operand(PhysReg(0), s4);
34 mimg->operands[2] = Operand(v1);
35 for (unsigned i = 0; i < addrs; i++)
36 mimg->operands[3 + i] = Operand(PhysReg(256 + (nsa ? i * 2 : i)), v1);
37 mimg->mimg().dmask = 0x1;
38 mimg->mimg().dim = ac_image_2d;
39
40 assert(get_mimg_nsa_dwords(mimg.get()) + 2 == instr_dwords);
41
42 bld.insert(std::move(mimg));
43 }
44
45 void
create_bvh()46 create_bvh()
47 {
48 aco_ptr<Instruction> instr{
49 create_instruction(aco_opcode::image_bvh64_intersect_ray, Format::MIMG, 8, 1)};
50 instr->definitions[0] = Definition(PhysReg(256), v4);
51 instr->operands[0] = Operand(PhysReg(0), s4);
52 instr->operands[1] = Operand(s4);
53 instr->operands[2] = Operand(v1);
54 instr->operands[3] = Operand(PhysReg(256 + 0), v2); /* node */
55 instr->operands[4] = Operand(PhysReg(256 + 2), v1); /* tmax */
56 instr->operands[5] = Operand(PhysReg(256 + 3), v3); /* origin */
57 instr->operands[6] = Operand(PhysReg(256 + 6), v3); /* dir */
58 instr->operands[7] = Operand(PhysReg(256 + 9), v3); /* inv dir */
59 instr->mimg().dmask = 0xf;
60 instr->mimg().unrm = true;
61 instr->mimg().r128 = true;
62 bld.insert(std::move(instr));
63 }
64
/* Test insert_nops.nsa_to_vmem_bug (GFX10 only; returns early when setup_cs() fails).
 *
 * Every case emits a p_unit_test marker, an image_sample built by create_mimg()
 * and a buffer_load_dword built by create_mubuf(), then finish_insert_nops_test()
 * is called once at the end. The cases vary the load offset, whether the MIMG
 * uses NSA addressing, whether another instruction sits in between, the size of
 * the NSA instruction, and whether the two instructions are in different blocks.
 *
 * NOTE(review): the comment lines that begin with two slashes and a marker
 * ('!' or '>>') look like expected-output patterns for the test harness and
 * presumably must stay in sync with the code below them - confirm against the
 * test framework before editing either.
 */
65 BEGIN_TEST(insert_nops.nsa_to_vmem_bug)
66 if (!setup_cs(NULL, GFX10))
67 return;
68
69 /* no nop needed because offset&6==0 */
70 //>> p_unit_test 0
71 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
72 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:8 offen
73 bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
74 create_mimg(true, 6, 4);
75 create_mubuf(8);
76
77 /* nop needed */
78 //! p_unit_test 1
79 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
80 //! s_nop
81 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
82 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
83 create_mimg(true, 6, 4);
84 create_mubuf(4);
85
86 /* no nop needed because the MIMG is not NSA */
87 //! p_unit_test 2
88 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[1], %0:v[2], %0:v[3], %0:v[4], %0:v[5] 2d
89 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
90 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
91 create_mimg(false, 6, 2);
92 create_mubuf(4);
93
94 /* no nop needed because there's already an instruction in-between */
95 //! p_unit_test 3
96 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
97 //! v_nop
98 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
99 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
100 create_mimg(true, 6, 4);
101 bld.vop1(aco_opcode::v_nop);
102 create_mubuf(4);
103
104 /* no nop needed because the NSA instruction is under 4 dwords */
105 //! p_unit_test 4
106 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
107 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
108 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
109 create_mimg(true, 2, 3);
110 create_mubuf(4);
111
112 /* NSA instruction and MUBUF/MTBUF in a different block */
113 //! p_unit_test 5
114 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
115 //! BB1
116 //! /* logical preds: / linear preds: BB0, / kind: uniform, */
117 //! s_nop
118 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
119 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
120 create_mimg(true, 6, 4);
121 bld.reset(program->create_and_insert_block());
122 create_mubuf(4);
123 program->blocks[0].linear_succs.push_back(1);
124 program->blocks[1].linear_preds.push_back(0);
125
126 finish_insert_nops_test();
127 END_TEST
128
/* Test insert_nops.writelane_to_nsa_bug (GFX10 only; returns early when setup_cs() fails).
 *
 * Every case emits a p_unit_test marker, a writelane whose definition and last
 * operand are PhysReg(511), and an image_sample built by create_mimg(). The
 * cases vary whether the MIMG uses NSA addressing, whether a v_nop sits in
 * between, and whether the two instructions are in different blocks.
 * finish_insert_nops_test() is called once at the end.
 *
 * NOTE(review): the comment lines that begin with two slashes and a marker
 * ('!' or '>>') look like expected-output patterns for the test harness and
 * presumably must stay in sync with the code below them - confirm against the
 * test framework before editing either.
 */
129 BEGIN_TEST(insert_nops.writelane_to_nsa_bug)
130 if (!setup_cs(NULL, GFX10))
131 return;
132
133 /* nop needed */
134 //>> p_unit_test 0
135 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
136 //! s_nop
137 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
138 bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
139 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
140 Operand(PhysReg(511), v1));
141 create_mimg(true, 2, 3);
142
143 /* no nop needed because the MIMG is not NSA */
144 //! p_unit_test 1
145 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
146 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[1] 2d
147 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
148 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
149 Operand(PhysReg(511), v1));
150 create_mimg(false, 2, 2);
151
152 /* no nop needed because there's already an instruction in-between */
153 //! p_unit_test 2
154 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
155 //! v_nop
156 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
157 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
158 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
159 Operand(PhysReg(511), v1));
160 bld.vop1(aco_opcode::v_nop);
161 create_mimg(true, 2, 3);
162
163 /* writelane and NSA instruction in different blocks */
164 //! p_unit_test 3
165 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
166 //! BB1
167 //! /* logical preds: / linear preds: BB0, / kind: uniform, */
168 //! s_nop
169 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
170 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
171 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
172 Operand(PhysReg(511), v1));
173 bld.reset(program->create_and_insert_block());
174 create_mimg(true, 2, 3);
175 program->blocks[0].linear_succs.push_back(1);
176 program->blocks[1].linear_preds.push_back(0);
177
178 finish_insert_nops_test();
179 END_TEST
180
/* Test insert_nops.vmem_to_scalar_write (GFX10 only; returns early when setup_cs() fails).
 *
 * Every case emits a p_unit_test marker, a memory instruction (a buffer load
 * from create_mubuf(), a buffer store from create_mubuf_store(), or a
 * ds_read_b32) and then a scalar move that writes an SGPR, m0 or exec.
 * Cases 0-7 cover the write-after-read and no-hazard variants, cases 8-10 put
 * the matching wait instruction in between, and cases 11-13 put a
 * non-matching wait instruction in between.
 * finish_insert_nops_test() is called once at the end.
 *
 * NOTE(review): the comment lines that begin with two slashes and a marker
 * ('!' or '>>') look like expected-output patterns for the test harness and
 * presumably must stay in sync with the code below them - confirm against the
 * test framework before editing either.
 */
181 BEGIN_TEST(insert_nops.vmem_to_scalar_write)
182 if (!setup_cs(NULL, GFX10))
183 return;
184
185 /* WaR: VMEM load */
186 //>> p_unit_test 0
187 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
188 //! s_waitcnt_depctr vm_vsrc(0)
189 //! s1: %0:s[0] = s_mov_b32 0
190 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
191 create_mubuf(0);
192 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
193
194 //! p_unit_test 1
195 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
196 //! s_waitcnt_depctr vm_vsrc(0)
197 //! s2: %0:exec = s_mov_b64 -1
198 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
199 create_mubuf(0);
200 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
201
202 /* no hazard: VMEM load */
203 //! p_unit_test 2
204 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
205 //! s1: %0:s[4] = s_mov_b32 0
206 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
207 create_mubuf(0);
208 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero());
209
210 /* no hazard: VMEM load with VALU in-between */
211 //! p_unit_test 3
212 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
213 //! v_nop
214 //! s1: %0:s[0] = s_mov_b32 0
215 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
216 create_mubuf(0);
217 bld.vop1(aco_opcode::v_nop);
218 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
219
220 /* WaR: LDS */
221 //! p_unit_test 4
222 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
223 //! s_waitcnt_depctr vm_vsrc(0)
224 //! s1: %0:m0 = s_mov_b32 0
225 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
226 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
227 Operand(m0, s1));
228 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
229
230 //! p_unit_test 5
231 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
232 //! s_waitcnt_depctr vm_vsrc(0)
233 //! s2: %0:exec = s_mov_b64 -1
234 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
235 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
236 Operand(m0, s1));
237 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
238
239 /* no hazard: LDS */
240 //! p_unit_test 6
241 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
242 //! s1: %0:s[0] = s_mov_b32 0
243 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
244 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
245 Operand(m0, s1));
246 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
247
248 /* no hazard: LDS with VALU in-between */
249 //! p_unit_test 7
250 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
251 //! v_nop
252 //! s1: %0:m0 = s_mov_b32 0
253 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
254 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
255 Operand(m0, s1));
256 bld.vop1(aco_opcode::v_nop);
257 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
258
259 /* no hazard: VMEM/LDS with the correct waitcnt in-between */
260 //! p_unit_test 8
261 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
262 //! s_waitcnt vmcnt(0)
263 //! s1: %0:s[0] = s_mov_b32 0
264 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
265 create_mubuf(0);
266 bld.sopp(aco_opcode::s_waitcnt, 0x3f70);
267 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
268
269 //! p_unit_test 9
270 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
271 //! s_waitcnt_vscnt %0:null imm:0
272 //! s1: %0:s[0] = s_mov_b32 0
273 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
274 create_mubuf_store();
275 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
276 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
277
278 //! p_unit_test 10
279 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
280 //! s_waitcnt lgkmcnt(0)
281 //! s1: %0:m0 = s_mov_b32 0
282 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
283 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
284 Operand(m0, s1));
285 bld.sopp(aco_opcode::s_waitcnt, 0xc07f);
286 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
287
288 /* VMEM/LDS with the wrong waitcnt in-between */
289 //! p_unit_test 11
290 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
291 //! s_waitcnt_vscnt %0:null imm:0
292 //! s_waitcnt_depctr vm_vsrc(0)
293 //! s1: %0:s[0] = s_mov_b32 0
294 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
295 create_mubuf(0);
296 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
297 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
298
299 //! p_unit_test 12
300 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
301 //! s_waitcnt lgkmcnt(0)
302 //! s_waitcnt_depctr vm_vsrc(0)
303 //! s1: %0:s[0] = s_mov_b32 0
304 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
305 create_mubuf_store();
306 bld.sopp(aco_opcode::s_waitcnt, 0xc07f);
307 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
308
309 //! p_unit_test 13
310 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
311 //! s_waitcnt vmcnt(0)
312 //! s_waitcnt_depctr vm_vsrc(0)
313 //! s1: %0:m0 = s_mov_b32 0
314 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
315 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
316 Operand(m0, s1));
317 bld.sopp(aco_opcode::s_waitcnt, 0x3f70);
318 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
319
320 finish_insert_nops_test();
321 END_TEST
322
/* Test insert_nops.lds_direct_valu, run once for GFX11 and once for GFX12
 * (a level is skipped when setup_cs() fails for it).
 *
 * Every case emits a p_unit_test marker, one or more VALU instructions and
 * then an lds_direct_load into PhysReg(256) that reads m0. The cases vary
 * which registers the preceding VALU instructions read or write, how many
 * v_nop instructions sit in between, whether a v_sqrt_f32 is involved, and
 * whether a scalar move or an s_waitcnt_depctr precedes the load.
 * finish_insert_nops_test() is called at the end of each loop iteration.
 *
 * NOTE(review): the comment lines that begin with two slashes and a marker
 * ('!', '>>' or ';') look like expected-output patterns for the test harness
 * and presumably must stay in sync with the code below them - confirm against
 * the test framework before editing either.
 */
323 BEGIN_TEST(insert_nops.lds_direct_valu)
324 for (amd_gfx_level gfx : {GFX11, GFX12}) {
325 if (!setup_cs(NULL, gfx))
326 continue;
327
328 /* WaW */
329 //>> p_unit_test 0
330 //! v1: %0:v[0] = v_mov_b32 0
331 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
332 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
333 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
334 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
335
336 /* WaR */
337 //! p_unit_test 1
338 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
339 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
340 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
341 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
342 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
343
344 /* No hazard. */
345 //! p_unit_test 2
346 //! v1: %0:v[1] = v_mov_b32 0
347 //! v1: %0:v[0] = lds_direct_load %0:m0
348 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
349 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero());
350 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
351
352 /* multiple hazards, nearest should be considered */
353 //! p_unit_test 3
354 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
355 //! v1: %0:v[0] = v_mov_b32 0
356 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
357 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
358 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
359 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
360 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
361
362 /* independent VALU increase wait_vdst */
363 //! p_unit_test 4
364 //! v1: %0:v[0] = v_mov_b32 0
365 //! v_nop
366 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
367 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
368 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
369 bld.vop1(aco_opcode::v_nop);
370 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
371
372 //! p_unit_test 5
373 //! v1: %0:v[0] = v_mov_b32 0
374 //; for i in range(10): insert_pattern('v_nop')
375 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10
376 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
377 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
378 for (unsigned i = 0; i < 10; i++)
379 bld.vop1(aco_opcode::v_nop);
380 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
381
382 //! p_unit_test 6
383 //! v1: %0:v[0] = v_mov_b32 0
384 //; for i in range(20): insert_pattern('v_nop')
385 //! v1: %0:v[0] = lds_direct_load %0:m0
386 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
387 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
388 for (unsigned i = 0; i < 20; i++)
389 bld.vop1(aco_opcode::v_nop);
390 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
391
392 /* transcendental requires wait_vdst=0 */
393 //! p_unit_test 7
394 //! v1: %0:v[0] = v_mov_b32 0
395 //! v_nop
396 //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
397 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
398 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
399 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
400 bld.vop1(aco_opcode::v_nop);
401 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
402 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
403
404 //! p_unit_test 8
405 //! v1: %0:v[0] = v_sqrt_f32 %0:v[0]
406 //! v_nop
407 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
408 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
409 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
410 bld.vop1(aco_opcode::v_nop);
411 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
412
413 /* transcendental is fine if it's before the instruction */
414 //! p_unit_test 9
415 //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
416 //! v1: %0:v[0] = v_mov_b32 0
417 //! v_nop
418 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
419 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
420 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
421 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
422 bld.vop1(aco_opcode::v_nop);
423 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
424
425 /* non-VALU does not increase wait_vdst */
426 //! p_unit_test 10
427 //! v1: %0:v[0] = v_mov_b32 0
428 //! s1: %0:m0 = s_mov_b32 0
429 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
430 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
431 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
432 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
433 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
434
435 /* consider instructions which wait on vdst */
436 //! p_unit_test 11
437 //! v1: %0:v[0] = v_mov_b32 0
438 //! v_nop
439 //! s_waitcnt_depctr va_vdst(0)
440 //! v1: %0:v[0] = lds_direct_load %0:m0
441 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
442 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
443 bld.vop1(aco_opcode::v_nop);
444 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
445 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
446
447 finish_insert_nops_test();
448 }
449 END_TEST
450
/* Test insert_nops.lds_direct_vmem, run once for GFX11 and once for GFX12
 * (a level is skipped when setup_cs() fails for it).
 *
 * Every case emits a p_unit_test marker, a memory instruction (buffer load or
 * store, ds_read_b32, or on GFX12 also image_load / image_sample / the BVH
 * instruction from create_bvh()) and then an lds_direct_load into
 * PhysReg(256). Cases 0-7 cover the hazard and no-hazard register overlaps,
 * cases 8-13 put the matching wait instruction in between, and cases 14-20
 * put a non-matching wait instruction in between. Where the two levels use
 * different wait opcodes the code branches on gfx >= GFX12; cases 11-13 and
 * 18-20 are only emitted for GFX12.
 * finish_insert_nops_test() is called at the end of each loop iteration.
 *
 * NOTE(review): the comment lines that begin with two slashes and a marker
 * ('!', '>>' or '~gfxNN!') look like expected-output patterns for the test
 * harness, the '~' form presumably restricting a pattern to one gfx level;
 * they presumably must stay in sync with the code below them - confirm
 * against the test framework before editing either.
 */
451 BEGIN_TEST(insert_nops.lds_direct_vmem)
452 for (amd_gfx_level gfx : {GFX11, GFX12}) {
453 if (!setup_cs(NULL, gfx))
454 continue;
455
456 /* WaR: VMEM */
457 //>> p_unit_test 0
458 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
459 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
460 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
461 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
462 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
463 create_mubuf(0, PhysReg(257));
464 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
465
466 /* WaW: VMEM */
467 //! p_unit_test 1
468 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
469 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
470 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
471 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
472 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
473 create_mubuf(0, PhysReg(256), PhysReg(257));
474 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
475
476 /* no hazard: VMEM */
477 //! p_unit_test 2
478 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
479 //! v1: %0:v[0] = lds_direct_load %0:m0
480 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
481 create_mubuf(0, PhysReg(257), PhysReg(257));
482 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
483
484 /* no hazard: VMEM with VALU in-between */
485 //! p_unit_test 3
486 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
487 //! v_nop
488 //! v1: %0:v[0] = lds_direct_load %0:m0
489 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
490 create_mubuf(0, PhysReg(257));
491 bld.vop1(aco_opcode::v_nop);
492 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
493
494 /* WaR: LDS */
495 //! p_unit_test 4
496 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
497 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
498 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
499 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
500 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
501 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
502 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
503
504 /* WaW: LDS */
505 //! p_unit_test 5
506 //! v1: %0:v[0] = ds_read_b32 %0:v[1]
507 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
508 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
509 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
510 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
511 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
512 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
513
514 /* no hazard: LDS */
515 //! p_unit_test 6
516 //! v1: %0:v[1] = ds_read_b32 %0:v[1]
517 //! v1: %0:v[0] = lds_direct_load %0:m0
518 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
519 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
520 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
521
522 /* no hazard: LDS with VALU in-between */
523 //! p_unit_test 7
524 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
525 //! v_nop
526 //! v1: %0:v[0] = lds_direct_load %0:m0
527 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
528 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
529 bld.vop1(aco_opcode::v_nop);
530 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
531
532 /* no hazard: VMEM/LDS with the correct waitcnt in-between */
533 //! p_unit_test 8
534 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
535 //~gfx11! s_waitcnt vmcnt(0)
536 //~gfx12! s_wait_loadcnt imm:0
537 //! v1: %0:v[0] = lds_direct_load %0:m0
538 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
539 create_mubuf(0, PhysReg(257));
540 if (gfx >= GFX12)
541 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
542 else
543 bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
544 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
545
546 //! p_unit_test 9
547 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
548 //~gfx11! s_waitcnt_vscnt %0:null imm:0
549 //~gfx12! s_wait_storecnt imm:0
550 //! v1: %0:v[0] = lds_direct_load %0:m0
551 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
552 create_mubuf_store();
553 if (gfx >= GFX12)
554 bld.sopp(aco_opcode::s_wait_storecnt, 0);
555 else
556 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
557 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
558
559 //! p_unit_test 10
560 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
561 //~gfx11! s_waitcnt lgkmcnt(0)
562 //~gfx12! s_wait_dscnt imm:0
563 //! v1: %0:v[0] = lds_direct_load %0:m0
564 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
565 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
566 if (gfx >= GFX12)
567 bld.sopp(aco_opcode::s_wait_dscnt, 0);
568 else
569 bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
570 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
571
572 if (gfx >= GFX12) {
573 //~gfx12! p_unit_test 11
574 //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
575 //~gfx12! s_wait_loadcnt imm:0
576 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
577 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
578 Instruction* instr =
579 bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
580 Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
581 .instr;
582 instr->mimg().dmask = 0x1;
583 instr->mimg().dim = ac_image_2d;
584 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
585 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
586
587 //~gfx12! p_unit_test 12
588 //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
589 //~gfx12! s_wait_samplecnt imm:0
590 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
591 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
592 instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
593 Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
594 Operand(PhysReg(256), v2))
595 .instr;
596 instr->mimg().dmask = 0x1;
597 instr->mimg().dim = ac_image_2d;
598 bld.sopp(aco_opcode::s_wait_samplecnt, 0);
599 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
600
601 //~gfx12! p_unit_test 13
602 //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
603 //~gfx12! s_wait_bvhcnt imm:0
604 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
605 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
606 create_bvh();
607 bld.sopp(aco_opcode::s_wait_bvhcnt, 0);
608 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
609 }
610
611 /* VMEM/LDS with the wrong waitcnt in-between */
612 //! p_unit_test 14
613 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
614 //~gfx11! s_waitcnt_vscnt %0:null imm:0
615 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
616 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
617 //~gfx12! s_wait_storecnt imm:0
618 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
619 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
620 create_mubuf(0, PhysReg(257));
621 if (gfx >= GFX12)
622 bld.sopp(aco_opcode::s_wait_storecnt, 0);
623 else
624 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
625 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
626
627 //! p_unit_test 15
628 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
629 //~gfx11! s_waitcnt lgkmcnt(0)
630 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
631 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
632 //~gfx12! s_wait_dscnt imm:0
633 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
634 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15));
635 create_mubuf_store();
636 if (gfx >= GFX12)
637 bld.sopp(aco_opcode::s_wait_dscnt, 0);
638 else
639 bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
640 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
641
642 //! p_unit_test 16
643 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
644 //~gfx11! s_waitcnt vmcnt(0)
645 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
646 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
647 //~gfx12! s_wait_loadcnt imm:0
648 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
649 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16));
650 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
651 if (gfx >= GFX12)
652 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
653 else
654 bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
655 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
656
657 //! p_unit_test 17
658 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
659 //~gfx11! s_waitcnt_vscnt %0:null imm:0
660 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
661 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
662 //~gfx12! s_wait_storecnt imm:0
663 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
664 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17));
665 create_mubuf(0, PhysReg(256), PhysReg(257));
666 if (gfx >= GFX12)
667 bld.sopp(aco_opcode::s_wait_storecnt, 0);
668 else
669 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
670 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
671
672 if (gfx >= GFX12) {
673 //~gfx12! p_unit_test 18
674 //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
675 //~gfx12! s_wait_samplecnt imm:0
676 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
677 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18));
678 Instruction* instr =
679 bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
680 Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
681 .instr;
682 instr->mimg().dmask = 0x1;
683 instr->mimg().dim = ac_image_2d;
684 bld.sopp(aco_opcode::s_wait_samplecnt, 0);
685 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
686
687 //~gfx12! p_unit_test 19
688 //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
689 //~gfx12! s_wait_loadcnt imm:0
690 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
691 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19));
692 instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
693 Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
694 Operand(PhysReg(256), v2))
695 .instr;
696 instr->mimg().dmask = 0x1;
697 instr->mimg().dim = ac_image_2d;
698 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
699 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
700
701 //~gfx12! p_unit_test 20
702 //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
703 //~gfx12! s_wait_loadcnt imm:0
704 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
705 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20));
706 create_bvh();
707 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
708 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
709 }
710
711 finish_insert_nops_test();
712 }
713 END_TEST
714
715 BEGIN_TEST(insert_nops.valu_trans_use)
716 if (!setup_cs(NULL, GFX11))
717 return;
718
719 //>> p_unit_test 0
720 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
721 //! s_waitcnt_depctr va_vdst(0)
722 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
723 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
724 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
725 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
726
727 /* Sufficient VALU mitigates the hazard. */
728 //! p_unit_test 1
729 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
730 //; for i in range(4): insert_pattern('v_nop')
731 //! s_waitcnt_depctr va_vdst(0)
732 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
733 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
734 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
735 for (unsigned i = 0; i < 4; i++)
736 bld.vop1(aco_opcode::v_nop);
737 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
738
739 //! p_unit_test 2
740 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
741 //; for i in range(8): insert_pattern('v_nop')
742 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
743 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
744 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
745 for (unsigned i = 0; i < 8; i++)
746 bld.vop1(aco_opcode::v_nop);
747 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
748
749 /* Sufficient transcendental VALU mitigates the hazard. */
750 //! p_unit_test 3
751 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
752 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
753 //! s_waitcnt_depctr va_vdst(0)
754 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
755 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
756 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
757 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
758 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
759
760 //! p_unit_test 4
761 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
762 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
763 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
764 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
765 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
766 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
767 for (unsigned i = 0; i < 2; i++)
768 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
769 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
770
771 /* Transcendental VALU should be counted towards VALU */
772 //! p_unit_test 5
773 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
774 //; for i in range(5): insert_pattern('v_nop')
775 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
776 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
777 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
778 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
779 for (unsigned i = 0; i < 5; i++)
780 bld.vop1(aco_opcode::v_nop);
781 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
782 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
783
784 /* non-VALU does not mitigate the hazard. */
785 //! p_unit_test 6
786 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
787 //; for i in range(8): insert_pattern('s_nop')
788 //! s_waitcnt_depctr va_vdst(0)
789 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
790 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
791 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
792 for (unsigned i = 0; i < 8; i++)
793 bld.sopp(aco_opcode::s_nop, 0);
794 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
795
796 finish_insert_nops_test();
797 END_TEST
798
799 BEGIN_TEST(insert_nops.valu_partial_forwarding.basic)
800 if (!setup_cs(NULL, GFX11))
801 return;
802
803 /* Basic case. */
804 //>> p_unit_test 0
805 //! v1: %0:v[0] = v_mov_b32 0
806 //! s2: %0:exec = s_mov_b64 -1
807 //! v1: %0:v[1] = v_mov_b32 1
808 //! s_waitcnt_depctr va_vdst(0)
809 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
810 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
811 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
812 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
813 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
814 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
815 Operand(PhysReg(257), v1));
816
817 /* We should consider both the closest and further VALU after the exec write. */
818 //! p_unit_test 1
819 //! v1: %0:v[0] = v_mov_b32 0
820 //! s2: %0:exec = s_mov_b64 -1
821 //! v1: %0:v[1] = v_mov_b32 1
822 //; for i in range(2): insert_pattern('v_nop')
823 //! v1: %0:v[2] = v_mov_b32 2
824 //! s_waitcnt_depctr va_vdst(0)
825 //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
826 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
827 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
828 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
829 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
830 bld.vop1(aco_opcode::v_nop);
831 bld.vop1(aco_opcode::v_nop);
832 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
833 bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
834 Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
835
836 //! p_unit_test 2
837 //! v1: %0:v[0] = v_mov_b32 0
838 //! s2: %0:exec = s_mov_b64 -1
839 //! v1: %0:v[1] = v_mov_b32 1
840 //! v1: %0:v[2] = v_mov_b32 2
841 //; for i in range(4): insert_pattern('v_nop')
842 //! s_waitcnt_depctr va_vdst(0)
843 //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
844 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
845 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
846 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
847 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
848 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
849 for (unsigned i = 0; i < 4; i++)
850 bld.vop1(aco_opcode::v_nop);
851 bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
852 Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
853
854 /* If a VALU writes a read VGPR in-between the first and second writes, it should still be
855 * counted towards the distance between the first and second writes.
856 */
857 //! p_unit_test 3
858 //! v1: %0:v[0] = v_mov_b32 0
859 //! s2: %0:exec = s_mov_b64 -1
860 //! v1: %0:v[1] = v_mov_b32 1
861 //; for i in range(2): insert_pattern('v_nop')
862 //! v1: %0:v[2] = v_mov_b32 2
863 //; for i in range(3): insert_pattern('v_nop')
864 //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
865 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
866 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
867 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
868 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
869 bld.vop1(aco_opcode::v_nop);
870 bld.vop1(aco_opcode::v_nop);
871 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
872 for (unsigned i = 0; i < 3; i++)
873 bld.vop1(aco_opcode::v_nop);
874 bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
875 Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
876
877 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
878
879 finish_insert_nops_test();
880 END_TEST
881
882 BEGIN_TEST(insert_nops.valu_partial_forwarding.multiple_exec_writes)
883 if (!setup_cs(NULL, GFX11))
884 return;
885
886 //>> p_unit_test 0
887 //! v1: %0:v[0] = v_mov_b32 0
888 //! s2: %0:exec = s_mov_b64 0
889 //! s2: %0:exec = s_mov_b64 -1
890 //! v1: %0:v[1] = v_mov_b32 1
891 //! s_waitcnt_depctr va_vdst(0)
892 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
893 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
894 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
895 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
896 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
897 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
898 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
899 Operand(PhysReg(257), v1));
900
901 //! p_unit_test 1
902 //! v1: %0:v[0] = v_mov_b32 0
903 //! s2: %0:exec = s_mov_b64 0
904 //! v1: %0:v[1] = v_mov_b32 1
905 //! s2: %0:exec = s_mov_b64 -1
906 //! s_waitcnt_depctr va_vdst(0)
907 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
908 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
909 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
910 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
911 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
912 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
913 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
914 Operand(PhysReg(257), v1));
915
916 finish_insert_nops_test();
917 END_TEST
918
919 BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow)
920 if (!setup_cs(NULL, GFX11))
921 return;
922
923 /* Control flow merges: one branch shouldn't interfere with the other (clobbering VALU closer
924 * than interesting one).
925 */
926 //>> p_unit_test 0
927 //! s_cbranch_scc1 block:BB2
928 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0u));
929 bld.sopp(aco_opcode::s_cbranch_scc1, 2);
930
931 //! BB1
932 //! /* logical preds: / linear preds: BB0, / kind: */
933 //! v1: %0:v[0] = v_mov_b32 0
934 //! s2: %0:exec = s_mov_b64 -1
935 //! v_nop
936 //! s_branch block:BB3
937 bld.reset(program->create_and_insert_block());
938 program->blocks[0].linear_succs.push_back(1);
939 program->blocks[1].linear_preds.push_back(0);
940 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
941 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
942 bld.vop1(aco_opcode::v_nop);
943 bld.sopp(aco_opcode::s_branch, 3);
944
945 //! BB2
946 //! /* logical preds: / linear preds: BB0, / kind: */
947 //! v1: %0:v[0] = v_mov_b32 0
948 bld.reset(program->create_and_insert_block());
949 program->blocks[0].linear_succs.push_back(2);
950 program->blocks[2].linear_preds.push_back(0);
951 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
952
953 //! BB3
954 //! /* logical preds: / linear preds: BB1, BB2, / kind: */
955 //! v1: %0:v[1] = v_mov_b32 1
956 //! s_waitcnt_depctr va_vdst(0)
957 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
958 bld.reset(program->create_and_insert_block());
959 program->blocks[1].linear_succs.push_back(3);
960 program->blocks[2].linear_succs.push_back(3);
961 program->blocks[3].linear_preds.push_back(1);
962 program->blocks[3].linear_preds.push_back(2);
963 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
964 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
965 Operand(PhysReg(257), v1));
966
967 /* Control flow merges: one branch shouldn't interfere with the other (should consider furthest
968 * VALU writes after exec).
969 */
970 //! p_unit_test 1
971 //! s_cbranch_scc1 block:BB5
972 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
973 bld.sopp(aco_opcode::s_cbranch_scc1, 5);
974
975 //! BB4
976 //! /* logical preds: / linear preds: BB3, / kind: */
977 //! v1: %0:v[0] = v_mov_b32 0
978 //! s2: %0:exec = s_mov_b64 -1
979 //; for i in range(2): insert_pattern('v_nop')
980 //! v1: %0:v[1] = v_mov_b32 1
981 //! v_nop
982 //! s_branch block:BB6
983 bld.reset(program->create_and_insert_block());
984 program->blocks[3].linear_succs.push_back(4);
985 program->blocks[4].linear_preds.push_back(3);
986 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
987 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
988 bld.vop1(aco_opcode::v_nop);
989 bld.vop1(aco_opcode::v_nop);
990 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
991 bld.vop1(aco_opcode::v_nop);
992 bld.sopp(aco_opcode::s_branch, 6);
993
994 //! BB5
995 //! /* logical preds: / linear preds: BB3, / kind: */
996 //! v1: %0:v[1] = v_mov_b32 1
997 bld.reset(program->create_and_insert_block());
998 program->blocks[3].linear_succs.push_back(5);
999 program->blocks[5].linear_preds.push_back(3);
1000 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1001
1002 //! BB6
1003 //! /* logical preds: / linear preds: BB4, BB5, / kind: */
1004 //! s_waitcnt_depctr va_vdst(0)
1005 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
1006 bld.reset(program->create_and_insert_block());
1007 program->blocks[4].linear_succs.push_back(6);
1008 program->blocks[5].linear_succs.push_back(6);
1009 program->blocks[6].linear_preds.push_back(4);
1010 program->blocks[6].linear_preds.push_back(5);
1011 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
1012 Operand(PhysReg(257), v1));
1013
1014 /* Control flow merges: one branch shouldn't interfere with the other (should consider closest
1015 * VALU writes after exec).
1016 */
1017 //! p_unit_test 2
1018 //! s_cbranch_scc1 block:BB8
1019 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
1020 bld.sopp(aco_opcode::s_cbranch_scc1, 8);
1021
1022 //! BB7
1023 //! /* logical preds: / linear preds: BB6, / kind: */
1024 //! v1: %0:v[0] = v_mov_b32 0
1025 //! s2: %0:exec = s_mov_b64 -1
1026 //! v1: %0:v[1] = v_mov_b32 1
1027 //; for i in range(4): insert_pattern('v_nop')
1028 //! s_branch block:BB9
1029 bld.reset(program->create_and_insert_block());
1030 program->blocks[6].linear_succs.push_back(7);
1031 program->blocks[7].linear_preds.push_back(6);
1032 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1033 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
1034 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1035 for (unsigned i = 0; i < 4; i++)
1036 bld.vop1(aco_opcode::v_nop);
1037 bld.sopp(aco_opcode::s_branch, 9);
1038
1039 //! BB8
1040 //! /* logical preds: / linear preds: BB6, / kind: */
1041 //! v1: %0:v[1] = v_mov_b32 1
1042 //; for i in range(5): insert_pattern('v_nop')
1043 bld.reset(program->create_and_insert_block());
1044 program->blocks[6].linear_succs.push_back(8);
1045 program->blocks[8].linear_preds.push_back(6);
1046 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1047 for (unsigned i = 0; i < 5; i++)
1048 bld.vop1(aco_opcode::v_nop);
1049
1050 //! BB9
1051 //! /* logical preds: / linear preds: BB7, BB8, / kind: uniform, */
1052 //! s_waitcnt_depctr va_vdst(0)
1053 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
1054 bld.reset(program->create_and_insert_block());
1055 program->blocks[7].linear_succs.push_back(9);
1056 program->blocks[8].linear_succs.push_back(9);
1057 program->blocks[9].linear_preds.push_back(7);
1058 program->blocks[9].linear_preds.push_back(8);
1059 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
1060 Operand(PhysReg(257), v1));
1061
1062 finish_insert_nops_test();
1063 END_TEST
1064
1065 BEGIN_TEST(insert_nops.valu_mask_write)
1066 if (!setup_cs(NULL, GFX11))
1067 return;
1068
1069 /* Basic case. */
1070 //>> p_unit_test 0
1071 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1072 //! s1: %0:s[1] = s_mov_b32 0
1073 //! s_waitcnt_depctr sa_sdst(0)
1074 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1075 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1076 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1077 Operand::zero(), Operand(PhysReg(0), s2));
1078 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1079 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1080
1081 /* Mitigation. */
1082 //! p_unit_test 1
1083 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1084 //! v1: %0:v[1] = v_mov_b32 %0:s[1]
1085 //! s1: %0:s[1] = s_mov_b32 0
1086 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1087 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1088 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1089 Operand::zero(), Operand(PhysReg(0), s2));
1090 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
1091 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1092 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1093
1094 //! p_unit_test 2
1095 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1096 //! s1: %0:s[1] = s_mov_b32 0
1097 //! s_waitcnt_depctr sa_sdst(0)
1098 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1099 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1100 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1101 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1102 Operand::zero(), Operand(PhysReg(0), s2));
1103 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1104 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1105 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1106
1107 //! p_unit_test 3
1108 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1109 //! s1: %0:s[1] = s_mov_b32 0
1110 //! s_waitcnt_depctr sa_sdst(0)
1111 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1112 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1113 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1114 Operand::zero(), Operand(PhysReg(0), s2));
1115 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1116 bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
1117 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1118
1119 /* v_cndmask_b32 is both involved in the hazard and is a mitigation. */
1120 //! p_unit_test 4
1121 //! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1]
1122 //! s1: %0:s[1] = s_mov_b32 0
1123 //! s_waitcnt_depctr sa_sdst(0)
1124 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1125 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1126 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand(PhysReg(2), s1),
1127 Operand::zero(), Operand(PhysReg(0), s2));
1128 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1129 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1130
1131 /* VALU reading exec does not mitigate the hazard. We also don't consider literals. */
1132 //! p_unit_test 5
1133 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1134 //! v1: %0:v[1] = v_mov_b32 %0:exec_lo
1135 //! s1: %0:s[1] = s_mov_b32 0
1136 //! s_waitcnt_depctr sa_sdst(0)
1137 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1138 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1139 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1140 Operand::zero(), Operand(PhysReg(0), s2));
1141 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(exec_lo, s1));
1142 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1143 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1144
1145 //! p_unit_test 6
1146 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1147 //! v1: %0:v[1] = v_mov_b32 0x200
1148 //! s1: %0:s[1] = s_mov_b32 0
1149 //! s_waitcnt_depctr sa_sdst(0)
1150 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1151 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1152 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1153 Operand::zero(), Operand(PhysReg(0), s2));
1154 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::literal32(0x200));
1155 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1156 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1157
1158 /* Basic case: VALU. */
1159 //! p_unit_test 7
1160 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1161 //! s1: %0:s[1] = s_mov_b32 0
1162 //! s_waitcnt_depctr sa_sdst(0)
1163 //! v1: %0:v[1] = v_mov_b32 %0:s[1]
1164 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1165 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1166 Operand::zero(), Operand(PhysReg(0), s2));
1167 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1168 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
1169
1170 /* SALU which both reads and writes a lane mask SGPR. */
1171 //! p_unit_test 8
1172 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1173 //! s1: %0:s[1] = s_mov_b32 0
1174 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1175 //! s_waitcnt_depctr sa_sdst(0)
1176 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1177 //! s_waitcnt_depctr sa_sdst(0)
1178 //! s1: %0:s[4] = s_mov_b32 %0:s[2]
1179 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1180 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1181 Operand::zero(), Operand(PhysReg(0), s2));
1182 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1183 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1184 Operand::zero(), Operand(PhysReg(2), s2));
1185 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1186 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(2), s1));
1187
1188 /* When a SALU writes a lane mask, we shouldn't forget the current SGPRs used as lane masks then
1189 * written. */
1190 //! p_unit_test 9
1191 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1192 //! s1: %0:s[0] = s_mov_b32 0
1193 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1194 //! s1: %0:s[2] = s_mov_b32 0
1195 //! s_waitcnt_depctr sa_sdst(0)
1196 //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1197 //! s1: %0:s[5] = s_mov_b32 %0:s[2]
1198 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1199 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1200 Operand::zero(), Operand(PhysReg(0), s2));
1201 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1202 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1203 Operand::zero(), Operand(PhysReg(2), s2));
1204 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand::zero());
1205 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1206 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(2), s1));
1207
1208 /* When a SALU writes a lane mask, we shouldn't forget all SGPRs used as lane masks, there might
1209 * be later problematic writes. */
1210 //! p_unit_test 10
1211 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1212 //! s1: %0:s[0] = s_mov_b32 0
1213 //! s_waitcnt_depctr sa_sdst(0)
1214 //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1215 //! s1: %0:s[1] = s_mov_b32 0
1216 //! s_waitcnt_depctr sa_sdst(0)
1217 //! s1: %0:s[5] = s_mov_b32 %0:s[1]
1218 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
1219 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1220 Operand::zero(), Operand(PhysReg(0), s2));
1221 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1222 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1223 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1224 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(1), s1));
1225
1226 //! p_unit_test 11
1227 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1228 //! s1: %0:s[0] = s_mov_b32 0
1229 //! s_waitcnt_depctr sa_sdst(0)
1230 //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1231 //! s1: %0:s[0] = s_mov_b32 0
1232 //! s_waitcnt_depctr sa_sdst(0)
1233 //! s1: %0:s[5] = s_mov_b32 %0:s[0]
1234 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
1235 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1236 Operand::zero(), Operand(PhysReg(0), s2));
1237 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1238 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1239 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1240 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(0), s1));
1241
1242 //! p_unit_test 12
1243 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
1244
1245 //! BB1
1246 //! /* logical preds: / linear preds: BB0, / kind: */
1247 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1248 bld.reset(program->create_and_insert_block());
1249 program->blocks[0].linear_succs.push_back(1);
1250 program->blocks[1].linear_preds.push_back(0);
1251 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1252 Operand::zero(), Operand(PhysReg(0), s2));
1253
1254 //! BB2
1255 //! /* logical preds: / linear preds: BB0, / kind: */
1256 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1257 bld.reset(program->create_and_insert_block());
1258 program->blocks[0].linear_succs.push_back(2);
1259 program->blocks[2].linear_preds.push_back(0);
1260 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1261 Operand::zero(), Operand(PhysReg(2), s2));
1262
1263 //! BB3
1264 //! /* logical preds: / linear preds: BB1, BB2, / kind: uniform, */
1265 //! s1: %0:s[0] = s_mov_b32 0
1266 //! s_waitcnt_depctr sa_sdst(0)
1267 //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1268 //! s1: %0:s[2] = s_mov_b32 0
1269 //! s_waitcnt_depctr sa_sdst(0)
1270 //! s1: %0:s[5] = s_mov_b32 %0:s[2]
1271 bld.reset(program->create_and_insert_block());
1272 program->blocks[1].linear_succs.push_back(3);
1273 program->blocks[2].linear_succs.push_back(3);
1274 program->blocks[3].linear_preds.push_back(1);
1275 program->blocks[3].linear_preds.push_back(2);
1276 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1277 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1278 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand::zero());
1279 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(2), s1));
1280
1281 finish_insert_nops_test();
1282 END_TEST
1283
1284 BEGIN_TEST(insert_nops.wmma_raw)
1285 if (!setup_cs(NULL, GFX11))
1286 return;
1287
1288 /* Basic case. */
1289 //>> p_unit_test 0
1290 //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1291 //! v_nop
1292 //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[16-23].xx, %_:v[48-51].xx
1293 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1294 Operand A(PhysReg(256 + 0), v8);
1295 Operand B(PhysReg(256 + 8), v8);
1296 Operand C(PhysReg(256 + 20), v4);
1297 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1298 0);
1299 A.setFixed(PhysReg(256 + 24));
1300 B.setFixed(PhysReg(256 + 16));
1301 C.setFixed(PhysReg(256 + 48));
1302 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1303 0);
1304
1305 /* Mitigation. */
1306 //! p_unit_test 1
1307 //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1308 //! v1: %_:v[56] = v_rcp_f32 0
1309 //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[16-23].xx, %_:v[48-51].xx
1310 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1311 A.setFixed(PhysReg(256 + 0));
1312 B.setFixed(PhysReg(256 + 8));
1313 C.setFixed(PhysReg(256 + 20));
1314 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1315 0);
1316 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256 + 56), v1), Operand::zero());
1317 A.setFixed(PhysReg(256 + 24));
1318 B.setFixed(PhysReg(256 + 16));
1319 C.setFixed(PhysReg(256 + 48));
1320 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1321 0);
1322
1323 /* No hazard. */
1324 //>> p_unit_test 2
1325 //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1326 //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[32-39].xx, %_:v[48-51].xx
1327 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1328 A.setFixed(PhysReg(256 + 0));
1329 B.setFixed(PhysReg(256 + 8));
1330 C.setFixed(PhysReg(256 + 20));
1331 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1332 0);
1333 A.setFixed(PhysReg(256 + 24));
1334 B.setFixed(PhysReg(256 + 32));
1335 C.setFixed(PhysReg(256 + 48));
1336 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1337 0);
1338
1339 //>> p_unit_test 3
1340 //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1341 //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[32-39].xx, %_:v[20-23].xx
1342 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1343 A.setFixed(PhysReg(256 + 0));
1344 B.setFixed(PhysReg(256 + 8));
1345 C.setFixed(PhysReg(256 + 20));
1346 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1347 0);
1348 A.setFixed(PhysReg(256 + 24));
1349 B.setFixed(PhysReg(256 + 32));
1350 C.setFixed(PhysReg(256 + 20));
1351 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1352 0);
1353
1354 finish_insert_nops_test();
1355 END_TEST
1356
/* Bit flags OR-ed together in StageInfo::flags by insert_nops.export_priority.stages. */
enum StageInfoFlags {
   stage_separate = 0x01,   /* sets program->info.merged_shader_compiled_separately */
   stage_has_prolog = 0x02, /* sets program->info.vs.has_prolog */
   stage_has_export = 0x04, /* the test emits an exp instead of a v_nop */
   stage_is_prolog = 0x08,  /* sets program->is_prolog */
   stage_is_epilog = 0x10,  /* sets program->is_epilog */
};
1364
1365 struct StageInfo {
1366 const char* name;
1367 Stage stage;
1368 unsigned flags;
1369 };
1370
1371 BEGIN_TEST(insert_nops.export_priority.stages)
1372 Stage geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::GS);
1373 for (StageInfo stage : (StageInfo[]){
1374 {"_fs_first_last", fragment_fs, stage_has_export},
1375 {"_fs_with_epilog_first", fragment_fs, 0},
1376 {"_fs_prolog_first", fragment_fs, stage_is_prolog},
1377 {"_fs_epilog_last", fragment_fs, stage_is_epilog | stage_has_export},
1378 {"_vs_first_last", vertex_ngg, stage_has_export},
1379 {"_vs_with_prolog_last", vertex_ngg, stage_has_export | stage_has_prolog},
1380 {"_tes_first_last", tess_eval_ngg, stage_has_export},
1381 {"_ms_first_last", mesh_ngg, stage_has_export},
1382 {"_tesgs_first_last", tess_eval_geometry_ngg, stage_has_export},
1383 {"_vsgs_first_last", vertex_geometry_ngg, stage_has_export},
1384 {"_vsgs_with_prolog_last", vertex_geometry_ngg, stage_has_export | stage_has_prolog},
1385 {"_separate_vs_first", vertex_ngg, stage_separate},
1386 {"_separate_vs_with_prolog", vertex_ngg, stage_separate | stage_has_prolog},
1387 {"_separate_tes_first", tess_eval_ngg, stage_separate},
1388 {"_separate_gs_last", geometry_ngg, stage_separate | stage_has_export}}) {
1389 if (!setup_cs(NULL, GFX11_5, CHIP_UNKNOWN, stage.name))
1390 continue;
1391
1392 program->stage = stage.stage;
1393 program->info.merged_shader_compiled_separately = stage.flags & stage_separate;
1394 program->info.vs.has_prolog = stage.flags & stage_has_prolog;
1395 program->is_prolog = stage.flags & stage_is_prolog;
1396 program->is_epilog = stage.flags & stage_is_epilog;
1397 //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1398 //~.*first.*! s_setprio imm:2
1399 if (stage.flags & stage_has_export) {
1400 //~.*last.*! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1401 //~.*last.*! s_setprio imm:0
1402 //~.*last.*! s_nop
1403 //~.*last.*! s_nop
1404 //~.*last.*! s_endpgm
1405 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1406 V_008DFC_SQ_EXP_POS, false);
1407 } else {
1408 //(?!.*last.*)! v_nop
1409 bld.vop1(aco_opcode::v_nop);
1410 }
1411
1412 finish_insert_nops_test(stage.flags & stage_has_export);
1413 }
1414 END_TEST
1415
1416 BEGIN_TEST(insert_nops.export_priority.instrs_after_export)
1417 if (!setup_cs(NULL, GFX11_5))
1418 return;
1419
1420 program->stage = vertex_ngg;
1421 //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1422 //! s_setprio imm:2
1423 //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1424 //! s_setprio imm:0
1425 //! s_waitcnt_expcnt %0:null imm:0
1426 //! s_nop
1427 //! s_nop
1428 //! s_setprio imm:2
1429 //! v_nop
1430 //! s_endpgm
1431 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1432 V_008DFC_SQ_EXP_POS, false);
1433 bld.vop1(aco_opcode::v_nop);
1434
1435 finish_insert_nops_test();
1436 END_TEST
1437
1438 BEGIN_TEST(insert_nops.export_priority.fallthrough_to_endpgm)
1439 if (!setup_cs(NULL, GFX11_5))
1440 return;
1441
1442 program->stage = vertex_ngg;
1443 //>> /* logical preds: / linear preds: / kind: top-level, */
1444 //! s_setprio imm:2
1445 //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1446 //! s_setprio imm:0
1447 //! s_nop
1448 //! s_nop
1449 //>> BB1
1450 //>> /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
1451 //! s_endpgm
1452 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1453 V_008DFC_SQ_EXP_POS, false);
1454
1455 bld.reset(program->create_and_insert_block());
1456 program->blocks[0].linear_succs.push_back(1);
1457 program->blocks[0].logical_succs.push_back(1);
1458 program->blocks[1].linear_preds.push_back(0);
1459 program->blocks[1].logical_preds.push_back(0);
1460
1461 finish_insert_nops_test();
1462 END_TEST
1463
1464 BEGIN_TEST(insert_nops.export_priority.multiple_exports)
1465 if (!setup_cs(NULL, GFX11_5))
1466 return;
1467
1468 program->stage = vertex_ngg;
1469 //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1470 //! s_setprio imm:2
1471 //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1472 //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos1
1473 //! s_setprio imm:0
1474 //! s_nop
1475 //! s_nop
1476 //! s_endpgm
1477 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1478 V_008DFC_SQ_EXP_POS, false);
1479 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1480 V_008DFC_SQ_EXP_POS + 1, false);
1481
1482 finish_insert_nops_test();
1483 END_TEST
1484
1485 BEGIN_TEST(insert_nops.export_priority.set_prio)
1486 if (!setup_cs(NULL, GFX11_5))
1487 return;
1488
1489 program->stage = vertex_ngg;
1490 //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1491 //! s_setprio imm:3
1492 //! v_nop
1493 //! s_setprio imm:2
1494 //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1495 //! s_setprio imm:0
1496 //! s_nop
1497 //! s_nop
1498 //! s_endpgm
1499 bld.sopp(aco_opcode::s_setprio, 3);
1500 bld.vop1(aco_opcode::v_nop);
1501 bld.sopp(aco_opcode::s_setprio, 1);
1502 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1503 V_008DFC_SQ_EXP_POS, false);
1504
1505 finish_insert_nops_test();
1506 END_TEST
1507
1508 BEGIN_TEST(insert_nops.setpc_gfx6)
1509 if (!setup_cs(NULL, GFX6))
1510 return;
1511
1512 /* SGPR->SMEM hazards */
1513 //>> p_unit_test 0
1514 //! s1: %0:s[0] = s_mov_b32 0
1515 //! s_nop imm:2
1516 //! s_setpc_b64 0
1517 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1518 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1519 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1520
1521 //! p_unit_test 1
1522 //! s1: %0:s[0] = s_mov_b32 0
1523 //! s_nop imm:2
1524 //! s_setpc_b64 0
1525 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1526 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1527 bld.sopp(aco_opcode::s_nop, 2);
1528 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1529
1530 finish_insert_nops_test();
1531
1532 /* This hazard can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves it. */
1533
1534 /* VINTRP->v_readlane_b32/etc */
1535 //>> p_unit_test 2
1536 //! v1: %0:v[0] = v_interp_mov_f32 2, %0:m0 attr0.x
1537 //! s_nop
1538 create_program(GFX6, compute_cs, 64, CHIP_UNKNOWN);
1539 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1540 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(PhysReg(256), v1), Operand::c32(2u),
1541 Operand(m0, s1), 0, 0);
1542 finish_insert_nops_test(false);
1543 END_TEST
1544
1545 BEGIN_TEST(insert_nops.setpc_gfx7)
1546 for (amd_gfx_level gfx : {GFX7, GFX9}) {
1547 if (!setup_cs(NULL, gfx))
1548 continue;
1549
1550 //>> p_unit_test 0
1551 //! s_setpc_b64 0
1552 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1553 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1554
1555 /* Break up SMEM clauses: resolved by the s_setpc_b64 itself */
1556 //! p_unit_test 1
1557 //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1558 //! s_setpc_b64 0
1559 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1560 bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1561 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1562
1563 /* SALU and GDS hazards */
1564 //! p_unit_test 2
1565 //! s_setreg_imm32_b32 0x0 imm:14337
1566 //! s_nop
1567 //! s_setpc_b64 0
1568 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1569 bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(0), (7 << 11) | 1);
1570 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1571
1572 /* VALU writes vcc -> vccz/v_div_fmas */
1573 //! p_unit_test 3
1574 //! s2: %0:vcc = v_cmp_eq_u32 0, 0
1575 //! s_nop imm:3
1576 //! s_setpc_b64 0
1577 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1578 bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand::zero(), Operand::zero());
1579 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1580
1581 /* VALU writes exec -> execz/DPP */
1582 //! p_unit_test 4
1583 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1584 //! s_nop imm:3
1585 //! s_setpc_b64 0
1586 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1587 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(),
1588 Operand::zero());
1589 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1590
1591 /* VALU->DPP */
1592 //! p_unit_test 5
1593 //! v1: %0:v[0] = v_mov_b32 0
1594 //~gfx9! s_nop
1595 //! s_setpc_b64 0
1596 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1597 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1598 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1599
1600 /* VALU->v_readlane_b32/VMEM/etc */
1601 //! p_unit_test 6
1602 //! s1: %0:s[0] = v_readfirstlane_b32 %0:v[0]
1603 //! s_nop imm:3
1604 //! s_setpc_b64 0
1605 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1606 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(0), s1),
1607 Operand(PhysReg(256), v1));
1608 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1609
1610 finish_insert_nops_test();
1611
1612 /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves
1613 * them. */
1614
1615 //>> p_unit_test 7
1616 //! buffer_store_dwordx3 %0:s[0-3], %0:v[0], 0, %0:v[0-2] offen
1617 //! s_nop
1618 create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1619 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1620 bld.mubuf(aco_opcode::buffer_store_dwordx3, Operand(PhysReg(0), s4),
1621 Operand(PhysReg(256), v1), Operand::zero(), Operand(PhysReg(256), v3), 0, true);
1622 finish_insert_nops_test(false);
1623
1624 //>> p_unit_test 8
1625 //! s1: %0:m0 = s_mov_b32 0
1626 //! s_nop
1627 create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1628 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1629 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(m0), s1), Operand::zero());
1630 finish_insert_nops_test(false);
1631
1632 /* Break up SMEM clauses */
1633 //>> p_unit_test 9
1634 //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1635 //! s_nop
1636 create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1637 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1638 bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1639 finish_insert_nops_test(false);
1640 }
1641 END_TEST
1642
1643 BEGIN_TEST(insert_nops.setpc_gfx10)
1644 if (!setup_cs(NULL, GFX10))
1645 return;
1646
1647 //>> p_unit_test 0
1648 //! s_setpc_b64 0
1649 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1650 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1651
1652 /* VcmpxPermlaneHazard */
1653 //! p_unit_test 1
1654 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1655 //! v1: %0:v[0] = v_mov_b32 %0:v[0]
1656 //! s_setpc_b64 0
1657 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1658 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1659 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1660
1661 /* VMEMtoScalarWriteHazard */
1662 //! p_unit_test 2
1663 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1664 //! s_waitcnt_vscnt %0:null imm:0
1665 //! s_waitcnt_depctr vm_vsrc(0)
1666 //! s_setpc_b64 0
1667 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1668 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1669 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1),
1670 0); /* reset LdsBranchVmemWARHazard */
1671 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1672
1673 /* VcmpxExecWARHazard */
1674 //! p_unit_test 3
1675 //! s1: %0:s[0] = s_mov_b32 %0:exec_hi
1676 //! s_waitcnt_depctr sa_sdst(0)
1677 //! s_setpc_b64 0
1678 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1679 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand(exec_hi, s1));
1680 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1681
1682 /* LdsBranchVmemWARHazard */
1683 //! p_unit_test 4
1684 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1685 //! v_nop
1686 //! s_branch block:BB0
1687 //! s_waitcnt_vscnt %0:null imm:0
1688 //! s_setpc_b64 0
1689 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1690 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1691 bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1692 bld.sopp(aco_opcode::s_branch, 0);
1693 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1694
1695 //! p_unit_test 5
1696 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1697 //! v_nop
1698 //! s_waitcnt_vscnt %0:null imm:0
1699 //! s_setpc_b64 0
1700 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1701 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1702 bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1703 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1704
1705 /* waNsaCannotFollowWritelane: resolved by the s_setpc_b64 */
1706 //! p_unit_test 6
1707 //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
1708 //! s_setpc_b64 0
1709 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1710 bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
1711 Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
1712 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1713
1714 finish_insert_nops_test();
1715
1716 /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves them.
1717 */
1718
1719 /* SMEMtoVectorWriteHazard */
1720 //>> p_unit_test 7
1721 //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1722 //! s1: %0:null = s_mov_b32 0
1723 create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1724 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1725 bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1726 finish_insert_nops_test(false);
1727
1728 /* NSAToVMEMBug is already resolved indirectly through VMEMtoScalarWriteHazard and
1729 * LdsBranchVmemWARHazard. */
1730 //>> p_unit_test 8
1731 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
1732 //! s_waitcnt_depctr vm_vsrc(0)
1733 //! s_waitcnt_vscnt %0:null imm:0
1734 create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1735 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1736 create_mimg(true, 6, 4);
1737 finish_insert_nops_test(false);
1738
1739 /* waNsaCannotFollowWritelane */
1740 //>> p_unit_test 9
1741 //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
1742 //! s_nop
1743 create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1744 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1745 bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
1746 Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
1747 finish_insert_nops_test(false);
1748 END_TEST
1749
1750 BEGIN_TEST(insert_nops.setpc_gfx11)
1751 if (!setup_cs(NULL, GFX11))
1752 return;
1753
1754 //>> p_unit_test 0
1755 //! s_setpc_b64 0
1756 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1757 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1758
1759 /* LdsDirectVALUHazard */
1760 //! p_unit_test 1
1761 //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
1762 //! s_waitcnt_depctr va_vdst(0)
1763 //! s_setpc_b64 0
1764 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1765 bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
1766 Operand::zero());
1767 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1768
1769 /* VALUPartialForwardingHazard */
1770 //! p_unit_test 2
1771 //! v1: %0:v[0] = v_mov_b32 0
1772 //! s_waitcnt_depctr va_vdst(0)
1773 //! s_setpc_b64 0
1774 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1775 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1776 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1777
1778 /* VcmpxPermlaneHazard */
1779 //! p_unit_test 2
1780 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1781 //! v_nop
1782 //! s_setpc_b64 0
1783 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1784 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1785 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1786
1787 /* VALUTransUseHazard */
1788 //! p_unit_test 3
1789 //! v1: %0:v[0] = v_rcp_f32 0
1790 //! s_waitcnt_depctr va_vdst(0)
1791 //! s_setpc_b64 0
1792 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1793 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand::zero());
1794 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1795
1796 /* VALUMaskWriteHazard */
1797 //! p_unit_test 4
1798 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
1799 //! s1: %0:vcc_hi = s_mov_b32 0
1800 //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
1801 //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
1802 //! s_waitcnt_depctr va_vdst(0)
1803 //! s_setpc_b64 0
1804 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1805 bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1806 Operand::zero(), Operand(vcc, s2));
1807 bld.sop1(aco_opcode::s_mov_b32, Definition(vcc_hi, s1), Operand::c32(0));
1808 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1809
1810 //! p_unit_test 8
1811 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
1812 //! s_waitcnt_depctr va_vdst(0)
1813 //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
1814 //! s_waitcnt_depctr va_vdst(0)
1815 //! s_setpc_b64 0
1816 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1817 bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1818 Operand::zero(), Operand(vcc, s2));
1819 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1820
1821 //! p_unit_test 5
1822 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
1823 //! s2: %0:vcc = s_mov_b64 0
1824 //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
1825 //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
1826 //! s_waitcnt_depctr va_vdst(0)
1827 //! s_setpc_b64 0
1828 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1829 bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1830 Operand::zero(), Operand(vcc, s2));
1831 bld.sop1(aco_opcode::s_mov_b64, Definition(vcc, s2), Operand::zero(8));
1832 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1833
1834 /* LdsDirectVMEMHazard */
1835 //! p_unit_test 6
1836 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1837 //! s_waitcnt_depctr vm_vsrc(0)
1838 //! s_setpc_b64 0
1839 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1840 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1841 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1842
1843 /* WMMA Hazards */
1844 //! p_unit_test 7
1845 //! v4: %0:v[20-23] = v_wmma_f16_16x16x16_f16 %0:v[0-7].xx, %0:v[8-15].xx, %0:v[20-23].xx
1846 //! v_nop
1847 //! s_waitcnt_depctr va_vdst(0)
1848 //! s_setpc_b64 0
1849 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1850 Operand A(PhysReg(256 + 0), v8);
1851 Operand B(PhysReg(256 + 8), v8);
1852 Operand C(PhysReg(256 + 20), v4);
1853 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1854 0);
1855 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1856
1857 finish_insert_nops_test(true);
1858 END_TEST
1859
1860 BEGIN_TEST(insert_nops.setpc_gfx12)
1861 if (!setup_cs(NULL, GFX12))
1862 return;
1863
1864 //>> p_unit_test 0
1865 //! s_setpc_b64 0
1866 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1867 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1868
1869 /* LdsDirectVALUHazard */
1870 //! p_unit_test 1
1871 //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
1872 //! s_waitcnt_depctr va_vdst(0)
1873 //! s_setpc_b64 0
1874 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1875 bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
1876 Operand::zero());
1877 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1878
1879 /* VcmpxPermlaneHazard */
1880 //! p_unit_test 2
1881 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1882 //! v_nop
1883 //! s_setpc_b64 0
1884 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1885 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1886 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1887
1888 /* LdsDirectVMEMHazard */
1889 //! p_unit_test 3
1890 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1891 //! s_waitcnt_depctr vm_vsrc(0)
1892 //! s_setpc_b64 0
1893 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1894 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1895 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1896
1897 finish_insert_nops_test(true);
1898 END_TEST
1899