1 /*
2 * Copyright © 2020 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6 #include <llvm/Config/llvm-config.h>
7
8 #include "helpers.h"
9 #include "sid.h"
10
11 using namespace aco;
12
13 static std::vector<amd_gfx_level>
filter_gfx_levels(std::vector<amd_gfx_level> src)14 filter_gfx_levels(std::vector<amd_gfx_level> src)
15 {
16 std::vector<amd_gfx_level> res;
17 for (amd_gfx_level gfx : src) {
18 if (gfx < GFX12 || LLVM_VERSION_MAJOR >= 19)
19 res.push_back(gfx);
20 }
21 return res;
22 }
23
24 BEGIN_TEST(assembler.s_memtime)
25 for (unsigned i = GFX6; i <= GFX10; i++) {
26 if (!setup_cs(NULL, (amd_gfx_level)i))
27 continue;
28
29 //~gfx[6-7]>> c7800000
30 //~gfx[6-7]! bf810000
31 //~gfx[8-9]>> s_memtime s[0:1] ; c0900000 00000000
32 //~gfx10>> s_memtime s[0:1] ; f4900000 fa000000
33 bld.smem(aco_opcode::s_memtime, bld.def(s2)).def(0).setFixed(PhysReg{0});
34
35 finish_assembler_test();
36 }
37 END_TEST
38
39 BEGIN_TEST(assembler.branch_3f)
40 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
41 return;
42
43 //! BB0:
44 //! s_branch BB1 ; bf820040
45 //! s_nop 0 ; bf800000
46 bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 1);
47
48 for (unsigned i = 0; i < 0x3f; i++)
49 bld.vop1(aco_opcode::v_nop);
50
51 bld.reset(program->create_and_insert_block());
52
53 program->blocks[1].linear_preds.push_back(0u);
54
55 finish_assembler_test();
56 END_TEST
57
58 BEGIN_TEST(assembler.long_jump.unconditional_forwards)
59 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
60 return;
61
62 //!BB0:
63 //! s_getpc_b64 s[0:1] ; be801f00
64 //! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014
65 //! s_bitcmp1_b32 s0, 0 ; bf0d8000
66 //! s_bitset0_b32 s0, 0 ; be801b80
67 //! s_setpc_b64 s[0:1] ; be802000
68 bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 2);
69
70 bld.reset(program->create_and_insert_block());
71
72 //! s_nop 0 ; bf800000
73 //!(then repeated 32767 times)
74 for (unsigned i = 0; i < INT16_MAX + 1; i++)
75 bld.sopp(aco_opcode::s_nop, 0);
76
77 //! BB2:
78 //! s_endpgm ; bf810000
79 bld.reset(program->create_and_insert_block());
80
81 program->blocks[2].linear_preds.push_back(0u);
82 program->blocks[2].linear_preds.push_back(1u);
83
84 finish_assembler_test();
85 END_TEST
86
87 BEGIN_TEST(assembler.long_jump.conditional_forwards)
88 for (amd_gfx_level gfx : filter_gfx_levels({GFX10, GFX12})) {
89 if (!setup_cs(NULL, gfx))
90 continue;
91
92 //! BB0:
93 //! s_cbranch_scc1 BB1 ; $_
94 //! s_getpc_b64 s[0:1] ; $_
95 //~gfx12! s_sext_i32_i16 s1, s1 ; $_
96 //~gfx10! s_addc_u32 s0, s0, 0x20014 ; $_ $_
97 //~gfx12! s_add_co_ci_u32 s0, s0, 0x20014 ; $_ $_
98 //! s_bitcmp1_b32 s0, 0 ; $_
99 //! s_bitset0_b32 s0, 0 ; $_
100 //! s_setpc_b64 s[0:1] ; $_
101 bld.sopp(aco_opcode::s_cbranch_scc0, Definition(PhysReg(0), s2), 2);
102
103 bld.reset(program->create_and_insert_block());
104
105 //! BB1:
106 //! s_nop 0 ; bf800000
107 //!(then repeated 32767 times)
108 for (unsigned i = 0; i < INT16_MAX + 1; i++)
109 bld.sopp(aco_opcode::s_nop, 0);
110
111 //! BB2:
112 //! s_endpgm ; $_
113 bld.reset(program->create_and_insert_block());
114
115 program->blocks[1].linear_preds.push_back(0u);
116 program->blocks[2].linear_preds.push_back(0u);
117 program->blocks[2].linear_preds.push_back(1u);
118
119 finish_assembler_test();
120 }
121 END_TEST
122
123 BEGIN_TEST(assembler.long_jump.unconditional_backwards)
124 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
125 return;
126
127 //!BB0:
128 //! s_nop 0 ; bf800000
129 //!(then repeated 32767 times)
130 for (unsigned i = 0; i < INT16_MAX + 1; i++)
131 bld.sopp(aco_opcode::s_nop, 0);
132
133 //! s_getpc_b64 s[0:1] ; be801f00
134 //! s_addc_u32 s0, s0, 0xfffdfffc ; 8200ff00 fffdfffc
135 //! s_bitcmp1_b32 s0, 0 ; bf0d8000
136 //! s_bitset0_b32 s0, 0 ; be801b80
137 //! s_setpc_b64 s[0:1] ; be802000
138 bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 0);
139
140 //! BB1:
141 //! s_endpgm ; bf810000
142 bld.reset(program->create_and_insert_block());
143
144 program->blocks[0].linear_preds.push_back(0u);
145 program->blocks[1].linear_preds.push_back(0u);
146
147 finish_assembler_test();
148 END_TEST
149
150 BEGIN_TEST(assembler.long_jump.conditional_backwards)
151 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
152 return;
153
154 //!BB0:
155 //! s_nop 0 ; bf800000
156 //!(then repeated 32767 times)
157 for (unsigned i = 0; i < INT16_MAX + 1; i++)
158 bld.sopp(aco_opcode::s_nop, 0);
159
160 //! s_cbranch_execz BB1 ; bf880006
161 //! s_getpc_b64 s[0:1] ; be801f00
162 //! s_addc_u32 s0, s0, 0xfffdfff8 ; 8200ff00 fffdfff8
163 //! s_bitcmp1_b32 s0, 0 ; bf0d8000
164 //! s_bitset0_b32 s0, 0 ; be801b80
165 //! s_setpc_b64 s[0:1] ; be802000
166 bld.sopp(aco_opcode::s_cbranch_execnz, Definition(PhysReg(0), s2), 0);
167
168 //! BB1:
169 //! s_endpgm ; bf810000
170 bld.reset(program->create_and_insert_block());
171
172 program->blocks[0].linear_preds.push_back(0u);
173 program->blocks[1].linear_preds.push_back(0u);
174
175 finish_assembler_test();
176 END_TEST
177
178 BEGIN_TEST(assembler.long_jump .3f)
179 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
180 return;
181
182 //! BB0:
183 //! s_branch BB1 ; bf820040
184 //! s_nop 0 ; bf800000
185 bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 1);
186
187 for (unsigned i = 0; i < 0x3f - 6; i++) // a unconditional long jump is 6 dwords
188 bld.vop1(aco_opcode::v_nop);
189 bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 2);
190
191 bld.reset(program->create_and_insert_block());
192 for (unsigned i = 0; i < INT16_MAX + 1; i++)
193 bld.vop1(aco_opcode::v_nop);
194 bld.reset(program->create_and_insert_block());
195
196 program->blocks[1].linear_preds.push_back(0u);
197 program->blocks[2].linear_preds.push_back(0u);
198 program->blocks[2].linear_preds.push_back(1u);
199
200 finish_assembler_test();
201 END_TEST
202
203 BEGIN_TEST(assembler.long_jump.constaddr)
204 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
205 return;
206
207 //>> s_getpc_b64 s[0:1] ; be801f00
208 bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 2);
209
210 bld.reset(program->create_and_insert_block());
211
212 for (unsigned i = 0; i < INT16_MAX + 1; i++)
213 bld.sopp(aco_opcode::s_nop, 0);
214
215 bld.reset(program->create_and_insert_block());
216
217 //>> s_getpc_b64 s[0:1] ; be801f00
218 //! s_add_u32 s0, s0, 32 ; 8000ff00 00000020
219 bld.sop1(aco_opcode::p_constaddr_getpc, Definition(PhysReg(0), s2), Operand::zero());
220 bld.sop2(aco_opcode::p_constaddr_addlo, Definition(PhysReg(0), s1), bld.def(s1, scc),
221 Operand(PhysReg(0), s1), Operand::zero(), Operand::zero());
222
223 program->blocks[2].linear_preds.push_back(0u);
224 program->blocks[2].linear_preds.push_back(1u);
225
226 finish_assembler_test();
227 END_TEST
228
229 BEGIN_TEST(assembler.long_jump.discard_early_exit)
230 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
231 return;
232
233 //! BB0:
234 //! s_cbranch_scc1 BB1 ; bf850006
235 //! s_getpc_b64 s[0:1] ; be801f00
236 //! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014
237 //! s_bitcmp1_b32 s0, 0 ; bf0d8000
238 //! s_bitset0_b32 s0, 0 ; be801b80
239 //! s_setpc_b64 s[0:1] ; be802000
240 bld.sopp(aco_opcode::s_cbranch_scc0, 2);
241
242 bld.reset(program->create_and_insert_block());
243
244 //! BB1:
245 //! s_nop 1 ; bf800001
246 //!(then repeated 32766 times)
247 //! s_endpgm ; bf810000
248 for (unsigned i = 0; i < INT16_MAX; i++)
249 bld.sopp(aco_opcode::s_nop, 1);
250
251 //! BB2:
252 //! s_endpgm ; bf810000
253 bld.reset(program->create_and_insert_block());
254
255 program->blocks[1].linear_preds.push_back(0u);
256 program->blocks[2].linear_preds.push_back(0u);
257 program->blocks[2].kind = block_kind_discard_early_exit;
258
259 finish_assembler_test();
260 END_TEST
261
262 BEGIN_TEST(assembler.v_add3)
263 for (unsigned i = GFX9; i <= GFX10; i++) {
264 if (!setup_cs(NULL, (amd_gfx_level)i))
265 continue;
266
267 //~gfx9>> v_add3_u32 v0, 0, 0, 0 ; d1ff0000 02010080
268 //~gfx10>> v_add3_u32 v0, 0, 0, 0 ; d76d0000 02010080
269 aco_ptr<Instruction> add3{create_instruction(aco_opcode::v_add3_u32, Format::VOP3, 3, 1)};
270 add3->operands[0] = Operand::zero();
271 add3->operands[1] = Operand::zero();
272 add3->operands[2] = Operand::zero();
273 add3->definitions[0] = Definition(PhysReg(0), v1);
274 bld.insert(std::move(add3));
275
276 finish_assembler_test();
277 }
278 END_TEST
279
280 BEGIN_TEST(assembler.v_add3_clamp)
281 for (unsigned i = GFX9; i <= GFX10; i++) {
282 if (!setup_cs(NULL, (amd_gfx_level)i))
283 continue;
284
285 //~gfx9>> integer addition + clamp ; d1ff8000 02010080
286 //~gfx10>> integer addition + clamp ; d76d8000 02010080
287 aco_ptr<Instruction> add3{create_instruction(aco_opcode::v_add3_u32, Format::VOP3, 3, 1)};
288 add3->operands[0] = Operand::zero();
289 add3->operands[1] = Operand::zero();
290 add3->operands[2] = Operand::zero();
291 add3->definitions[0] = Definition(PhysReg(0), v1);
292 add3->valu().clamp = 1;
293 bld.insert(std::move(add3));
294
295 finish_assembler_test();
296 }
297 END_TEST
298
299 BEGIN_TEST(assembler.smem_offset)
300 for (unsigned i = GFX9; i <= GFX10; i++) {
301 if (!setup_cs(NULL, (amd_gfx_level)i))
302 continue;
303
304 Definition dst(PhysReg(7), s1);
305 Operand sbase(PhysReg(6), s2);
306 Operand offset(PhysReg(5), s1);
307
308 //~gfx9>> s_load_dword s7, s[6:7], s5 ; c00001c3 00000005
309 //~gfx10>> s_load_dword s7, s[6:7], s5 ; f40001c3 0a000000
310 bld.smem(aco_opcode::s_load_dword, dst, sbase, offset);
311 //~gfx9! s_load_dword s7, s[6:7], 0x42 ; c00201c3 00000042
312 //~gfx10! s_load_dword s7, s[6:7], 0x42 ; f40001c3 fa000042
313 bld.smem(aco_opcode::s_load_dword, dst, sbase, Operand::c32(0x42));
314 if (i >= GFX9) {
315 //~gfx9! s_load_dword s7, s[6:7], s5 offset:0x42 ; c00241c3 0a000042
316 //~gfx10! s_load_dword s7, s[6:7], s5 offset:0x42 ; f40001c3 0a000042
317 bld.smem(aco_opcode::s_load_dword, dst, sbase, Operand::c32(0x42), offset);
318 }
319
320 finish_assembler_test();
321 }
322 END_TEST
323
324 BEGIN_TEST(assembler.p_constaddr)
325 if (!setup_cs(NULL, GFX9))
326 return;
327
328 Definition dst0 = bld.def(s2);
329 Definition dst1 = bld.def(s2);
330 dst0.setFixed(PhysReg(0));
331 dst1.setFixed(PhysReg(2));
332
333 //>> s_getpc_b64 s[0:1] ; be801c00
334 //! s_add_u32 s0, s0, 44 ; 8000ff00 0000002c
335 bld.pseudo(aco_opcode::p_constaddr, dst0, Operand::zero());
336
337 //! s_getpc_b64 s[2:3] ; be821c00
338 //! s_add_u32 s2, s2, 64 ; 8002ff02 00000040
339 bld.pseudo(aco_opcode::p_constaddr, dst1, Operand::c32(32));
340
341 aco::lower_to_hw_instr(program.get());
342 finish_assembler_test();
343 END_TEST
344
345 BEGIN_TEST(assembler.vopc_sdwa)
346 for (unsigned i = GFX9; i <= GFX10; i++) {
347 if (!setup_cs(NULL, (amd_gfx_level)i))
348 continue;
349
350 //~gfx9>> v_cmp_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 86860080
351 //~gfx10>> v_cmp_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d8300f9 86860080
352 bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(vcc, s2), Operand::zero(),
353 Operand::zero());
354
355 //~gfx9! v_cmp_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 8686ac80
356 //~gfx10! v_cmp_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d8300f9 8686ac80
357 bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(PhysReg(0x2c), s2), Operand::zero(),
358 Operand::zero());
359
360 //~gfx9! v_cmp_lt_u32_sdwa exec, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 8686fe80
361 //~gfx10! v_cmp_lt_u32_sdwa exec, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d8300f9 8686fe80
362 bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(exec, s2), Operand::zero(),
363 Operand::zero());
364
365 if (i == GFX10) {
366 //~gfx10! v_cmpx_lt_u32_sdwa 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7da300f9 86860080
367 bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(exec, s2), Operand::zero(),
368 Operand::zero());
369 } else {
370 //~gfx9! v_cmpx_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7db300f9 86860080
371 bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(vcc, s2), Definition(exec, s2),
372 Operand::zero(), Operand::zero());
373
374 //~gfx9! v_cmpx_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7db300f9 8686ac80
375 bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(PhysReg(0x2c), s2),
376 Definition(exec, s2), Operand::zero(), Operand::zero());
377 }
378
379 finish_assembler_test();
380 }
381 END_TEST
382
383 BEGIN_TEST(assembler.smem)
384 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
385 if (!setup_cs(NULL, gfx))
386 continue;
387
388 Definition dst = bld.def(s1);
389 dst.setFixed(PhysReg(4));
390
391 Operand op_s1(bld.tmp(s1));
392 op_s1.setFixed(PhysReg(8));
393
394 Operand op_s2(bld.tmp(s2));
395 op_s2.setFixed(PhysReg(16));
396
397 Operand op_s4(bld.tmp(s4));
398 op_s4.setFixed(PhysReg(32));
399
400 //~gfx11>> s_dcache_inv ; f4840000 f8000000
401 //~gfx12>> s_dcache_inv ; f4042000 f8000000
402 bld.smem(aco_opcode::s_dcache_inv);
403
404 //! s_load_b32 s4, s[16:17], 0x2a ; f4000108 f800002a
405 bld.smem(aco_opcode::s_load_dword, dst, op_s2, Operand::c32(42));
406
407 //~gfx11! s_load_b32 s4, s[16:17], s8 ; f4000108 10000000
408 //~gfx12! s_load_b32 s4, s[16:17], s8 offset:0x0 ; f4000108 10000000
409 bld.smem(aco_opcode::s_load_dword, dst, op_s2, op_s1);
410
411 //! s_load_b32 s4, s[16:17], s8 offset:0x2a ; f4000108 1000002a
412 bld.smem(aco_opcode::s_load_dword, dst, op_s2, Operand::c32(42), op_s1);
413
414 ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
415 ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
416 if (gfx >= GFX12) {
417 cache_coherent.gfx12.scope = gfx12_scope_device;
418 cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
419 } else {
420 cache_coherent.value = ac_glc;
421 cache_non_temporal.value = ac_dlc;
422 }
423
424 //~gfx11! s_buffer_load_b32 s4, s[32:35], s8 glc ; f4204110 10000000
425 //~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 scope:SCOPE_DEV ; f4420110 10000000
426 bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache = cache_coherent;
427
428 //~gfx11! s_buffer_load_b32 s4, s[32:35], s8 dlc ; f4202110 10000000
429 //~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 th:TH_LOAD_NT ; f4820110 10000000
430 bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache =
431 cache_non_temporal;
432
433 finish_assembler_test();
434 }
435 END_TEST
436
437 BEGIN_TEST(assembler.mubuf)
438 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
439 if (!setup_cs(NULL, gfx))
440 continue;
441
442 Definition dst = bld.def(v1);
443 dst.setFixed(PhysReg(256 + 42));
444
445 Operand op_s4(bld.tmp(s4));
446 op_s4.setFixed(PhysReg(32));
447
448 Operand op_v1(bld.tmp(v1));
449 op_v1.setFixed(PhysReg(256 + 10));
450
451 Operand op_v2(bld.tmp(v2));
452 op_v2.setFixed(PhysReg(256 + 20));
453
454 Operand op_s1(bld.tmp(s1));
455 op_s1.setFixed(PhysReg(30));
456
457 Operand op_m0(bld.tmp(s1));
458 op_m0.setFixed(m0);
459
460 //! llvm_version: #llvm_ver
461 fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
462
463 /* Addressing */
464 //~gfx11>> buffer_load_b32 v42, off, s[32:35], s30 ; e0500000 1e082a80
465 //~gfx12>> buffer_load_b32 v42, off, s[32:35], s30 ; c405001e 0080402a 00000000
466 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 0, false);
467
468 //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 ; e0500000 80082a80
469 //~gfx12! buffer_load_b32 v42, off, s[32:35], null ; c405007c 0080402a 00000000
470 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false);
471
472 //~gfx11! buffer_load_b32 v42, off, s[32:35], 42 ; e0500000 aa082a80
473 if (gfx == GFX11)
474 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::c32(42), 0,
475 false);
476
477 //~gfx11! buffer_load_b32 v42, v10, s[32:35], s30 offen ; e0500000 1e482a0a
478 //~gfx12! buffer_load_b32 v42, v10, s[32:35], s30 offen ; c405001e 4080402a 0000000a
479 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v1, op_s1, 0, true);
480
481 //~gfx11! buffer_load_b32 v42, v10, s[32:35], s30 idxen ; e0500000 1e882a0a
482 //~gfx12! buffer_load_b32 v42, v10, s[32:35], s30 idxen ; c405001e 8080402a 0000000a
483 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v1, op_s1, 0, false)->mubuf().idxen =
484 true;
485
486 //~gfx11! buffer_load_b32 v42, v[20:21], s[32:35], s30 idxen offen ; e0500000 1ec82a14
487 //~gfx12! buffer_load_b32 v42, v[20:21], s[32:35], s30 idxen offen ; c405001e c080402a 00000014
488 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v2, op_s1, 0, true)->mubuf().idxen =
489 true;
490
491 //~gfx11! buffer_load_b32 v42, off, s[32:35], s30 offset:84 ; e0500054 1e082a80
492 //~gfx12! buffer_load_b32 v42, off, s[32:35], s30 offset:84 ; c405001e 0080402a 00005400
493 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 84, false);
494
495 /* Various flags */
496 ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
497 ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
498 ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
499 ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}};
500 if (gfx >= GFX12) {
501 cache_coherent.gfx12.scope = gfx12_scope_device;
502 cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
503 cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
504 cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return;
505 } else {
506 cache_coherent.value = ac_glc;
507 cache_sys_coherent.value = ac_slc;
508 cache_non_temporal.value = ac_dlc;
509 cache_atomic_rtn.value = ac_glc;
510 }
511
512 //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 glc ; e0504000 80082a80
513 //~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_DEV ; c405007c 0088402a 00000000
514 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
515 ->mubuf()
516 .cache = cache_coherent;
517
518 //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 dlc ; e0502000 80082a80
519 //~gfx12! buffer_load_b32 v42, off, s[32:35], null th:TH_LOAD_NT ; c405007c 0090402a 00000000
520 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
521 ->mubuf()
522 .cache = cache_non_temporal;
523
524 //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 slc ; e0501000 80082a80
525 //~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_SYS ; c405007c 008c402a 00000000
526 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
527 ->mubuf()
528 .cache = cache_sys_coherent;
529
530 //; if llvm_ver >= 16 and variant == 'gfx11':
531 //; insert_pattern('buffer_load_b32 v[42:43], off, s[32:35], 0 tfe ; e0500000 80282a80')
532 //; elif variant == 'gfx11':
533 //; insert_pattern('buffer_load_b32 v42, off, s[32:35], 0 tfe ; e0500000 80282a80')
534 //~gfx12! buffer_load_b32 v[42:43], off, s[32:35], null tfe ; c445007c 0080402a 00000000
535 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
536 ->mubuf()
537 .tfe = true;
538
539 /* LDS */
540 if (gfx == GFX11) {
541 //~gfx11! buffer_load_lds_b32 off, s[32:35], 0 ; e0c40000 80080080
542 bld.mubuf(aco_opcode::buffer_load_dword, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
543 false)
544 ->mubuf()
545 .lds = true;
546
547 //~gfx11! buffer_load_lds_i8 off, s[32:35], 0 ; e0b80000 80080080
548 bld.mubuf(aco_opcode::buffer_load_sbyte, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
549 false)
550 ->mubuf()
551 .lds = true;
552
553 //~gfx11! buffer_load_lds_i16 off, s[32:35], 0 ; e0c00000 80080080
554 bld.mubuf(aco_opcode::buffer_load_sshort, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
555 false)
556 ->mubuf()
557 .lds = true;
558
559 //~gfx11! buffer_load_lds_u8 off, s[32:35], 0 ; e0b40000 80080080
560 bld.mubuf(aco_opcode::buffer_load_ubyte, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
561 false)
562 ->mubuf()
563 .lds = true;
564
565 //~gfx11! buffer_load_lds_u16 off, s[32:35], 0 ; e0bc0000 80080080
566 bld.mubuf(aco_opcode::buffer_load_ushort, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
567 false)
568 ->mubuf()
569 .lds = true;
570
571 //~gfx11! buffer_load_lds_format_x off, s[32:35], 0 ; e0c80000 80080080
572 bld.mubuf(aco_opcode::buffer_load_format_x, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
573 false)
574 ->mubuf()
575 .lds = true;
576 }
577
578 /* Stores */
579 //~gfx11! buffer_store_b32 v10, off, s[32:35], s30 ; e0680000 1e080a80
580 //~gfx12! buffer_store_b32 v10, off, s[32:35], s30 ; c406801e 0080400a 00000000
581 bld.mubuf(aco_opcode::buffer_store_dword, op_s4, Operand(v1), op_s1, op_v1, 0, false);
582
583 //~gfx11! buffer_store_b64 v[20:21], v10, s[32:35], s30 offen ; e06c0000 1e48140a
584 //~gfx12! buffer_store_b64 v[20:21], v10, s[32:35], s30 offen ; c406c01e 40804014 0000000a
585 bld.mubuf(aco_opcode::buffer_store_dwordx2, op_s4, op_v1, op_s1, op_v2, 0, true);
586
587 /* Atomic with return */
588 //~gfx11! buffer_atomic_add_u32 v10, off, s[32:35], 0 glc ; e0d44000 80080a80
589 //~gfx12! buffer_atomic_add_u32 v10, off, s[32:35], null th:TH_ATOMIC_RETURN ; c40d407c 0090400a 00000000
590 bld.mubuf(aco_opcode::buffer_atomic_add, Definition(op_v1.physReg(), v1), op_s4, Operand(v1),
591 Operand::zero(), op_v1, 0, false)
592 ->mubuf()
593 .cache = cache_atomic_rtn;
594
595 finish_assembler_test();
596 }
597 END_TEST
598
599 BEGIN_TEST(assembler.mtbuf)
600 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
601 if (!setup_cs(NULL, gfx))
602 continue;
603
604 Definition dst = bld.def(v1);
605 dst.setFixed(PhysReg(256 + 42));
606
607 Operand op_s4(bld.tmp(s4));
608 op_s4.setFixed(PhysReg(32));
609
610 Operand op_v1(bld.tmp(v1));
611 op_v1.setFixed(PhysReg(256 + 10));
612
613 Operand op_v2(bld.tmp(v2));
614 op_v2.setFixed(PhysReg(256 + 20));
615
616 Operand op_s1(bld.tmp(s1));
617 op_s1.setFixed(PhysReg(30));
618
619 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_32_32;
620 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_FLOAT;
621
622 //! llvm_version: #llvm_ver
623 fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
624
625 /* Addressing */
626 //~gfx11>> tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; e9900000 1e082a80
627 //~gfx12>> tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; c420001e 1900402a 00000080
628 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), op_s1, dfmt, nfmt, 0,
629 false);
630
631 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] ; e9900000 80082a80
632 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] ; c420007c 1900402a 00000080
633 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
634 nfmt, 0, false);
635
636 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 42 format:[BUF_FMT_32_32_FLOAT] ; e9900000 aa082a80
637 if (gfx == GFX11)
638 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::c32(42),
639 dfmt, nfmt, 0, false);
640
641 //~gfx11! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; e9900000 1e482a0a
642 //~gfx12! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; c420001e 5900402a 0000000a
643 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v1, op_s1, dfmt, nfmt, 0, true);
644
645 //~gfx11! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen ; e9900000 1e882a0a
646 //~gfx12! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen ; c420001e 9900402a 0000000a
647 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v1, op_s1, dfmt, nfmt, 0, false)
648 ->mtbuf()
649 .idxen = true;
650
651 //~gfx11! tbuffer_load_format_x v42, v[20:21], s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen offen ; e9900000 1ec82a14
652 //~gfx12! tbuffer_load_format_x v42, v[20:21], s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen offen ; c420001e d900402a 00000014
653 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v2, op_s1, dfmt, nfmt, 0, true)
654 ->mtbuf()
655 .idxen = true;
656
657 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offset:84 ; e9900054 1e082a80
658 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offset:84 ; c420001e 1900402a 00005480
659 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), op_s1, dfmt, nfmt, 84,
660 false);
661
662 /* Various flags */
663 ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
664 ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
665 ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
666 if (gfx >= GFX12) {
667 cache_coherent.gfx12.scope = gfx12_scope_device;
668 cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
669 cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
670 } else {
671 cache_coherent.value = ac_glc;
672 cache_sys_coherent.value = ac_slc;
673 cache_non_temporal.value = ac_dlc;
674 }
675
676 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] glc ; e9904000 80082a80
677 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_DEV ; c420007c 1908402a 00000080
678 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
679 nfmt, 0, false)
680 ->mtbuf()
681 .cache = cache_coherent;
682
683 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] dlc ; e9902000 80082a80
684 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] th:TH_LOAD_NT ; c420007c 1910402a 00000080
685 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
686 nfmt, 0, false)
687 ->mtbuf()
688 .cache = cache_non_temporal;
689
690 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] slc ; e9901000 80082a80
691 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_SYS ; c420007c 190c402a 00000080
692 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
693 nfmt, 0, false)
694 ->mtbuf()
695 .cache = cache_sys_coherent;
696
697 //; if llvm_ver >= 16 and variant == 'gfx11':
698 //; insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] ; e9900000 80282a80')
699 //; elif variant == 'gfx11':
700 //; insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] tfe ; e9900000 80282a80')
701 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] ; c460007c 1900402a 00000080
702 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
703 nfmt, 0, false)
704 ->mtbuf()
705 .tfe = true;
706
707 /* Stores */
708 //~gfx11! tbuffer_store_format_x v10, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; e9920000 1e080a80
709 //~gfx12! tbuffer_store_format_x v10, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; c421001e 1900400a 00000080
710 bld.mtbuf(aco_opcode::tbuffer_store_format_x, op_s4, Operand(v1), op_s1, op_v1, dfmt, nfmt, 0,
711 false);
712
713 //~gfx11! tbuffer_store_format_xy v[20:21], v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; e9928000 1e48140a
714 //~gfx12! tbuffer_store_format_xy v[20:21], v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; c421401e 59004014 0000000a
715 bld.mtbuf(aco_opcode::tbuffer_store_format_xy, op_s4, op_v1, op_s1, op_v2, dfmt, nfmt, 0,
716 true);
717
718 finish_assembler_test();
719 }
720 END_TEST
721
722 BEGIN_TEST(assembler.mimg)
723 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
724 if (!setup_cs(NULL, gfx))
725 continue;
726
727 Definition dst_v1 = bld.def(v1);
728 dst_v1.setFixed(PhysReg(256 + 42));
729
730 Definition dst_v4 = bld.def(v4);
731 dst_v4.setFixed(PhysReg(256 + 84));
732
733 Operand op_s4(bld.tmp(s4));
734 op_s4.setFixed(PhysReg(32));
735
736 Operand op_s8(bld.tmp(s8));
737 op_s8.setFixed(PhysReg(64));
738
739 Operand op_v1(bld.tmp(v1));
740 op_v1.setFixed(PhysReg(256 + 10));
741
742 Operand op_v2(bld.tmp(v2));
743 op_v2.setFixed(PhysReg(256 + 20));
744
745 Operand op_v4(bld.tmp(v4));
746 op_v4.setFixed(PhysReg(256 + 30));
747
748 //~gfx11>> image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D ; f06c0f00 2010540a
749 //~gfx12>> image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D ; e7c6c000 10008054 0000000a
750 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1);
751
752 //~gfx11! image_sample v[84:87], v[20:21], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; f06c0f04 20105414
753 //~gfx12! image_sample v[84:87], [v20, v21], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; e7c6c001 10008054 00001514
754 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v2)->mimg().dim =
755 ac_image_2d;
756
757 //~gfx11! image_sample v42, v10, s[64:71], s[32:35] dmask:0x1 dim:SQ_RSRC_IMG_1D ; f06c0100 20102a0a
758 //~gfx12! image_sample v42, v10, s[64:71], s[32:35] dmask:0x1 dim:SQ_RSRC_IMG_1D ; e446c000 1000802a 0000000a
759 bld.mimg(aco_opcode::image_sample, dst_v1, op_s8, op_s4, Operand(v1), op_v1)->mimg().dmask =
760 0x1;
761
762 /* Various flags */
763 ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
764 ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
765 ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
766 ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}};
767 if (gfx >= GFX12) {
768 cache_coherent.gfx12.scope = gfx12_scope_device;
769 cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
770 cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
771 cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return;
772 } else {
773 cache_coherent.value = ac_glc;
774 cache_sys_coherent.value = ac_slc;
775 cache_non_temporal.value = ac_dlc;
776 cache_atomic_rtn.value = ac_glc;
777 }
778
779 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D dlc ; f06c2f00 2010540a
780 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT ; e7c6c000 10108054 0000000a
781 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
782 cache_non_temporal;
783
784 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; f06c4f00 2010540a
785 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_DEV ; e7c6c000 10088054 0000000a
786 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
787 cache_coherent;
788
789 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; f06c1f00 2010540a
790 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_SYS ; e7c6c000 100c8054 0000000a
791 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
792 cache_sys_coherent;
793
794 //~gfx11! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; f06c0f00 2030540a
795 //~gfx12! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; e7c6c008 10008054 0000000a
796 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().tfe =
797 true;
798
799 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; f06c0f00 2050540a
800 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; e7c6c000 10008154 0000000a
801 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().lwe =
802 true;
803
804 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D r128 ; f06c8f00 2010540a
805 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D r128 ; e7c6c010 10008054 0000000a
806 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().r128 =
807 true;
808
809 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; f06d0f00 2010540a
810 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; e7c6c040 10008054 0000000a
811 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().a16 =
812 true;
813
814 //~gfx11! image_sample v[84:85], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D d16 ; f06e0f00 2010540a
815 //~gfx12! image_sample v[84:85], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D d16 ; e7c6c020 10008054 0000000a
816 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().d16 =
817 true;
818
819 /* NSA */
820 //~gfx11! image_sample v[84:87], [v10, v40], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; f06c0f05 2010540a 00000028
821 //~gfx12! image_sample v[84:87], [v10, v40], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; e7c6c001 10008054 0000280a
822 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1,
823 Operand(bld.tmp(v1), PhysReg(256 + 40)))
824 ->mimg()
825 .dim = ac_image_2d;
826
827 //~gfx11! image_bvh_intersect_ray v[84:87], [v40, v42, v[44:46], v[48:50], v[52:54]], s[32:35] ; f0648f81 00085428 34302c2a
828 //~gfx12! image_bvh_intersect_ray v[84:87], [v40, v42, v[44:46], v[48:50], v[52:54]], s[32:35] ; d3c64010 34004054 302c2a28
829 aco_ptr<Instruction> instr{
830 create_instruction(aco_opcode::image_bvh_intersect_ray, Format::MIMG, 8, 1)};
831 instr->definitions[0] = dst_v4;
832 instr->operands[0] = op_s4;
833 instr->operands[1] = Operand(s4);
834 instr->operands[2] = Operand(v1);
835 instr->operands[3] = Operand(PhysReg(256 + 40), v1); /* node */
836 instr->operands[4] = Operand(PhysReg(256 + 42), v1); /* tmax */
837 instr->operands[5] = Operand(PhysReg(256 + 44), v3); /* origin */
838 instr->operands[6] = Operand(PhysReg(256 + 48), v3); /* dir */
839 instr->operands[7] = Operand(PhysReg(256 + 52), v3); /* inv dir */
840 instr->mimg().dmask = 0xf;
841 instr->mimg().unrm = true;
842 instr->mimg().r128 = true;
843 bld.insert(std::move(instr));
844
845 /* Stores */
846 //~gfx11! image_store v[30:33], v10, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_1D ; f0180f00 00101e0a
847 //~gfx12! image_store v[30:33], v10, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_1D ; d3c18000 0000801e 0000000a
848 bld.mimg(aco_opcode::image_store, op_s8, Operand(s4), op_v4, op_v1);
849
850 //~gfx11! image_atomic_add v10, v20, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D ; f0300f04 00100a14
851 //~gfx12! image_atomic_add_uint v10, [v20, v21, v0, v0], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D ; d3c30001 0000800a 00001514
852 bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4),
853 op_v1, op_v2)
854 ->mimg()
855 .dim = ac_image_2d;
856
857 /* Atomic with return */
858 //~gfx11! image_atomic_add v10, v20, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D glc ; f0304f04 00100a14
859 //~gfx12! image_atomic_add_uint v10, [v20, v21, v0, v0], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN ; d3c30001 0010800a 00001514
860 bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4),
861 op_v1, op_v2, 0xf, false, false, false, cache_atomic_rtn)
862 ->mimg()
863 .dim = ac_image_2d;
864
865 //~gfx11! image_load v[84:87], v[20:21], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D ; f0000f04 00105414
866 //~gfx12! image_load v[84:87], [v20, v21], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D ; d3c00001 00008054 00001514
867 bld.mimg(aco_opcode::image_load, dst_v4, op_s8, Operand(s4), Operand(v1), op_v2)->mimg().dim =
868 ac_image_2d;
869
870 //~gfx11! image_msaa_load v[84:87], v[30:33], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; f060011c 0010541e
871 //~gfx12! image_msaa_load v[84:87], [v30, v31, v32, v33], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; e4460007 00008054 21201f1e
872 bld.mimg(aco_opcode::image_msaa_load, dst_v4, op_s8, Operand(s4), Operand(v1), op_v4, 0x1)
873 ->mimg()
874 .dim = ac_image_2darraymsaa;
875
876 finish_assembler_test();
877 }
878 END_TEST
879
880 BEGIN_TEST(assembler.flat)
881 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
882 if (!setup_cs(NULL, gfx))
883 continue;
884
885 Definition dst_v1 = bld.def(v1);
886 dst_v1.setFixed(PhysReg(256 + 42));
887
888 Operand op_s1(bld.tmp(s1));
889 op_s1.setFixed(PhysReg(32));
890
891 Operand op_s2(bld.tmp(s2));
892 op_s2.setFixed(PhysReg(64));
893
894 Operand op_v1(bld.tmp(v1));
895 op_v1.setFixed(PhysReg(256 + 10));
896
897 Operand op_v2(bld.tmp(v2));
898 op_v2.setFixed(PhysReg(256 + 20));
899
900 /* Addressing */
901 //~gfx11>> flat_load_b32 v42, v[20:21] ; dc500000 2a7c0014
902 //~gfx12>> flat_load_b32 v42, v[20:21] ; ec05007c 0000002a 00000014
903 bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1));
904
905 //~gfx11! global_load_b32 v42, v[20:21], off ; dc520000 2a7c0014
906 //~gfx12! global_load_b32 v42, v[20:21], off ; ee05007c 0000002a 00000014
907 bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1));
908
909 //~gfx11! global_load_b32 v42, v10, s[64:65] ; dc520000 2a40000a
910 //~gfx12! global_load_b32 v42, v10, s[64:65] ; ee050040 0000002a 0000000a
911 bld.global(aco_opcode::global_load_dword, dst_v1, op_v1, op_s2);
912
913 //~gfx11! scratch_load_b32 v42, v10, off ; dc510000 2afc000a
914 //~gfx12! scratch_load_b32 v42, v10, off ; ed05007c 0002002a 0000000a
915 bld.scratch(aco_opcode::scratch_load_dword, dst_v1, op_v1, Operand(s1));
916
917 //~gfx11! scratch_load_b32 v42, off, s32 ; dc510000 2a200080
918 //~gfx12! scratch_load_b32 v42, off, s32 ; ed050020 0000002a 00000000
919 bld.scratch(aco_opcode::scratch_load_dword, dst_v1, Operand(v1), op_s1);
920
921 //~gfx11! scratch_load_b32 v42, v10, s32 ; dc510000 2aa0000a
922 //~gfx12! scratch_load_b32 v42, v10, s32 ; ed050020 0002002a 0000000a
923 bld.scratch(aco_opcode::scratch_load_dword, dst_v1, op_v1, op_s1);
924
925 //~gfx11! scratch_load_b32 v42, off, off ; dc510000 2a7c0080
926 //~gfx12! scratch_load_b32 v42, off, off ; ed05007c 0000002a 00000000
927 bld.scratch(aco_opcode::scratch_load_dword, dst_v1, Operand(v1), Operand(s1));
928
929 //~gfx11! global_load_b32 v42, v[20:21], off offset:-42 ; dc521fd6 2a7c0014
930 //~gfx12! global_load_b32 v42, v[20:21], off offset:-42 ; ee05007c 0000002a ffffd614
931 bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1), -42);
932
933 //~gfx11! global_load_b32 v42, v[20:21], off offset:84 ; dc520054 2a7c0014
934 //~gfx12! global_load_b32 v42, v[20:21], off offset:84 ; ee05007c 0000002a 00005414
935 bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1), 84);
936
937 /* Various flags */
938 ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
939 ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
940 ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
941 ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}};
942 if (gfx >= GFX12) {
943 cache_coherent.gfx12.scope = gfx12_scope_device;
944 cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
945 cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
946 cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return;
947 } else {
948 cache_coherent.value = ac_glc;
949 cache_sys_coherent.value = ac_slc;
950 cache_non_temporal.value = ac_dlc;
951 cache_atomic_rtn.value = ac_glc;
952 }
953
954 //~gfx11! flat_load_b32 v42, v[20:21] slc ; dc508000 2a7c0014
955 //~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_SYS ; ec05007c 000c002a 00000014
956 bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
957 cache_sys_coherent;
958
959 //~gfx11! flat_load_b32 v42, v[20:21] glc ; dc504000 2a7c0014
960 //~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_DEV ; ec05007c 0008002a 00000014
961 bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
962 cache_coherent;
963
964 //~gfx11! flat_load_b32 v42, v[20:21] dlc ; dc502000 2a7c0014
965 //~gfx12! flat_load_b32 v42, v[20:21] th:TH_LOAD_NT ; ec05007c 0010002a 00000014
966 bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
967 cache_non_temporal;
968
969 /* Stores */
970 //~gfx11! flat_store_b32 v[20:21], v10 ; dc680000 007c0a14
971 //~gfx12! flat_store_b32 v[20:21], v10 ; ec06807c 05000000 00000014
972 bld.flat(aco_opcode::flat_store_dword, op_v2, Operand(s1), op_v1);
973
974 /* Atomic with return */
975 //~gfx11! global_atomic_add_u32 v42, v[20:21], v10, off glc ; dcd64000 2a7c0a14
976 //~gfx12! global_atomic_add_u32 v42, v[20:21], v10, off th:TH_ATOMIC_RETURN ; ee0d407c 0510002a 00000014
977 bld.global(aco_opcode::global_atomic_add, dst_v1, op_v2, Operand(s1), op_v1)->global().cache =
978 cache_atomic_rtn;
979
980 finish_assembler_test();
981 }
982 END_TEST
983
984 BEGIN_TEST(assembler.exp)
985 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
986 if (!setup_cs(NULL, gfx))
987 continue;
988
989 Operand op[4];
990 for (unsigned i = 0; i < 4; i++)
991 op[i] = Operand(PhysReg(256 + i), v1);
992
993 Operand op_m0(bld.tmp(s1));
994 op_m0.setFixed(m0);
995
996 //~gfx11>> exp mrt3 v1, v0, v3, v2 ; f800003f 02030001
997 //~gfx12>> export mrt3 v1, v0, v3, v2 ; f800003f 02030001
998 bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], 0xf, 3);
999
1000 //~gfx11! exp mrt3 v1, off, v0, off ; f8000035 80008001
1001 //~gfx12! export mrt3 v1, off, v0, off ; f8000035 80008001
1002 bld.exp(aco_opcode::exp, op[1], Operand(v1), op[0], Operand(v1), 0x5, 3);
1003
1004 //~gfx11! exp mrt3 v1, v0, v3, v2 done ; f800083f 02030001
1005 //~gfx12! export mrt3 v1, v0, v3, v2 done ; f800083f 02030001
1006 bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], 0xf, 3, false, true);
1007
1008 //~gfx11! exp mrt3 v1, v0, v3, v2 row_en ; f800203f 02030001
1009 //~gfx12! export mrt3 v1, v0, v3, v2 row_en ; f800203f 02030001
1010 bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], op_m0, 0xf, 3)->exp().row_en = true;
1011
1012 finish_assembler_test();
1013 }
1014 END_TEST
1015
1016 BEGIN_TEST(assembler.vinterp)
1017 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
1018 if (!setup_cs(NULL, gfx))
1019 continue;
1020
1021 Definition dst = bld.def(v1);
1022 dst.setFixed(PhysReg(256 + 42));
1023
1024 Operand op0(bld.tmp(v1));
1025 op0.setFixed(PhysReg(256 + 10));
1026
1027 Operand op1(bld.tmp(v1));
1028 op1.setFixed(PhysReg(256 + 20));
1029
1030 Operand op2(bld.tmp(v1));
1031 op2.setFixed(PhysReg(256 + 30));
1032
1033 //! llvm_version: #llvm_ver
1034 fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
1035
1036 //>> v_interp_p10_f32 v42, v10, v20, v30 wait_exp:7 ; cd00072a 047a290a
1037 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2);
1038
1039 //! v_interp_p10_f32 v42, v10, v20, v30 wait_exp:6 ; cd00062a 047a290a
1040 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6);
1041
1042 //; if llvm_ver >= 18:
1043 //; insert_pattern('v_interp_p2_f32 v42, v10, v20, v30 wait_exp:0 ; cd01002a 047a290a')
1044 //; else:
1045 //; insert_pattern('v_interp_p2_f32 v42, v10, v20, v30 ; cd01002a 047a290a')
1046 bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, dst, op0, op1, op2, 0, 0);
1047
1048 //! v_interp_p10_f32 v42, -v10, v20, v30 wait_exp:6 ; cd00062a 247a290a
1049 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1050 ->vinterp_inreg()
1051 .neg[0] = true;
1052
1053 //! v_interp_p10_f32 v42, v10, -v20, v30 wait_exp:6 ; cd00062a 447a290a
1054 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1055 ->vinterp_inreg()
1056 .neg[1] = true;
1057
1058 //! v_interp_p10_f32 v42, v10, v20, -v30 wait_exp:6 ; cd00062a 847a290a
1059 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1060 ->vinterp_inreg()
1061 .neg[2] = true;
1062
1063 //! v_interp_p10_f16_f32 v42, v10, v20, v30 op_sel:[1,0,0,0] wait_exp:6 ; cd020e2a 047a290a
1064 bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, dst, op0, op1, op2, 0x1, 6);
1065
1066 //! v_interp_p2_f16_f32 v42, v10, v20, v30 op_sel:[0,1,0,0] wait_exp:6 ; cd03162a 047a290a
1067 bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, dst, op0, op1, op2, 0x2, 6);
1068
1069 //! v_interp_p10_rtz_f16_f32 v42, v10, v20, v30 op_sel:[0,0,1,0] wait_exp:6 ; cd04262a 047a290a
1070 bld.vinterp_inreg(aco_opcode::v_interp_p10_rtz_f16_f32_inreg, dst, op0, op1, op2, 0x4, 6);
1071
1072 //! v_interp_p2_rtz_f16_f32 v42, v10, v20, v30 op_sel:[0,0,0,1] wait_exp:6 ; cd05462a 047a290a
1073 bld.vinterp_inreg(aco_opcode::v_interp_p2_rtz_f16_f32_inreg, dst, op0, op1, op2, 0x8, 6);
1074
1075 //! v_interp_p10_f32 v42, v10, v20, v30 clamp wait_exp:6 ; cd00862a 047a290a
1076 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1077 ->vinterp_inreg()
1078 .clamp = true;
1079
1080 finish_assembler_test();
1081 }
1082 END_TEST
1083
1084 BEGIN_TEST(assembler.ldsdir)
1085 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
1086 if (!setup_cs(NULL, gfx))
1087 continue;
1088
1089 Definition dst = bld.def(v1);
1090 dst.setFixed(PhysReg(256 + 42));
1091
1092 Operand op(bld.tmp(s1));
1093 op.setFixed(m0);
1094
1095 //! llvm_version: #llvm_ver
1096 fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
1097
1098 //~gfx11>> lds_direct_load v42 wait_vdst:15 ; ce1f002a
1099 //~gfx12>> ds_direct_load v42 wait_va_vdst:15 wait_vm_vsrc:1 ; ce9f002a
1100 bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vdst = 15;
1101
1102 //~gfx11! lds_direct_load v42 wait_vdst:6 ; ce16002a
1103 //~gfx12! ds_direct_load v42 wait_va_vdst:6 wait_vm_vsrc:1 ; ce96002a
1104 bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vdst = 6;
1105
1106 //; if llvm_ver >= 18 and variant == 'gfx11':
1107 //; insert_pattern('lds_direct_load v42 wait_vdst:0 ; ce10002a')
1108 //; elif variant == 'gfx11':
1109 //; insert_pattern('lds_direct_load v42 ; ce10002a')
1110 //~gfx12! ds_direct_load v42 wait_va_vdst:0 wait_vm_vsrc:1 ; ce90002a
1111 bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vdst = 0;
1112
1113 //~gfx11! lds_param_load v42, attr56.x wait_vdst:8 ; ce08e02a
1114 //~gfx12! ds_param_load v42, attr56.x wait_va_vdst:8 wait_vm_vsrc:1 ; ce88e02a
1115 bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0)->ldsdir().wait_vdst = 8;
1116
1117 //; if llvm_ver >= 18 and variant == 'gfx11':
1118 //; insert_pattern('lds_param_load v42, attr56.x wait_vdst:0 ; ce00e02a')
1119 //; elif variant == 'gfx11':
1120 //; insert_pattern('lds_param_load v42, attr56.x ; ce00e02a')
1121 //~gfx12! ds_param_load v42, attr56.x wait_va_vdst:0 wait_vm_vsrc:1 ; ce80e02a
1122 bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0)->ldsdir().wait_vdst = 0;
1123
1124 //~gfx11! lds_param_load v42, attr34.y wait_vdst:8 ; ce08892a
1125 //~gfx12! ds_param_load v42, attr34.y wait_va_vdst:8 wait_vm_vsrc:1 ; ce88892a
1126 bld.ldsdir(aco_opcode::lds_param_load, dst, op, 34, 1)->ldsdir().wait_vdst = 8;
1127
1128 //~gfx11! lds_param_load v42, attr12.z wait_vdst:8 ; ce08322a
1129 //~gfx12! ds_param_load v42, attr12.z wait_va_vdst:8 wait_vm_vsrc:1 ; ce88322a
1130 bld.ldsdir(aco_opcode::lds_param_load, dst, op, 12, 2)->ldsdir().wait_vdst = 8;
1131
1132 //~gfx11>> lds_direct_load v42 wait_vdst:15 ; ce1f002a
1133 //~gfx12>> ds_direct_load v42 wait_va_vdst:15 wait_vm_vsrc:0 ; ce1f002a
1134 bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vsrc = 0;
1135
1136 finish_assembler_test();
1137 }
1138 END_TEST
1139
1140 BEGIN_TEST(assembler.vop12c_v128)
1141 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
1142 if (!setup_cs(NULL, gfx))
1143 continue;
1144
1145 Definition dst_v0 = bld.def(v1);
1146 dst_v0.setFixed(PhysReg(256));
1147
1148 Definition dst_v128 = bld.def(v1);
1149 dst_v128.setFixed(PhysReg(256 + 128));
1150
1151 Operand op_v1(bld.tmp(v1));
1152 op_v1.setFixed(PhysReg(256 + 1));
1153
1154 Operand op_v2(bld.tmp(v1));
1155 op_v2.setFixed(PhysReg(256 + 2));
1156
1157 Operand op_v129(bld.tmp(v1));
1158 op_v129.setFixed(PhysReg(256 + 129));
1159
1160 Operand op_v130(bld.tmp(v1));
1161 op_v130.setFixed(PhysReg(256 + 130));
1162
1163 //! llvm_version: #llvm_ver
1164 fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
1165
1166 //>> BB0:
1167 //; if llvm_ver == 16:
1168 //; insert_pattern('v_mul_f16_e32 v0, v1, v2 ; Error: VGPR_32_Lo128: unknown register 128 ; 6a000501')
1169 //; else:
1170 //; insert_pattern('v_mul_f16_e32 v0, v1, v2 ; 6a000501')
1171 bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v2);
1172
1173 //! v_mul_f16_e64 v128, v1, v2 ; d5350080 00020501
1174 bld.vop2(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2);
1175
1176 //! v_mul_f16_e64 v0, v129, v2 ; d5350000 00020581
1177 bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2);
1178
1179 //! v_mul_f16_e64 v0, v1, v130 ; d5350000 00030501
1180 bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130);
1181
1182 //! v_rcp_f16_e64 v128, v1 ; d5d40080 00000101
1183 bld.vop1(aco_opcode::v_rcp_f16, dst_v128, op_v1);
1184
1185 //! v_cmp_eq_f16_e64 vcc, v129, v2 ; d402006a 00020581
1186 bld.vopc(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2);
1187
1188 //! v_mul_f16_e64_dpp v128, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350080 000204fa ff0d2101
1189 bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1));
1190
1191 //! v_mul_f16_e64_dpp v0, v129, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350000 000204fa ff0d2181
1192 bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2, dpp_row_rr(1));
1193
1194 //! v_mul_f16_e64_dpp v0, v1, v130 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350000 000304fa ff0d2101
1195 bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130, dpp_row_rr(1));
1196
1197 //! v_mul_f16_e64_dpp v128, v1, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5350080 000204ea 00000001
1198 bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2);
1199
1200 //! v_mul_f16_e64_dpp v0, v129, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5350000 000204ea 00000081
1201 bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2);
1202
1203 //! v_mul_f16_e64_dpp v0, v1, v130 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5350000 000304ea 00000001
1204 bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130);
1205
1206 //! v_fma_f16 v128, v1, v2, 0x60 ; d6480080 03fe0501 00000060
1207 bld.vop2(aco_opcode::v_fmaak_f16, dst_v128, op_v1, op_v2, Operand::literal32(96));
1208
1209 //! v_fma_f16 v128, v1, 0x60, v2 ; d6480080 0409ff01 00000060
1210 bld.vop2(aco_opcode::v_fmamk_f16, dst_v128, op_v1, op_v2, Operand::literal32(96));
1211
1212 //! v_rcp_f16_e64_dpp v128, -v1 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5d40080 200000fa ff1d2101
1213 bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, dpp_row_rr(1))->dpp16().neg[0] = true;
1214
1215 //! v_rcp_f16_e64_dpp v128, |v1| row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5d40180 000000fa ff2d2101
1216 bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, dpp_row_rr(1))->dpp16().abs[0] = true;
1217
1218 //! v_mul_f16_e64_dpp v128, -v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350080 200204fa ff1d2101
1219 bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().neg[0] =
1220 true;
1221
1222 //! v_mul_f16_e64_dpp v128, |v1|, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350180 000204fa ff2d2101
1223 bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().abs[0] =
1224 true;
1225
1226 //! v_cmp_eq_f16_e64_dpp vcc, -v129, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d402006a 200204fa ff1d2181
1227 bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1))
1228 ->dpp16()
1229 .neg[0] = true;
1230
1231 //! v_cmp_eq_f16_e64_dpp vcc, |v129|, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d402016a 000204fa ff2d2181
1232 bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1))
1233 ->dpp16()
1234 .abs[0] = true;
1235
1236 finish_assembler_test();
1237 }
1238 END_TEST
1239
1240 BEGIN_TEST(assembler.vop3_dpp)
1241 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
1242 if (!setup_cs(NULL, gfx))
1243 continue;
1244
1245 Definition dst_v0 = bld.def(v1);
1246 dst_v0.setFixed(PhysReg(256));
1247
1248 Definition dst_non_vcc = bld.def(s2);
1249 dst_non_vcc.setFixed(PhysReg(4));
1250
1251 Operand op_v1(bld.tmp(v1));
1252 op_v1.setFixed(PhysReg(256 + 1));
1253
1254 Operand op_v2(bld.tmp(v1));
1255 op_v2.setFixed(PhysReg(256 + 2));
1256
1257 Operand op_s1(bld.tmp(s1));
1258 op_s1.setFixed(PhysReg(1));
1259
1260 //>> BB0:
1261 //! v_fma_f32_e64_dpp v0, v1, v2, s1 clamp row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d6138000 000604fa ff0d2101
1262 bld.vop3_dpp(aco_opcode::v_fma_f32, dst_v0, op_v1, op_v2, op_s1, dpp_row_rr(1))->valu().clamp =
1263 true;
1264
1265 //! v_fma_mix_f32_e64_dpp v0, |v1|, |v2|, |s1| op_sel:[1,0,0] op_sel_hi:[1,0,1] row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; cc204f00 080604fa ffad2101
1266 bld.vop3p_dpp(aco_opcode::v_fma_mix_f32, dst_v0, op_v1, op_v2, op_s1, 0x1, 0x5, dpp_row_rr(1))
1267 ->valu()
1268 .abs = 0x7;
1269
1270 //! v_fma_f32_e64_dpp v0, -v1, -v2, -s1 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d6130000 e00604ea 00000001
1271 bld.vop3_dpp8(aco_opcode::v_fma_f32, dst_v0, op_v1, op_v2, op_s1)->valu().neg = 0x7;
1272
1273 //! v_fma_mix_f32_e64_dpp v0, -v1, -v2, s1 op_sel_hi:[1,1,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; cc204000 780604ea 00000001
1274 bld.vop3p_dpp8(aco_opcode::v_fma_mix_f32, dst_v0, op_v1, op_v2, op_s1, 0x0, 0x7)->valu().neg =
1275 0x3;
1276
1277 //! v_add_f32_e64_dpp v0, v1, v2 clamp row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5038000 000204fa ff0d2101
1278 bld.vop2_e64_dpp(aco_opcode::v_add_f32, dst_v0, op_v1, op_v2, dpp_row_rr(1))->valu().clamp =
1279 true;
1280
1281 //! v_sqrt_f32_e64_dpp v0, v1 clamp row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5b38000 000000fa ff0d2101
1282 bld.vop1_e64_dpp(aco_opcode::v_sqrt_f32, dst_v0, op_v1, dpp_row_rr(1))->valu().clamp = true;
1283
1284 //! v_cmp_lt_f32_e64_dpp s[4:5], |v1|, |v2| row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d4110304 000204fa ffad2101
1285 bld.vopc_e64_dpp(aco_opcode::v_cmp_lt_f32, dst_non_vcc, op_v1, op_v2, dpp_row_rr(1))
1286 ->valu()
1287 .abs = 0x3;
1288
1289 //! v_add_f32_e64_dpp v0, v1, v2 mul:4 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5030000 100204ea 00000001
1290 bld.vop2_e64_dpp8(aco_opcode::v_add_f32, dst_v0, op_v1, op_v2)->valu().omod = 2;
1291
1292 //! v_sqrt_f32_e64_dpp v0, v1 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5b38000 000000ea 00000001
1293 bld.vop1_e64_dpp8(aco_opcode::v_sqrt_f32, dst_v0, op_v1)->valu().clamp = true;
1294
1295 //! v_cmp_lt_f32_e64_dpp s[4:5], |v1|, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d4110104 000204ea 00000001
1296 bld.vopc_e64_dpp8(aco_opcode::v_cmp_lt_f32, dst_non_vcc, op_v1, op_v2)->valu().abs = 0x1;
1297
1298 finish_assembler_test();
1299 }
1300 END_TEST
1301
1302 BEGIN_TEST(assembler.vopd)
1303 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
1304 if (!setup_cs(NULL, gfx))
1305 continue;
1306
1307 Definition dst_v0 = bld.def(v1);
1308 dst_v0.setFixed(PhysReg(256));
1309
1310 Definition dst_v1 = bld.def(v1);
1311 dst_v1.setFixed(PhysReg(256 + 1));
1312
1313 Operand op_v0(bld.tmp(v1));
1314 op_v0.setFixed(PhysReg(256 + 0));
1315
1316 Operand op_v1(bld.tmp(v1));
1317 op_v1.setFixed(PhysReg(256 + 1));
1318
1319 Operand op_v2(bld.tmp(v1));
1320 op_v2.setFixed(PhysReg(256 + 2));
1321
1322 Operand op_v3(bld.tmp(v1));
1323 op_v3.setFixed(PhysReg(256 + 3));
1324
1325 Operand op_s0(bld.tmp(s1));
1326 op_s0.setFixed(PhysReg(0));
1327
1328 Operand op_vcc(bld.tmp(s1));
1329 op_vcc.setFixed(vcc);
1330
1331 //>> BB0:
1332 //! v_dual_mov_b32 v0, v0 :: v_dual_mov_b32 v1, v1 ; ca100100 00000101
1333 bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1,
1334 aco_opcode::v_dual_mov_b32);
1335
1336 //! v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v1, s0 ; ca1000ff 00000000 00000060
1337 bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, Operand::c32(96), op_s0,
1338 aco_opcode::v_dual_mov_b32);
1339
1340 //! v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0x60 ; ca100000 000000ff 00000060
1341 bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_s0, Operand::c32(96),
1342 aco_opcode::v_dual_mov_b32);
1343
1344 //! v_dual_mul_f32 v0, v0, v1 :: v_dual_mov_b32 v1, v2 ; c8d00300 00000102
1345 bld.vopd(aco_opcode::v_dual_mul_f32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
1346 aco_opcode::v_dual_mov_b32);
1347
1348 //! v_dual_fmac_f32 v0, v1, v2 :: v_dual_mov_b32 v1, v3 ; c8100501 00000103
1349 bld.vopd(aco_opcode::v_dual_fmac_f32, dst_v0, dst_v1, op_v1, op_v2, op_v0, op_v3,
1350 aco_opcode::v_dual_mov_b32);
1351
1352 //! v_dual_mov_b32 v0, v0 :: v_dual_and_b32 v1, v1, v2 ; ca240100 00000501
1353 bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
1354 aco_opcode::v_dual_and_b32);
1355
1356 //! v_dual_cndmask_b32 v0, v0, v1 :: v_dual_cndmask_b32 v1, v2, v3 ; ca520300 00000702
1357 bld.vopd(aco_opcode::v_dual_cndmask_b32, dst_v0, dst_v1, op_v0, op_v1, op_vcc, op_v2, op_v3,
1358 op_vcc, aco_opcode::v_dual_cndmask_b32);
1359
1360 finish_assembler_test();
1361 }
1362 END_TEST
1363
1364 BEGIN_TEST(assembler.pseudo_scalar_trans)
1365 if (LLVM_VERSION_MAJOR < 19 || !setup_cs(NULL, GFX12))
1366 return;
1367
1368 //>> v_s_sqrt_f32 s5, s1 ; d6880005 00000001
1369 bld.vop3(aco_opcode::v_s_sqrt_f32, Definition(PhysReg(5), s1), Operand(PhysReg(1), s1));
1370
1371 finish_assembler_test();
1372 END_TEST
1373
1374 BEGIN_TEST(assembler.vintrp_high_16bits)
1375 for (unsigned i = GFX8; i <= GFX10; i++) {
1376 if (!setup_cs(NULL, (amd_gfx_level)i))
1377 continue;
1378
1379 Definition dst_v0 = bld.def(v1);
1380 dst_v0.setFixed(PhysReg(256));
1381
1382 Definition dst_v1 = bld.def(v1);
1383 dst_v1.setFixed(PhysReg(256 + 1));
1384
1385 Operand op_v0(bld.tmp(v1));
1386 op_v0.setFixed(PhysReg(256 + 0));
1387
1388 Operand op_v1(bld.tmp(v1));
1389 op_v1.setFixed(PhysReg(256 + 1));
1390
1391 Operand op_v2(bld.tmp(v1));
1392 op_v2.setFixed(PhysReg(256 + 2));
1393
1394 Operand op_m0(bld.tmp(s1));
1395 op_m0.setFixed(m0);
1396
1397 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
1398
1399 if (bld.program->gfx_level == GFX8)
1400 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
1401
1402 //! BB0:
1403 //~gfx8! v_interp_p1ll_f16 v0, v1, attr4.y high ; d2740000 00020344
1404 //~gfx9! v_interp_p1ll_f16 v0, v1, attr4.y high ; d2740000 00020344
1405 //~gfx10! v_interp_p1ll_f16 v0, v1, attr4.y high ; d7420000 00020344
1406 bld.vintrp(aco_opcode::v_interp_p1ll_f16, dst_v0, op_v1, op_m0, 4, 1, true);
1407
1408 //~gfx8! v_interp_p2_f16 v1, v2, attr4.y, v0 high ; d2760001 04020544
1409 //~gfx9! v_interp_p2_f16 v1, v2, attr4.y, v0 high ; d2770001 04020544
1410 //~gfx10! v_interp_p2_f16 v1, v2, attr4.y, v0 high ; d75a0001 04020544
1411 bld.vintrp(interp_p2_op, dst_v1, op_v2, op_m0, op_v0, 4, 1, true);
1412
1413 finish_assembler_test();
1414 }
1415 END_TEST
1416