1 /*
2 * Copyright © 2020 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "aco_builder.h"
8 #include "aco_ir.h"
9
10 #include <vector>
11
12 namespace aco {
13 namespace {
14
/* Categories used to decide which adjacent memory instructions may share a
 * hard clause. LDS and VALU clauses also exist, but they are not considered
 * interesting here and have no category.
 */
enum clause_type {
   clause_smem,  /* SMEM instructions that have operands */
   clause_other, /* anything that is never placed into a clause */

   /* GFX10 (used by get_type() for everything below GFX11): */
   clause_vmem, /* VMEM with operands, scratch and global */
   clause_flat, /* flat */

   /* GFX11 (used by get_type() for GFX11 and newer): */
   clause_mimg_load,   /* MIMG with definitions that is not a sample */
   clause_mimg_store,  /* MIMG without definitions */
   clause_mimg_atomic, /* image_atomic_* */
   clause_mimg_sample, /* MIMG with definitions and a defined s4 operand 1 */
   clause_vmem_load,   /* MTBUF/MUBUF/scratch/global with definitions */
   clause_vmem_store,  /* MTBUF/MUBUF/scratch/global without definitions */
   clause_vmem_atomic, /* buffer_atomic_* and global_atomic_* */
   clause_flat_load,   /* flat with definitions */
   clause_flat_store,  /* flat without definitions */
   clause_flat_atomic, /* flat_atomic_* */
   clause_bvh,         /* image_bvh(64)_intersect_ray */
};
35
36 void
emit_clause(Builder & bld,unsigned num_instrs,aco_ptr<Instruction> * instrs)37 emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction>* instrs)
38 {
39 unsigned start = 0;
40 unsigned end = num_instrs;
41
42 if (bld.program->gfx_level < GFX11) {
43 /* skip any stores at the start */
44 for (; (start < num_instrs) && instrs[start]->definitions.empty(); start++)
45 bld.insert(std::move(instrs[start]));
46
47 for (end = start; (end < num_instrs) && !instrs[end]->definitions.empty(); end++)
48 ;
49 }
50
51 unsigned clause_size = end - start;
52 if (clause_size > 1)
53 bld.sopp(aco_opcode::s_clause, clause_size - 1);
54
55 for (unsigned i = start; i < num_instrs; i++)
56 bld.insert(std::move(instrs[i]));
57 }
58
59 clause_type
get_type(Program * program,aco_ptr<Instruction> & instr)60 get_type(Program* program, aco_ptr<Instruction>& instr)
61 {
62 if (instr->isSMEM() && !instr->operands.empty())
63 return clause_smem;
64
65 if (program->gfx_level >= GFX11) {
66 if (instr->isMIMG()) {
67 switch (instr->opcode) {
68 case aco_opcode::image_bvh_intersect_ray:
69 case aco_opcode::image_bvh64_intersect_ray: return clause_bvh;
70 case aco_opcode::image_atomic_swap:
71 case aco_opcode::image_atomic_cmpswap:
72 case aco_opcode::image_atomic_add:
73 case aco_opcode::image_atomic_sub:
74 case aco_opcode::image_atomic_rsub:
75 case aco_opcode::image_atomic_smin:
76 case aco_opcode::image_atomic_umin:
77 case aco_opcode::image_atomic_smax:
78 case aco_opcode::image_atomic_umax:
79 case aco_opcode::image_atomic_and:
80 case aco_opcode::image_atomic_or:
81 case aco_opcode::image_atomic_xor:
82 case aco_opcode::image_atomic_inc:
83 case aco_opcode::image_atomic_dec:
84 case aco_opcode::image_atomic_fcmpswap:
85 case aco_opcode::image_atomic_fmin:
86 case aco_opcode::image_atomic_fmax: return clause_mimg_atomic;
87 default:
88 if (instr->definitions.empty())
89 return clause_mimg_store;
90 else
91 return !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4
92 ? clause_mimg_sample
93 : clause_mimg_load;
94 }
95 } else if (instr->isMTBUF() || instr->isScratch()) {
96 return instr->definitions.empty() ? clause_vmem_store : clause_vmem_load;
97 } else if (instr->isMUBUF()) {
98 switch (instr->opcode) {
99 case aco_opcode::buffer_atomic_add:
100 case aco_opcode::buffer_atomic_and_x2:
101 case aco_opcode::buffer_atomic_rsub:
102 case aco_opcode::buffer_atomic_umax:
103 case aco_opcode::buffer_atomic_dec:
104 case aco_opcode::buffer_atomic_smax:
105 case aco_opcode::buffer_atomic_fmax:
106 case aco_opcode::buffer_atomic_rsub_x2:
107 case aco_opcode::buffer_atomic_smin:
108 case aco_opcode::buffer_atomic_sub:
109 case aco_opcode::buffer_atomic_sub_x2:
110 case aco_opcode::buffer_atomic_xor_x2:
111 case aco_opcode::buffer_atomic_add_f32:
112 case aco_opcode::buffer_atomic_inc:
113 case aco_opcode::buffer_atomic_swap_x2:
114 case aco_opcode::buffer_atomic_cmpswap:
115 case aco_opcode::buffer_atomic_fmin_x2:
116 case aco_opcode::buffer_atomic_umin:
117 case aco_opcode::buffer_atomic_or:
118 case aco_opcode::buffer_atomic_umax_x2:
119 case aco_opcode::buffer_atomic_smin_x2:
120 case aco_opcode::buffer_atomic_umin_x2:
121 case aco_opcode::buffer_atomic_cmpswap_x2:
122 case aco_opcode::buffer_atomic_add_x2:
123 case aco_opcode::buffer_atomic_swap:
124 case aco_opcode::buffer_atomic_and:
125 case aco_opcode::buffer_atomic_fmin:
126 case aco_opcode::buffer_atomic_fcmpswap_x2:
127 case aco_opcode::buffer_atomic_or_x2:
128 case aco_opcode::buffer_atomic_fcmpswap:
129 case aco_opcode::buffer_atomic_xor:
130 case aco_opcode::buffer_atomic_dec_x2:
131 case aco_opcode::buffer_atomic_fmax_x2:
132 case aco_opcode::buffer_atomic_csub:
133 case aco_opcode::buffer_atomic_inc_x2:
134 case aco_opcode::buffer_atomic_smax_x2: return clause_vmem_atomic;
135 default: return instr->definitions.empty() ? clause_vmem_store : clause_vmem_load;
136 }
137 } else if (instr->isGlobal()) {
138 switch (instr->opcode) {
139 case aco_opcode::global_atomic_swap:
140 case aco_opcode::global_atomic_umax:
141 case aco_opcode::global_atomic_cmpswap:
142 case aco_opcode::global_atomic_and_x2:
143 case aco_opcode::global_atomic_fmax:
144 case aco_opcode::global_atomic_smax_x2:
145 case aco_opcode::global_atomic_fmax_x2:
146 case aco_opcode::global_atomic_dec:
147 case aco_opcode::global_atomic_dec_x2:
148 case aco_opcode::global_atomic_umin:
149 case aco_opcode::global_atomic_fcmpswap_x2:
150 case aco_opcode::global_atomic_inc:
151 case aco_opcode::global_atomic_and:
152 case aco_opcode::global_atomic_fmin:
153 case aco_opcode::global_atomic_fcmpswap:
154 case aco_opcode::global_atomic_or_x2:
155 case aco_opcode::global_atomic_smax:
156 case aco_opcode::global_atomic_sub:
157 case aco_opcode::global_atomic_xor:
158 case aco_opcode::global_atomic_swap_x2:
159 case aco_opcode::global_atomic_umax_x2:
160 case aco_opcode::global_atomic_umin_x2:
161 case aco_opcode::global_atomic_xor_x2:
162 case aco_opcode::global_atomic_inc_x2:
163 case aco_opcode::global_atomic_fmin_x2:
164 case aco_opcode::global_atomic_add_f32:
165 case aco_opcode::global_atomic_add:
166 case aco_opcode::global_atomic_or:
167 case aco_opcode::global_atomic_add_x2:
168 case aco_opcode::global_atomic_smin_x2:
169 case aco_opcode::global_atomic_smin:
170 case aco_opcode::global_atomic_csub:
171 case aco_opcode::global_atomic_sub_x2:
172 case aco_opcode::global_atomic_cmpswap_x2: return clause_vmem_atomic;
173 default: return instr->definitions.empty() ? clause_vmem_store : clause_vmem_load;
174 }
175 } else if (instr->isFlat()) {
176 switch (instr->opcode) {
177 case aco_opcode::flat_atomic_smax:
178 case aco_opcode::flat_atomic_fcmpswap_x2:
179 case aco_opcode::flat_atomic_inc_x2:
180 case aco_opcode::flat_atomic_dec:
181 case aco_opcode::flat_atomic_fmin:
182 case aco_opcode::flat_atomic_umax_x2:
183 case aco_opcode::flat_atomic_add_f32:
184 case aco_opcode::flat_atomic_or:
185 case aco_opcode::flat_atomic_smax_x2:
186 case aco_opcode::flat_atomic_umin:
187 case aco_opcode::flat_atomic_sub:
188 case aco_opcode::flat_atomic_swap:
189 case aco_opcode::flat_atomic_swap_x2:
190 case aco_opcode::flat_atomic_cmpswap_x2:
191 case aco_opcode::flat_atomic_fcmpswap:
192 case aco_opcode::flat_atomic_add:
193 case aco_opcode::flat_atomic_umin_x2:
194 case aco_opcode::flat_atomic_xor_x2:
195 case aco_opcode::flat_atomic_smin:
196 case aco_opcode::flat_atomic_fmax_x2:
197 case aco_opcode::flat_atomic_cmpswap:
198 case aco_opcode::flat_atomic_dec_x2:
199 case aco_opcode::flat_atomic_sub_x2:
200 case aco_opcode::flat_atomic_add_x2:
201 case aco_opcode::flat_atomic_umax:
202 case aco_opcode::flat_atomic_xor:
203 case aco_opcode::flat_atomic_and_x2:
204 case aco_opcode::flat_atomic_inc:
205 case aco_opcode::flat_atomic_and:
206 case aco_opcode::flat_atomic_fmin_x2:
207 case aco_opcode::flat_atomic_smin_x2:
208 case aco_opcode::flat_atomic_or_x2:
209 case aco_opcode::flat_atomic_fmax: return clause_flat_atomic;
210 default: return instr->definitions.empty() ? clause_flat_store : clause_flat_load;
211 }
212 }
213 } else {
214 if (instr->isVMEM() && !instr->operands.empty()) {
215 if (program->gfx_level == GFX10 && instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
216 return clause_other;
217 else
218 return clause_vmem;
219 } else if (instr->isScratch() || instr->isGlobal()) {
220 return clause_vmem;
221 } else if (instr->isFlat()) {
222 return clause_flat;
223 }
224 }
225 return clause_other;
226 }
227
228 } /* end namespace */
229
230 void
form_hard_clauses(Program * program)231 form_hard_clauses(Program* program)
232 {
233 /* The ISA documentation says 63 is the maximum for GFX11/12, but according to
234 * LLVM there are HW bugs with more than 32 instructions.
235 */
236 const unsigned max_clause_length = program->gfx_level >= GFX11 ? 32 : 63;
237 for (Block& block : program->blocks) {
238 unsigned num_instrs = 0;
239 aco_ptr<Instruction> current_instrs[63];
240 clause_type current_type = clause_other;
241
242 std::vector<aco_ptr<Instruction>> new_instructions;
243 new_instructions.reserve(block.instructions.size());
244 Builder bld(program, &new_instructions);
245
246 for (unsigned i = 0; i < block.instructions.size(); i++) {
247 aco_ptr<Instruction>& instr = block.instructions[i];
248
249 clause_type type = get_type(program, instr);
250 if (type != current_type || num_instrs == max_clause_length ||
251 (num_instrs && !should_form_clause(current_instrs[0].get(), instr.get()))) {
252 emit_clause(bld, num_instrs, current_instrs);
253 num_instrs = 0;
254 current_type = type;
255 }
256
257 if (type == clause_other) {
258 bld.insert(std::move(instr));
259 continue;
260 }
261
262 current_instrs[num_instrs++] = std::move(instr);
263 }
264
265 emit_clause(bld, num_instrs, current_instrs);
266
267 block.instructions = std::move(new_instructions);
268 }
269 }
270 } // namespace aco
271