/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include <llvm-c/Core.h>
#include <llvm/Analysis/TargetLibraryInfo.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Verifier.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/MC/MCSubtargetInfo.h>
#include <llvm/Support/CommandLine.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/CodeGen/Passes.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/InstCombine/InstCombine.h>
#include <llvm/Transforms/IPO/SCCP.h>
#include "llvm/CodeGen/SelectionDAGNodes.h"

#include <cstring>

/* DO NOT REORDER THE HEADERS
 * The LLVM headers need to all be included before any Mesa header,
 * as they use the `restrict` keyword in ways that are incompatible
 * with our #define in include/c99_compat.h
 */

#include "ac_binary.h"
#include "ac_llvm_util.h"
#include "ac_llvm_build.h"
#include "util/macros.h"

using namespace llvm;
class RunAtExitForStaticDestructors : public SDNode
{
public:
   /* getSDVTList (protected) calls getValueTypeList (private), which contains static variables. */
   RunAtExitForStaticDestructors() : SDNode(0, 0, DebugLoc(), getSDVTList(MVT::Other))
   {
   }
};

void ac_llvm_run_atexit_for_destructors(void)
{
   /* LLVM >= 16 registers static variable destructors on the first compile, which gcc
    * implements by calling atexit there. Before that, u_queue registers its atexit
    * handler to kill all threads. Since exit() runs atexit handlers in reverse order
    * of registration, the LLVM destructors are called first while shader compiler
    * threads may still be running, which crashes in LLVM in SelectionDAG.cpp.
    *
    * The solution is to run the code that declares the LLVM static variables first,
    * so that atexit for LLVM is registered first and u_queue is registered after that,
    * which ensures that all u_queue threads are terminated before LLVM destructors are
    * called.
    *
    * This just executes the code that declares static variables.
    */
   RunAtExitForStaticDestructors();
}

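/* A minimal sketch of the intended call order during driver init (the exact
 * init sequence is illustrative, not prescribed by this file):
 *
 *    ac_llvm_run_atexit_for_destructors(); // LLVM's atexit is registered first
 *    util_queue_init(...);                 // u_queue's atexit then runs before LLVM's
 */
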
bool ac_is_llvm_processor_supported(LLVMTargetMachineRef tm, const char *processor)
{
   TargetMachine *TM = reinterpret_cast<TargetMachine *>(tm);
   return TM->getMCSubtargetInfo()->isCPUStringValid(processor);
}

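/* Usage sketch: probe whether this LLVM build knows a target CPU before
 * requesting it (the processor name below is illustrative):
 *
 *    if (!ac_is_llvm_processor_supported(tm, "gfx1100"))
 *       ... fall back or report the chip as unsupported ...
 */
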
/* Allow LLVM command-line options to be parsed again later; cl::opt occurrence
 * counts are sticky after the first parse.
 */
void ac_reset_llvm_all_options_occurrences()
{
   cl::ResetAllOptionOccurrences();
}

void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes)
{
   Argument *A = unwrap<Argument>(val);
   A->addAttr(Attribute::getWithDereferenceableBytes(A->getContext(), bytes));
}

void ac_add_attr_alignment(LLVMValueRef val, uint64_t bytes)
{
   Argument *A = unwrap<Argument>(val);
   A->addAttr(Attribute::getWithAlignment(A->getContext(), Align(bytes)));
}

/* On AMDGPU, function arguments marked "inreg" are passed in SGPRs. */
bool ac_is_sgpr_param(LLVMValueRef arg)
{
   Argument *A = unwrap<Argument>(arg);
   AttributeList AS = A->getParent()->getAttributes();
   unsigned ArgNo = A->getArgNo();
   return AS.hasParamAttr(ArgNo, Attribute::InReg);
}

LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx)
{
   TargetMachine *TM = reinterpret_cast<TargetMachine *>(tm);
   LLVMModuleRef module = LLVMModuleCreateWithNameInContext("mesa-shader", ctx);

   unwrap(module)->setTargetTriple(TM->getTargetTriple().getTriple());
   unwrap(module)->setDataLayout(TM->createDataLayout());
   return module;
}

LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, enum ac_float_mode float_mode)
{
   LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx);

   FastMathFlags flags;

   switch (float_mode) {
   case AC_FLOAT_MODE_DEFAULT:
   case AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO:
      break;

   case AC_FLOAT_MODE_DEFAULT_OPENGL:
      /* Allow optimizations to treat the sign of a zero argument or
       * result as insignificant.
       */
      flags.setNoSignedZeros(); /* nsz */

      /* Allow optimizations to use the reciprocal of an argument
       * rather than perform division.
       */
      flags.setAllowReciprocal(); /* arcp */

      unwrap(builder)->setFastMathFlags(flags);
      break;
   }

   return builder;
}

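/* Sketch of how a module and builder are typically created together
 * (variable names are illustrative):
 *
 *    LLVMModuleRef mod = ac_create_module(tm, ctx);
 *    LLVMBuilderRef b = ac_create_builder(ctx, AC_FLOAT_MODE_DEFAULT_OPENGL);
 *    ... emit IR into mod through b ...
 */
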
void ac_enable_signed_zeros(struct ac_llvm_context *ctx)
{
   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
      auto *b = unwrap(ctx->builder);
      FastMathFlags flags = b->getFastMathFlags();

      /* This disables the optimization of (x + 0), which is used
       * to convert negative zero to positive zero.
       */
      flags.setNoSignedZeros(false);
      b->setFastMathFlags(flags);
   }
}

void ac_disable_signed_zeros(struct ac_llvm_context *ctx)
{
   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
      auto *b = unwrap(ctx->builder);
      FastMathFlags flags = b->getFastMathFlags();

      flags.setNoSignedZeros();
      b->setFastMathFlags(flags);
   }
}

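/* These are meant to be used as a matched pair around IR that must honor the
 * sign of zero, e.g. an (x + 0) that canonicalizes -0 to +0 (sketch; "x" and
 * "zero" are illustrative values):
 *
 *    ac_enable_signed_zeros(ctx);
 *    result = LLVMBuildFAdd(ctx->builder, x, zero, "");
 *    ac_disable_signed_zeros(ctx);
 */
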
LLVMTargetLibraryInfoRef ac_create_target_library_info(const char *triple)
{
   return reinterpret_cast<LLVMTargetLibraryInfoRef>(
      new TargetLibraryInfoImpl(Triple(triple)));
}

void ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info)
{
   delete reinterpret_cast<TargetLibraryInfoImpl *>(library_info);
}

/* Implementation of raw_pwrite_stream that works on malloc()ed memory for
 * better compatibility with C code. */
struct raw_memory_ostream : public raw_pwrite_stream {
   char *buffer;
   size_t written;
   size_t bufsize;

   raw_memory_ostream()
   {
      buffer = NULL;
      written = 0;
      bufsize = 0;
      SetUnbuffered();
   }

   ~raw_memory_ostream()
   {
      free(buffer);
   }

   void clear()
   {
      written = 0;
   }

   /* Transfer ownership of the malloc()ed buffer to the caller. */
   void take(char *&out_buffer, size_t &out_size)
   {
      out_buffer = buffer;
      out_size = written;
      buffer = NULL;
      written = 0;
      bufsize = 0;
   }

   void flush() = delete;

   void write_impl(const char *ptr, size_t size) override
   {
      /* Abort if written + size overflows. */
      if (unlikely(written + size < written))
         abort();
      if (written + size > bufsize) {
         /* Grow by at least 4/3 to keep the number of reallocations low. */
         bufsize = MAX3(1024, written + size, bufsize / 3 * 4);
         buffer = (char *)realloc(buffer, bufsize);
         if (!buffer) {
            fprintf(stderr, "amd: out of memory allocating ELF buffer\n");
            abort();
         }
      }
      memcpy(buffer + written, ptr, size);
      written += size;
   }

   void pwrite_impl(const char *ptr, size_t size, uint64_t offset) override
   {
      assert(offset == (size_t)offset && offset + size >= offset && offset + size <= written);
      memcpy(buffer + offset, ptr, size);
   }

   uint64_t current_pos() const override
   {
      return written;
   }
};

/* The LLVM compiler is represented as a pass manager containing passes for
 * optimizations, instruction selection, and code generation.
 */
struct ac_compiler_passes {
   raw_memory_ostream ostream; /* ELF shader binary stream */
   legacy::PassManager passmgr; /* list of passes */
};

struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm)
{
   struct ac_compiler_passes *p = new ac_compiler_passes();
   if (!p)
      return NULL;

   TargetMachine *TM = reinterpret_cast<TargetMachine *>(tm);

   if (TM->addPassesToEmitFile(p->passmgr, p->ostream, nullptr,
#if LLVM_VERSION_MAJOR >= 18
                               CodeGenFileType::ObjectFile)) {
#else
                               CGFT_ObjectFile)) {
#endif
      fprintf(stderr, "amd: TargetMachine can't emit a file of this type!\n");
      delete p;
      return NULL;
   }
   return p;
}

void ac_destroy_llvm_passes(struct ac_compiler_passes *p)
{
   delete p;
}

/* This returns false on failure. */
bool ac_compile_module_to_elf(struct ac_compiler_passes *p, LLVMModuleRef module,
                              char **pelf_buffer, size_t *pelf_size)
{
   p->passmgr.run(*unwrap(module));
   p->ostream.take(*pelf_buffer, *pelf_size);
   return true;
}

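/* Typical compile flow (sketch; error handling omitted, names illustrative):
 *
 *    struct ac_compiler_passes *p = ac_create_llvm_passes(tm);
 *    char *elf; size_t elf_size;
 *    if (ac_compile_module_to_elf(p, module, &elf, &elf_size)) {
 *       ... parse/upload the ELF ...
 *       free(elf);
 *    }
 *    ac_destroy_llvm_passes(p);
 */
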
LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef target_library_info,
                                     bool check_ir)
{
   LLVMPassManagerRef passmgr = LLVMCreatePassManager();
   if (!passmgr)
      return NULL;

   if (target_library_info)
      LLVMAddTargetLibraryInfo(target_library_info, passmgr);

   if (check_ir)
      unwrap(passmgr)->add(createVerifierPass());

   unwrap(passmgr)->add(createAlwaysInlinerLegacyPass());

   /* Normally, the pass manager runs all passes on one function before
    * moving on to another. Adding a barrier no-op pass forces the pass
    * manager to run the inliner on all functions first, which makes sure
    * that the following passes are only run on the remaining non-inline
    * functions, so it removes useless work done on dead inline functions.
    */
   unwrap(passmgr)->add(createBarrierNoopPass());

#if LLVM_VERSION_MAJOR >= 16
   unwrap(passmgr)->add(createSROAPass(true));
#else
   unwrap(passmgr)->add(createSROAPass());
#endif
   /* TODO: restore IPSCCP */
   unwrap(passmgr)->add(createLICMPass());
   unwrap(passmgr)->add(createCFGSimplificationPass());
   /* This is recommended by the instruction combining pass. */
   unwrap(passmgr)->add(createEarlyCSEPass(true));
   unwrap(passmgr)->add(createInstructionCombiningPass());
   return passmgr;
}

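/* Usage sketch with the LLVM-C pass-manager API:
 *
 *    LLVMPassManagerRef pm = ac_create_passmgr(tli, check_ir);
 *    LLVMRunPassManager(pm, module);
 *    LLVMDisposePassManager(pm);
 */
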
LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op,
                                 LLVMValueRef ptr, LLVMValueRef val, const char *sync_scope)
{
   AtomicRMWInst::BinOp binop;
   switch (op) {
   case LLVMAtomicRMWBinOpXchg:
      binop = AtomicRMWInst::Xchg;
      break;
   case LLVMAtomicRMWBinOpAdd:
      binop = AtomicRMWInst::Add;
      break;
   case LLVMAtomicRMWBinOpSub:
      binop = AtomicRMWInst::Sub;
      break;
   case LLVMAtomicRMWBinOpAnd:
      binop = AtomicRMWInst::And;
      break;
   case LLVMAtomicRMWBinOpNand:
      binop = AtomicRMWInst::Nand;
      break;
   case LLVMAtomicRMWBinOpOr:
      binop = AtomicRMWInst::Or;
      break;
   case LLVMAtomicRMWBinOpXor:
      binop = AtomicRMWInst::Xor;
      break;
   case LLVMAtomicRMWBinOpMax:
      binop = AtomicRMWInst::Max;
      break;
   case LLVMAtomicRMWBinOpMin:
      binop = AtomicRMWInst::Min;
      break;
   case LLVMAtomicRMWBinOpUMax:
      binop = AtomicRMWInst::UMax;
      break;
   case LLVMAtomicRMWBinOpUMin:
      binop = AtomicRMWInst::UMin;
      break;
   case LLVMAtomicRMWBinOpFAdd:
      binop = AtomicRMWInst::FAdd;
      break;
   default:
      unreachable("invalid LLVMAtomicRMWBinOp");
      break;
   }
   unsigned SSID = unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope);
   return wrap(unwrap(ctx->builder)
                  ->CreateAtomicRMW(binop, unwrap(ptr), unwrap(val),
                                    MaybeAlign(0),
                                    AtomicOrdering::SequentiallyConsistent, SSID));
}

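/* Example (sketch): a sequentially consistent atomic add on a 32-bit counter.
 * "counter_ptr" is illustrative; "agent" is one of the AMDGPU sync scopes.
 *
 *    LLVMValueRef old = ac_build_atomic_rmw(ctx, LLVMAtomicRMWBinOpAdd,
 *                                           counter_ptr, ctx->i32_1, "agent");
 */
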
LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr,
                                      LLVMValueRef cmp, LLVMValueRef val, const char *sync_scope)
{
   unsigned SSID = unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope);
   return wrap(unwrap(ctx->builder)
                  ->CreateAtomicCmpXchg(unwrap(ptr), unwrap(cmp),
                                        unwrap(val),
                                        MaybeAlign(0),
                                        AtomicOrdering::SequentiallyConsistent,
                                        AtomicOrdering::SequentiallyConsistent, SSID));
}