1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64RegisterBankInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
20 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
21 #include "llvm/CodeGen/GlobalISel/Utils.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineRegisterInfo.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/IntrinsicsAArch64.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <initializer_list>
32
33 #define DEBUG_TYPE "aarch64-legalinfo"
34
35 using namespace llvm;
36 using namespace LegalizeActions;
37 using namespace LegalizeMutations;
38 using namespace LegalityPredicates;
39 using namespace MIPatternMatch;
40
41 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
42 : ST(&ST) {
43 using namespace TargetOpcode;
44 const LLT p0 = LLT::pointer(0, 64);
45 const LLT s8 = LLT::scalar(8);
46 const LLT s16 = LLT::scalar(16);
47 const LLT s32 = LLT::scalar(32);
48 const LLT s64 = LLT::scalar(64);
49 const LLT s128 = LLT::scalar(128);
50 const LLT v16s8 = LLT::fixed_vector(16, 8);
51 const LLT v8s8 = LLT::fixed_vector(8, 8);
52 const LLT v4s8 = LLT::fixed_vector(4, 8);
53 const LLT v8s16 = LLT::fixed_vector(8, 16);
54 const LLT v4s16 = LLT::fixed_vector(4, 16);
55 const LLT v2s16 = LLT::fixed_vector(2, 16);
56 const LLT v2s32 = LLT::fixed_vector(2, 32);
57 const LLT v4s32 = LLT::fixed_vector(4, 32);
58 const LLT v2s64 = LLT::fixed_vector(2, 64);
59 const LLT v2p0 = LLT::fixed_vector(2, p0);
60
61 std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
62 v16s8, v8s16, v4s32,
63 v2s64, v2p0,
64 /* End 128bit types */
65 /* Begin 64bit types */
66 v8s8, v4s16, v2s32};
67
68 const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
69
70 // FIXME: support subtargets which have neon/fp-armv8 disabled.
71 if (!ST.hasNEON() || !ST.hasFPARMv8()) {
72 getLegacyLegalizerInfo().computeTables();
73 return;
74 }
75
76 // Some instructions only support s16 if the subtarget has full 16-bit FP
77 // support.
78 const bool HasFP16 = ST.hasFullFP16();
79 const LLT &MinFPScalar = HasFP16 ? s16 : s32;
80
81 const bool HasCSSC = ST.hasCSSC();
82
83 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
84 .legalFor({p0, s8, s16, s32, s64})
85 .legalFor(PackedVectorAllTypeList)
86 .widenScalarToNextPow2(0)
87 .clampScalar(0, s8, s64)
88 .fewerElementsIf(
89 [=](const LegalityQuery &Query) {
90 return Query.Types[0].isVector() &&
91 (Query.Types[0].getElementType() != s64 ||
92 Query.Types[0].getNumElements() != 2);
93 },
94 [=](const LegalityQuery &Query) {
95 LLT EltTy = Query.Types[0].getElementType();
96 if (EltTy == s64)
97 return std::make_pair(0, LLT::fixed_vector(2, 64));
98 return std::make_pair(0, EltTy);
99 });
100
101 getActionDefinitionsBuilder(G_PHI)
102 .legalFor({p0, s16, s32, s64})
103 .legalFor(PackedVectorAllTypeList)
104 .widenScalarToNextPow2(0)
105 .clampScalar(0, s16, s64)
106 // Maximum: sN * k = 128
107 .clampMaxNumElements(0, s8, 16)
108 .clampMaxNumElements(0, s16, 8)
109 .clampMaxNumElements(0, s32, 4)
110 .clampMaxNumElements(0, s64, 2)
111 .clampMaxNumElements(0, p0, 2);
112
113 getActionDefinitionsBuilder(G_BSWAP)
114 .legalFor({s32, s64, v4s32, v2s32, v2s64})
115 .widenScalarToNextPow2(0)
116 .clampScalar(0, s32, s64);
117
118 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
119 .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
120 .scalarizeIf(
121 [=](const LegalityQuery &Query) {
122 return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
123 },
124 0)
125 .legalFor({v2s64})
126 .widenScalarToNextPow2(0)
127 .clampScalar(0, s32, s64)
128 .clampNumElements(0, v2s32, v4s32)
129 .clampNumElements(0, v2s64, v2s64)
130 .moreElementsToNextPow2(0);
131
132 getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
133 .customIf([=](const LegalityQuery &Query) {
134 const auto &SrcTy = Query.Types[0];
135 const auto &AmtTy = Query.Types[1];
136 return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
137 AmtTy.getSizeInBits() == 32;
138 })
139 .legalFor({
140 {s32, s32},
141 {s32, s64},
142 {s64, s64},
143 {v8s8, v8s8},
144 {v16s8, v16s8},
145 {v4s16, v4s16},
146 {v8s16, v8s16},
147 {v2s32, v2s32},
148 {v4s32, v4s32},
149 {v2s64, v2s64},
150 })
151 .widenScalarToNextPow2(0)
152 .clampScalar(1, s32, s64)
153 .clampScalar(0, s32, s64)
154 .clampNumElements(0, v2s32, v4s32)
155 .clampNumElements(0, v2s64, v2s64)
156 .moreElementsToNextPow2(0)
157 .minScalarSameAs(1, 0);
158
159 getActionDefinitionsBuilder(G_PTR_ADD)
160 .legalFor({{p0, s64}, {v2p0, v2s64}})
161 .clampScalar(1, s64, s64);
162
163 getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
164
165 getActionDefinitionsBuilder({G_SDIV, G_UDIV})
166 .legalFor({s32, s64})
167 .libcallFor({s128})
168 .clampScalar(0, s32, s64)
169 .widenScalarToNextPow2(0)
170 .scalarize(0);
171
172 getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
173 .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
174 .widenScalarOrEltToNextPow2(0)
175 .clampScalarOrElt(0, s32, s64)
176 .clampNumElements(0, v2s32, v4s32)
177 .clampNumElements(0, v2s64, v2s64)
178 .moreElementsToNextPow2(0);
179
180
181 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
182 .widenScalarToNextPow2(0, /*Min = */ 32)
183 .clampScalar(0, s32, s64)
184 .lower();
185
186 getActionDefinitionsBuilder({G_SMULH, G_UMULH})
187 .legalFor({s64, v8s16, v16s8, v4s32})
188 .lower();
189
190 auto &MinMaxActions = getActionDefinitionsBuilder(
191 {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
192 if (HasCSSC)
193 MinMaxActions
194 .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
195         // Clamping is made conditional on the CSSC extension: without legal
196         // types we lower to CMP, which can fold one of the two sxtb's we'd
197         // otherwise need if we detect a type smaller than 32 bits.
198 .minScalar(0, s32);
199 else
200 MinMaxActions
201 .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
202 MinMaxActions
203 .clampNumElements(0, v8s8, v16s8)
204 .clampNumElements(0, v4s16, v8s16)
205 .clampNumElements(0, v2s32, v4s32)
206       // FIXME: This shouldn't be needed as v2s64 types are going to
207       // be expanded anyway, but G_ICMP doesn't support splitting vectors yet.
208 .clampNumElements(0, v2s64, v2s64)
209 .lower();
210
211 getActionDefinitionsBuilder(
212 {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
213 .legalFor({{s32, s32}, {s64, s32}})
214 .clampScalar(0, s32, s64)
215 .clampScalar(1, s32, s64)
216 .widenScalarToNextPow2(0);
217
218 getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
219 .legalFor({MinFPScalar, s32, s64, v2s64, v4s32, v2s32})
220 .clampScalar(0, MinFPScalar, s64)
221 .clampNumElements(0, v2s32, v4s32)
222 .clampNumElements(0, v2s64, v2s64);
223
224 getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});
225
226 getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
227 G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
228 G_FNEARBYINT, G_INTRINSIC_LRINT})
229 // If we don't have full FP16 support, then scalarize the elements of
230 // vectors containing fp16 types.
231 .fewerElementsIf(
232 [=, &ST](const LegalityQuery &Query) {
233 const auto &Ty = Query.Types[0];
234 return Ty.isVector() && Ty.getElementType() == s16 &&
235 !ST.hasFullFP16();
236 },
237 [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
238 // If we don't have full FP16 support, then widen s16 to s32 if we
239 // encounter it.
240 .widenScalarIf(
241 [=, &ST](const LegalityQuery &Query) {
242 return Query.Types[0] == s16 && !ST.hasFullFP16();
243 },
244 [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
245 .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});
246
247 getActionDefinitionsBuilder(
248 {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW})
249 // We need a call for these, so we always need to scalarize.
250 .scalarize(0)
251 // Regardless of FP16 support, widen 16-bit elements to 32-bits.
252 .minScalar(0, s32)
253 .libcallFor({s32, s64, v2s32, v4s32, v2s64});
254
255 getActionDefinitionsBuilder(G_INSERT)
256 .legalIf(all(typeInSet(0, {s32, s64, p0}),
257 typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
258 .widenScalarToNextPow2(0)
259 .clampScalar(0, s32, s64)
260 .widenScalarToNextPow2(1)
261 .minScalar(1, s8)
262 .maxScalarIf(typeInSet(0, {s32}), 1, s16)
263 .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
264
265 getActionDefinitionsBuilder(G_EXTRACT)
266 .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
267 typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
268 .widenScalarToNextPow2(1)
269 .clampScalar(1, s32, s128)
270 .widenScalarToNextPow2(0)
271 .minScalar(0, s16)
272 .maxScalarIf(typeInSet(1, {s32}), 0, s16)
273 .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
274 .maxScalarIf(typeInSet(1, {s128}), 0, s64);
275
276
277 for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
278 auto &Actions = getActionDefinitionsBuilder(Op);
279
280 if (Op == G_SEXTLOAD)
281 Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
282
283 // Atomics have zero extending behavior.
284 Actions
285 .legalForTypesWithMemDesc({{s32, p0, s8, 8},
286 {s32, p0, s16, 8},
287 {s32, p0, s32, 8},
288 {s64, p0, s8, 2},
289 {s64, p0, s16, 2},
290 {s64, p0, s32, 4},
291 {s64, p0, s64, 8},
292 {p0, p0, s64, 8},
293 {v2s32, p0, s64, 8}})
294 .widenScalarToNextPow2(0)
295 .clampScalar(0, s32, s64)
296 // TODO: We could support sum-of-pow2's but the lowering code doesn't know
297 // how to do that yet.
298 .unsupportedIfMemSizeNotPow2()
299 // Lower anything left over into G_*EXT and G_LOAD
300 .lower();
301 }
302
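  // Predicate: true for vector value types whose elements are pointers in
  // address space 0; these get custom bitcast-based load/store legalization.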
303 auto IsPtrVecPred = [=](const LegalityQuery &Query) {
304 const LLT &ValTy = Query.Types[0];
305 if (!ValTy.isVector())
306 return false;
307 const LLT EltTy = ValTy.getElementType();
308 return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
309 };
310
311 getActionDefinitionsBuilder(G_LOAD)
312 .customIf([=](const LegalityQuery &Query) {
313 return Query.Types[0] == s128 &&
314 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
315 })
316 .legalForTypesWithMemDesc({{s8, p0, s8, 8},
317 {s16, p0, s16, 8},
318 {s32, p0, s32, 8},
319 {s64, p0, s64, 8},
320 {p0, p0, s64, 8},
321 {s128, p0, s128, 8},
322 {v8s8, p0, s64, 8},
323 {v16s8, p0, s128, 8},
324 {v4s16, p0, s64, 8},
325 {v8s16, p0, s128, 8},
326 {v2s32, p0, s64, 8},
327 {v4s32, p0, s128, 8},
328 {v2s64, p0, s128, 8}})
329 // These extends are also legal
330 .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
331 .widenScalarToNextPow2(0, /* MinSize = */8)
332 .lowerIfMemSizeNotByteSizePow2()
333 .clampScalar(0, s8, s64)
334 .narrowScalarIf([=](const LegalityQuery &Query) {
335 // Clamp extending load results to 32-bits.
336 return Query.Types[0].isScalar() &&
337 Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
338 Query.Types[0].getSizeInBits() > 32;
339 },
340 changeTo(0, s32))
341 .clampMaxNumElements(0, s8, 16)
342 .clampMaxNumElements(0, s16, 8)
343 .clampMaxNumElements(0, s32, 4)
344 .clampMaxNumElements(0, s64, 2)
345 .clampMaxNumElements(0, p0, 2)
346 .customIf(IsPtrVecPred)
347 .scalarizeIf(typeIs(0, v2s16), 0);
348
349 getActionDefinitionsBuilder(G_STORE)
350 .customIf([=](const LegalityQuery &Query) {
351 return Query.Types[0] == s128 &&
352 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
353 })
354 .legalForTypesWithMemDesc({{s8, p0, s8, 8},
355 {s16, p0, s8, 8}, // truncstorei8 from s16
356 {s32, p0, s8, 8}, // truncstorei8 from s32
357 {s64, p0, s8, 8}, // truncstorei8 from s64
358 {s16, p0, s16, 8},
359 {s32, p0, s16, 8}, // truncstorei16 from s32
360 {s64, p0, s16, 8}, // truncstorei16 from s64
361 {s32, p0, s8, 8},
362 {s32, p0, s16, 8},
363 {s32, p0, s32, 8},
364 {s64, p0, s64, 8},
365 {s64, p0, s32, 8}, // truncstorei32 from s64
366 {p0, p0, s64, 8},
367 {s128, p0, s128, 8},
368 {v16s8, p0, s128, 8},
369 {v8s8, p0, s64, 8},
370 {v4s16, p0, s64, 8},
371 {v8s16, p0, s128, 8},
372 {v2s32, p0, s64, 8},
373 {v4s32, p0, s128, 8},
374 {v2s64, p0, s128, 8}})
375 .clampScalar(0, s8, s64)
376 .lowerIf([=](const LegalityQuery &Query) {
377 return Query.Types[0].isScalar() &&
378 Query.Types[0] != Query.MMODescrs[0].MemoryTy;
379 })
380 // Maximum: sN * k = 128
381 .clampMaxNumElements(0, s8, 16)
382 .clampMaxNumElements(0, s16, 8)
383 .clampMaxNumElements(0, s32, 4)
384 .clampMaxNumElements(0, s64, 2)
385 .clampMaxNumElements(0, p0, 2)
386 .lowerIfMemSizeNotPow2()
387 .customIf(IsPtrVecPred)
388 .scalarizeIf(typeIs(0, v2s16), 0);
389
390 // Constants
391 getActionDefinitionsBuilder(G_CONSTANT)
392 .legalFor({p0, s8, s16, s32, s64})
393 .widenScalarToNextPow2(0)
394 .clampScalar(0, s8, s64);
395 getActionDefinitionsBuilder(G_FCONSTANT)
396 .legalIf([=](const LegalityQuery &Query) {
397 const auto &Ty = Query.Types[0];
398 if (HasFP16 && Ty == s16)
399 return true;
400 return Ty == s32 || Ty == s64 || Ty == s128;
401 })
402 .clampScalar(0, MinFPScalar, s128);
403
404 getActionDefinitionsBuilder({G_ICMP, G_FCMP})
405 .legalFor({{s32, s32},
406 {s32, s64},
407 {s32, p0},
408 {v4s32, v4s32},
409 {v2s32, v2s32},
410 {v2s64, v2s64},
411 {v2s64, v2p0},
412 {v4s16, v4s16},
413 {v8s16, v8s16},
414 {v8s8, v8s8},
415 {v16s8, v16s8}})
416 .widenScalarOrEltToNextPow2(1)
417 .clampScalar(1, s32, s64)
418 .clampScalar(0, s32, s32)
419 .minScalarEltSameAsIf(
420 [=](const LegalityQuery &Query) {
421 const LLT &Ty = Query.Types[0];
422 const LLT &SrcTy = Query.Types[1];
423 return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
424 Ty.getElementType() != SrcTy.getElementType();
425 },
426 0, 1)
427 .minScalarOrEltIf(
428 [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
429 1, s32)
430 .minScalarOrEltIf(
431 [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
432 s64)
433 .clampNumElements(0, v2s32, v4s32);
434
435 // Extensions
436 auto ExtLegalFunc = [=](const LegalityQuery &Query) {
437 unsigned DstSize = Query.Types[0].getSizeInBits();
438
439 if (DstSize == 128 && !Query.Types[0].isVector())
440 return false; // Extending to a scalar s128 needs narrowing.
441
442 // Make sure that we have something that will fit in a register, and
443 // make sure it's a power of 2.
444 if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
445 return false;
446
447 const LLT &SrcTy = Query.Types[1];
448
449     // Make sure the source also fits in a register. Don't bother checking that
450 // the source type is below 128 bits. We shouldn't be allowing anything
451 // through which is wider than the destination in the first place.
452 unsigned SrcSize = SrcTy.getSizeInBits();
453 if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
454 return false;
455
456 return true;
457 };
458 getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
459 .legalIf(ExtLegalFunc)
460 .clampScalar(0, s64, s64); // Just for s128, others are handled above.
461
462 getActionDefinitionsBuilder(G_TRUNC)
463 .minScalarOrEltIf(
464 [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
465 0, s8)
466 .customIf([=](const LegalityQuery &Query) {
467 LLT DstTy = Query.Types[0];
468 LLT SrcTy = Query.Types[1];
469 return DstTy == v8s8 && SrcTy.getSizeInBits() > 128;
470 })
471 .alwaysLegal();
472
473 getActionDefinitionsBuilder(G_SEXT_INREG)
474 .legalFor({s32, s64})
475 .legalFor(PackedVectorAllTypeList)
476 .lower();
477
478 // FP conversions
479 getActionDefinitionsBuilder(G_FPTRUNC)
480 .legalFor(
481 {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
482 .clampMaxNumElements(0, s32, 2);
483 getActionDefinitionsBuilder(G_FPEXT)
484 .legalFor(
485 {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
486 .clampMaxNumElements(0, s64, 2);
487
488 // Conversions
489 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
490 .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
491 .widenScalarToNextPow2(0)
492 .clampScalar(0, s32, s64)
493 .widenScalarToNextPow2(1)
494 .clampScalar(1, s32, s64);
495
496 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
497 .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
498 .clampScalar(1, s32, s64)
499 .minScalarSameAs(1, 0)
500 .clampScalar(0, s32, s64)
501 .widenScalarToNextPow2(0);
502
503 // Control-flow
504 getActionDefinitionsBuilder(G_BRCOND)
505 .legalFor({s32})
506 .clampScalar(0, s32, s32);
507 getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
508
509 getActionDefinitionsBuilder(G_SELECT)
510 .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
511 .widenScalarToNextPow2(0)
512 .clampScalar(0, s32, s64)
513 .clampScalar(1, s32, s32)
514 .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
515 .lowerIf(isVector(0));
516
517 // Pointer-handling
518 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
519
520 if (TM.getCodeModel() == CodeModel::Small)
521 getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
522 else
523 getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
524
525 getActionDefinitionsBuilder(G_PTRTOINT)
526 .legalFor({{s64, p0}, {v2s64, v2p0}})
527 .widenScalarToNextPow2(0, 64)
528 .clampScalar(0, s64, s64);
529
530 getActionDefinitionsBuilder(G_INTTOPTR)
531 .unsupportedIf([&](const LegalityQuery &Query) {
532 return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
533 })
534 .legalFor({{p0, s64}, {v2p0, v2s64}});
535
536   // Casts for 32 and 64-bit width types are just copies.
537   // Same for 128-bit width types, except they are on the FPR bank.
538 getActionDefinitionsBuilder(G_BITCAST)
539 // FIXME: This is wrong since G_BITCAST is not allowed to change the
540 // number of bits but it's what the previous code described and fixing
541 // it breaks tests.
542 .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
543 v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
544 v2p0});
545
546 getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
547
548 // va_list must be a pointer, but most sized types are pretty easy to handle
549 // as the destination.
550 getActionDefinitionsBuilder(G_VAARG)
551 .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
552 .clampScalar(0, s8, s64)
553 .widenScalarToNextPow2(0, /*Min*/ 8);
554
555 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
556 .lowerIf(
557 all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
558
559 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
560 .customIf([](const LegalityQuery &Query) {
561 return Query.Types[0].getSizeInBits() == 128;
562 })
563 .clampScalar(0, s32, s64)
564 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
565
566 getActionDefinitionsBuilder(
567 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
568 G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
569 G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
570 .clampScalar(0, s32, s64)
571 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
572
573 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
574
575 // Merge/Unmerge
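  // For G_MERGE_VALUES the wide type is the result (index 0); for
  // G_UNMERGE_VALUES it is the source operand (index 1).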
576 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
577 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
578 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
579 getActionDefinitionsBuilder(Op)
580 .widenScalarToNextPow2(LitTyIdx, 8)
581 .widenScalarToNextPow2(BigTyIdx, 32)
582 .clampScalar(LitTyIdx, s8, s64)
583 .clampScalar(BigTyIdx, s32, s128)
584 .legalIf([=](const LegalityQuery &Q) {
585 switch (Q.Types[BigTyIdx].getSizeInBits()) {
586 case 32:
587 case 64:
588 case 128:
589 break;
590 default:
591 return false;
592 }
593 switch (Q.Types[LitTyIdx].getSizeInBits()) {
594 case 8:
595 case 16:
596 case 32:
597 case 64:
598 return true;
599 default:
600 return false;
601 }
602 });
603 }
604
605 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
606 .unsupportedIf([=](const LegalityQuery &Query) {
607 const LLT &EltTy = Query.Types[1].getElementType();
608 return Query.Types[0] != EltTy;
609 })
610 .minScalar(2, s64)
611 .legalIf([=](const LegalityQuery &Query) {
612 const LLT &VecTy = Query.Types[1];
613 return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
614 VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
615 VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s32 ||
616 VecTy == v2p0;
617 })
618 .minScalarOrEltIf(
619 [=](const LegalityQuery &Query) {
620             // We want to promote <M x s1> to <M x s64> if that wouldn't
621             // cause the total vector size to exceed 128 bits.
622 return Query.Types[1].getNumElements() <= 2;
623 },
624 0, s64)
625 .minScalarOrEltIf(
626 [=](const LegalityQuery &Query) {
627 return Query.Types[1].getNumElements() <= 4;
628 },
629 0, s32)
630 .minScalarOrEltIf(
631 [=](const LegalityQuery &Query) {
632 return Query.Types[1].getNumElements() <= 8;
633 },
634 0, s16)
635 .minScalarOrEltIf(
636 [=](const LegalityQuery &Query) {
637 return Query.Types[1].getNumElements() <= 16;
638 },
639 0, s8)
640 .minScalarOrElt(0, s8) // Worst case, we need at least s8.
641 .clampMaxNumElements(1, s64, 2)
642 .clampMaxNumElements(1, s32, 4)
643 .clampMaxNumElements(1, s16, 8)
644 .clampMaxNumElements(1, p0, 2);
645
646 getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
647 .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64}));
648
649 getActionDefinitionsBuilder(G_BUILD_VECTOR)
650 .legalFor({{v8s8, s8},
651 {v16s8, s8},
652 {v2s16, s16},
653 {v4s16, s16},
654 {v8s16, s16},
655 {v2s32, s32},
656 {v4s32, s32},
657 {v2p0, p0},
658 {v2s64, s64}})
659 .clampNumElements(0, v4s32, v4s32)
660 .clampNumElements(0, v2s64, v2s64)
661 .minScalarOrElt(0, s8)
662 .minScalarSameAs(1, 0);
663
664 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
665
666 getActionDefinitionsBuilder(G_CTLZ)
667 .legalForCartesianProduct(
668 {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
669 .scalarize(1);
670 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
671
672 // TODO: Custom lowering for v2s32, v4s32, v2s64.
673 getActionDefinitionsBuilder(G_BITREVERSE)
674 .legalFor({s32, s64, v8s8, v16s8})
675 .widenScalarToNextPow2(0, /*Min = */ 32)
676 .clampScalar(0, s32, s64);
677
678 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
679
680 getActionDefinitionsBuilder(G_CTTZ)
681 .lowerIf(isVector(0))
682 .clampScalar(0, s32, s64)
683 .scalarSameSizeAs(1, 0)
684 .legalIf([=](const LegalityQuery &Query) {
685 return (HasCSSC && typeInSet(0, {s32, s64})(Query));
686 })
687 .customIf([=](const LegalityQuery &Query) {
688 return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
689 });
690
691 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
692 .legalIf([=](const LegalityQuery &Query) {
693 const LLT &DstTy = Query.Types[0];
694 const LLT &SrcTy = Query.Types[1];
695 // For now just support the TBL2 variant which needs the source vectors
696 // to be the same size as the dest.
697 if (DstTy != SrcTy)
698 return false;
699 return llvm::is_contained({v2s32, v4s32, v2s64, v2p0, v16s8, v8s16},
700 DstTy);
701 })
702       // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors); we
703       // just want those lowered into G_BUILD_VECTOR.
704 .lowerIf([=](const LegalityQuery &Query) {
705 return !Query.Types[1].isVector();
706 })
707 .moreElementsIf(
708 [](const LegalityQuery &Query) {
709 return Query.Types[0].isVector() && Query.Types[1].isVector() &&
710 Query.Types[0].getNumElements() >
711 Query.Types[1].getNumElements();
712 },
713 changeTo(1, 0))
714 .moreElementsToNextPow2(0)
715 .clampNumElements(0, v4s32, v4s32)
716 .clampNumElements(0, v2s64, v2s64);
717
718 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
719 .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});
720
721 getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}});
722
723 getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) {
724 return Query.Types[0] == p0 && Query.Types[1] == s64;
725 });
726
727 getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
728
729 if (ST.hasMOPS()) {
730 // G_BZERO is not supported. Currently it is only emitted by
731 // PreLegalizerCombiner for G_MEMSET with zero constant.
732 getActionDefinitionsBuilder(G_BZERO).unsupported();
733
734 getActionDefinitionsBuilder(G_MEMSET)
735 .legalForCartesianProduct({p0}, {s64}, {s64})
736 .customForCartesianProduct({p0}, {s8}, {s64})
737 .immIdx(0); // Inform verifier imm idx 0 is handled.
738
739 getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
740 .legalForCartesianProduct({p0}, {p0}, {s64})
741 .immIdx(0); // Inform verifier imm idx 0 is handled.
742
743 // G_MEMCPY_INLINE does not have a tailcall immediate
744 getActionDefinitionsBuilder(G_MEMCPY_INLINE)
745 .legalForCartesianProduct({p0}, {p0}, {s64});
746
747 } else {
748 getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
749 .libcall();
750 }
751
752 // FIXME: Legal vector types are only legal with NEON.
753 auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
754 if (HasCSSC)
755 ABSActions
756 .legalFor({s32, s64});
757 ABSActions
758 .legalFor(PackedVectorAllTypeList)
759 .lowerIf(isScalar(0));
760
761 getActionDefinitionsBuilder(G_VECREDUCE_FADD)
762 // We only have FADDP to do reduction-like operations. Lower the rest.
763 .legalFor({{s32, v2s32}, {s64, v2s64}})
764 .clampMaxNumElements(1, s64, 2)
765 .clampMaxNumElements(1, s32, 2)
766 .lower();
767
768 getActionDefinitionsBuilder(G_VECREDUCE_ADD)
769 .legalFor(
770 {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
771 .clampMaxNumElements(1, s64, 2)
772 .clampMaxNumElements(1, s32, 4)
773 .lower();
774
775 getActionDefinitionsBuilder(
776 {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
777 // Try to break down into smaller vectors as long as they're at least 64
778 // bits. This lets us use vector operations for some parts of the
779 // reduction.
780 .fewerElementsIf(
781 [=](const LegalityQuery &Q) {
782 LLT SrcTy = Q.Types[1];
783 if (SrcTy.isScalar())
784 return false;
785 if (!isPowerOf2_32(SrcTy.getNumElements()))
786 return false;
787 // We can usually perform 64b vector operations.
788 return SrcTy.getSizeInBits() > 64;
789 },
790 [=](const LegalityQuery &Q) {
791 LLT SrcTy = Q.Types[1];
792 return std::make_pair(1, SrcTy.divide(2));
793 })
794 .scalarize(1)
795 .lower();
796
797 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
798 .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
799
800 getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();
801
802 getActionDefinitionsBuilder(G_ROTR)
803 .legalFor({{s32, s64}, {s64, s64}})
804 .customIf([=](const LegalityQuery &Q) {
805 return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
806 })
807 .lower();
808 getActionDefinitionsBuilder(G_ROTL).lower();
809
810 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
811 .customFor({{s32, s32}, {s64, s64}});
812
813 auto always = [=](const LegalityQuery &Q) { return true; };
814 auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
815 if (HasCSSC)
816 CTPOPActions
817 .legalFor({{s32, s32},
818 {s64, s64},
819 {v8s8, v8s8},
820 {v16s8, v16s8}})
821 .customFor({{s128, s128},
822 {v2s64, v2s64},
823 {v2s32, v2s32},
824 {v4s32, v4s32},
825 {v4s16, v4s16},
826 {v8s16, v8s16}});
827 else
828 CTPOPActions
829 .legalFor({{v8s8, v8s8},
830 {v16s8, v16s8}})
831 .customFor({{s32, s32},
832 {s64, s64},
833 {s128, s128},
834 {v2s64, v2s64},
835 {v2s32, v2s32},
836 {v4s32, v4s32},
837 {v4s16, v4s16},
838 {v8s16, v8s16}});
839 CTPOPActions
840 .clampScalar(0, s32, s128)
841 .widenScalarToNextPow2(0)
842 .minScalarEltSameAsIf(always, 1, 0)
843 .maxScalarEltSameAsIf(always, 1, 0);
844
845 // TODO: Vector types.
846 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
847
848 // TODO: Vector types.
849 getActionDefinitionsBuilder({G_FMAXNUM, G_FMINNUM})
850 .legalFor({MinFPScalar, s32, s64})
851 .libcallFor({s128})
852 .minScalar(0, MinFPScalar);
853
854 getActionDefinitionsBuilder({G_FMAXIMUM, G_FMINIMUM})
855 .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
856 .legalIf([=](const LegalityQuery &Query) {
857 const auto &Ty = Query.Types[0];
858 return (Ty == v8s16 || Ty == v4s16) && HasFP16;
859 })
860 .minScalar(0, MinFPScalar)
861 .clampNumElements(0, v4s16, v8s16)
862 .clampNumElements(0, v2s32, v4s32)
863 .clampNumElements(0, v2s64, v2s64);
864
865 // TODO: Libcall support for s128.
866 // TODO: s16 should be legal with full FP16 support.
867 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
868 .legalFor({{s64, s32}, {s64, s64}});
869
870 // TODO: Custom legalization for vector types.
871 // TODO: Custom legalization for mismatched types.
872 // TODO: s16 support.
873 getActionDefinitionsBuilder(G_FCOPYSIGN).customFor({{s32, s32}, {s64, s64}});
874
875 getActionDefinitionsBuilder(G_FMAD).lower();
876
877 getLegacyLegalizerInfo().computeTables();
878 verify(*ST.getInstrInfo());
879 }
880
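// Dispatch custom legalization to the per-opcode helpers below; returning false
// signals a legalization failure to the LegalizerHelper.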
881 bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
882 MachineInstr &MI) const {
883 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
884 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
885 GISelChangeObserver &Observer = Helper.Observer;
886 switch (MI.getOpcode()) {
887 default:
888 // No idea what to do.
889 return false;
890 case TargetOpcode::G_VAARG:
891 return legalizeVaArg(MI, MRI, MIRBuilder);
892 case TargetOpcode::G_LOAD:
893 case TargetOpcode::G_STORE:
894 return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
895 case TargetOpcode::G_SHL:
896 case TargetOpcode::G_ASHR:
897 case TargetOpcode::G_LSHR:
898 return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
899 case TargetOpcode::G_GLOBAL_VALUE:
900 return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
901 case TargetOpcode::G_TRUNC:
902 return legalizeVectorTrunc(MI, Helper);
903 case TargetOpcode::G_SBFX:
904 case TargetOpcode::G_UBFX:
905 return legalizeBitfieldExtract(MI, MRI, Helper);
906 case TargetOpcode::G_ROTR:
907 return legalizeRotate(MI, MRI, Helper);
908 case TargetOpcode::G_CTPOP:
909 return legalizeCTPOP(MI, MRI, Helper);
910 case TargetOpcode::G_ATOMIC_CMPXCHG:
911 return legalizeAtomicCmpxchg128(MI, MRI, Helper);
912 case TargetOpcode::G_CTTZ:
913 return legalizeCTTZ(MI, Helper);
914 case TargetOpcode::G_BZERO:
915 case TargetOpcode::G_MEMCPY:
916 case TargetOpcode::G_MEMMOVE:
917 case TargetOpcode::G_MEMSET:
918 return legalizeMemOps(MI, Helper);
919 case TargetOpcode::G_FCOPYSIGN:
920 return legalizeFCopySign(MI, Helper);
921 }
922
923 llvm_unreachable("expected switch to return");
924 }
925
926 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
927 MachineRegisterInfo &MRI,
928 LegalizerHelper &Helper) const {
929 // To allow for imported patterns to match, we ensure that the rotate amount
930 // is 64b with an extension.
931 Register AmtReg = MI.getOperand(2).getReg();
932 LLT AmtTy = MRI.getType(AmtReg);
933 (void)AmtTy;
934 assert(AmtTy.isScalar() && "Expected a scalar rotate");
935 assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
936 auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
937 Helper.Observer.changingInstr(MI);
938 MI.getOperand(2).setReg(NewAmt.getReg(0));
939 Helper.Observer.changedInstr(MI);
940 return true;
941 }
942
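// Helper: split Reg into NumParts registers of type Ty via a single
// G_UNMERGE_VALUES, appending the new registers to VRegs.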
943 static void extractParts(Register Reg, MachineRegisterInfo &MRI,
944 MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
945 SmallVectorImpl<Register> &VRegs) {
946 for (int I = 0; I < NumParts; ++I)
947 VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
948 MIRBuilder.buildUnmerge(VRegs, Reg);
949 }
950
951 bool AArch64LegalizerInfo::legalizeVectorTrunc(
952 MachineInstr &MI, LegalizerHelper &Helper) const {
953 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
954 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
955   // Similar to how operand splitting is done in SelectionDAG, we can handle
956 // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
957 // %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
958 // %lo16(<4 x s16>) = G_TRUNC %inlo
959 // %hi16(<4 x s16>) = G_TRUNC %inhi
960 // %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
961 // %res(<8 x s8>) = G_TRUNC %in16
962
963 Register DstReg = MI.getOperand(0).getReg();
964 Register SrcReg = MI.getOperand(1).getReg();
965 LLT DstTy = MRI.getType(DstReg);
966 LLT SrcTy = MRI.getType(SrcReg);
967 assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
968 isPowerOf2_32(SrcTy.getSizeInBits()));
969
970 // Split input type.
971 LLT SplitSrcTy =
972 SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
973 // First, split the source into two smaller vectors.
974 SmallVector<Register, 2> SplitSrcs;
975 extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);
976
977 // Truncate the splits into intermediate narrower elements.
978 LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
979 for (unsigned I = 0; I < SplitSrcs.size(); ++I)
980 SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
981
982 auto Concat = MIRBuilder.buildConcatVectors(
983 DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);
984
985 Helper.Observer.changingInstr(MI);
986 MI.getOperand(1).setReg(Concat.getReg(0));
987 Helper.Observer.changedInstr(MI);
988 return true;
989 }
990
991 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
992 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
993 GISelChangeObserver &Observer) const {
994 assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
995 // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
996 // G_ADD_LOW instructions.
997 // By splitting this here, we can optimize accesses in the small code model by
998   // folding the G_ADD_LOW into the load/store offset.
999 auto &GlobalOp = MI.getOperand(1);
1000 const auto* GV = GlobalOp.getGlobal();
1001 if (GV->isThreadLocal())
1002 return true; // Don't want to modify TLS vars.
1003
1004 auto &TM = ST->getTargetLowering()->getTargetMachine();
1005 unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1006
1007 if (OpFlags & AArch64II::MO_GOT)
1008 return true;
1009
1010 auto Offset = GlobalOp.getOffset();
1011 Register DstReg = MI.getOperand(0).getReg();
1012 auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
1013 .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
1014 // Set the regclass on the dest reg too.
1015 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1016
1017 // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1018 // by creating a MOVK that sets bits 48-63 of the register to (global address
1019 // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1020   // prevent an incorrect tag being generated during relocation when the
1021 // global appears before the code section. Without the offset, a global at
1022 // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1023 // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1024 // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1025 // instead of `0xf`.
1026 // This assumes that we're in the small code model so we can assume a binary
1027 // size of <= 4GB, which makes the untagged PC relative offset positive. The
1028 // binary must also be loaded into address range [0, 2^48). Both of these
1029 // properties need to be ensured at runtime when using tagged addresses.
1030 if (OpFlags & AArch64II::MO_TAGGED) {
1031 assert(!Offset &&
1032 "Should not have folded in an offset for a tagged global!");
1033 ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
1034 .addGlobalAddress(GV, 0x100000000,
1035 AArch64II::MO_PREL | AArch64II::MO_G3)
1036 .addImm(48);
1037 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1038 }
1039
1040 MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
1041 .addGlobalAddress(GV, Offset,
1042 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1043 MI.eraseFromParent();
1044 return true;
1045 }
1046
1047 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1048 MachineInstr &MI) const {
1049 switch (MI.getIntrinsicID()) {
1050 case Intrinsic::vacopy: {
1051 unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
1052 unsigned VaListSize =
1053 (ST->isTargetDarwin() || ST->isTargetWindows())
1054 ? PtrSize
1055 : ST->isTargetILP32() ? 20 : 32;
1056
1057 MachineFunction &MF = *MI.getMF();
1058 auto Val = MF.getRegInfo().createGenericVirtualRegister(
1059 LLT::scalar(VaListSize * 8));
1060 MachineIRBuilder MIB(MI);
1061 MIB.buildLoad(Val, MI.getOperand(2),
1062 *MF.getMachineMemOperand(MachinePointerInfo(),
1063 MachineMemOperand::MOLoad,
1064 VaListSize, Align(PtrSize)));
1065 MIB.buildStore(Val, MI.getOperand(1),
1066 *MF.getMachineMemOperand(MachinePointerInfo(),
1067 MachineMemOperand::MOStore,
1068 VaListSize, Align(PtrSize)));
1069 MI.eraseFromParent();
1070 return true;
1071 }
1072 case Intrinsic::get_dynamic_area_offset: {
1073 MachineIRBuilder &MIB = Helper.MIRBuilder;
1074 MIB.buildConstant(MI.getOperand(0).getReg(), 0);
1075 MI.eraseFromParent();
1076 return true;
1077 }
1078 case Intrinsic::aarch64_mops_memset_tag: {
1079 assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1080     // Any-extend the value to 64 bits.
1081 MachineIRBuilder MIB(MI);
1082 auto &Value = MI.getOperand(3);
1083 Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1084 Value.setReg(ZExtValueReg);
1085 return true;
1086 }
1087 case Intrinsic::prefetch: {
1088 MachineIRBuilder MIB(MI);
1089 auto &AddrVal = MI.getOperand(1);
1090
1091 int64_t IsWrite = MI.getOperand(2).getImm();
1092 int64_t Locality = MI.getOperand(3).getImm();
1093 int64_t IsData = MI.getOperand(4).getImm();
1094
1095 bool IsStream = Locality == 0;
1096 if (Locality != 0) {
1097 assert(Locality <= 3 && "Prefetch locality out-of-range");
1098 // The locality degree is the opposite of the cache speed.
1099 // Put the number the other way around.
1100 // The encoding starts at 0 for level 1
1101 Locality = 3 - Locality;
1102 }
1103
1104 unsigned PrfOp =
1105 (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
1106
1107 MIB.buildInstr(AArch64::G_PREFETCH).addImm(PrfOp).add(AddrVal);
1108 MI.eraseFromParent();
1109 return true;
1110 }
1111 case Intrinsic::aarch64_prefetch: {
1112 MachineIRBuilder MIB(MI);
1113 auto &AddrVal = MI.getOperand(1);
1114
1115 int64_t IsWrite = MI.getOperand(2).getImm();
1116 int64_t Target = MI.getOperand(3).getImm();
1117 int64_t IsStream = MI.getOperand(4).getImm();
1118 int64_t IsData = MI.getOperand(5).getImm();
1119
1120 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
1121 (!IsData << 3) | // IsDataCache bit
1122 (Target << 1) | // Cache level bits
1123 (unsigned)IsStream; // Stream bit
1124
1125 MIB.buildInstr(AArch64::G_PREFETCH).addImm(PrfOp).add(AddrVal);
1126 MI.eraseFromParent();
1127 return true;
1128 }
1129 }
1130
1131 return true;
1132 }
1133
1134 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
1135 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1136 GISelChangeObserver &Observer) const {
1137 assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
1138 MI.getOpcode() == TargetOpcode::G_LSHR ||
1139 MI.getOpcode() == TargetOpcode::G_SHL);
1140 // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
1141 // imported patterns can select it later. Either way, it will be legal.
1142 Register AmtReg = MI.getOperand(2).getReg();
1143 auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
1144 if (!VRegAndVal)
1145 return true;
1146 // Check the shift amount is in range for an immediate form.
1147 int64_t Amount = VRegAndVal->Value.getSExtValue();
1148 if (Amount > 31)
1149 return true; // This will have to remain a register variant.
1150 auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
1151 Observer.changingInstr(MI);
1152 MI.getOperand(2).setReg(ExtCst.getReg(0));
1153 Observer.changedInstr(MI);
1154 return true;
1155 }
1156
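// Helper: if Root is a G_PTR_ADD of a base and a constant offset that fits the
// LDP/STP addressing mode (a signed 7-bit immediate scaled by 8), split it into
// Base/Offset; otherwise use Root itself with a zero offset.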
1157 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
1158 MachineRegisterInfo &MRI) {
1159 Base = Root;
1160 Offset = 0;
1161
1162 Register NewBase;
1163 int64_t NewOffset;
1164 if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
1165 isShiftedInt<7, 3>(NewOffset)) {
1166 Base = NewBase;
1167 Offset = NewOffset;
1168 }
1169 }
1170
1171 // FIXME: This should be removed and replaced with the generic bitcast legalize
1172 // action.
1173 bool AArch64LegalizerInfo::legalizeLoadStore(
1174 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1175 GISelChangeObserver &Observer) const {
1176 assert(MI.getOpcode() == TargetOpcode::G_STORE ||
1177 MI.getOpcode() == TargetOpcode::G_LOAD);
1178 // Here we just try to handle vector loads/stores where our value type might
1179 // have pointer elements, which the SelectionDAG importer can't handle. To
1180 // allow the existing patterns for s64 to fire for p0, we just try to bitcast
1181 // the value to use s64 types.
1182
1183   // Custom legalization requires that the instruction, if not deleted, be fully
1184   // legalized. To allow further legalization of the instruction, we create a new
1185   // instruction and erase the existing one.
1186
1187 Register ValReg = MI.getOperand(0).getReg();
1188 const LLT ValTy = MRI.getType(ValReg);
1189
1190 if (ValTy == LLT::scalar(128)) {
1191 assert((*MI.memoperands_begin())->getSuccessOrdering() ==
1192 AtomicOrdering::Monotonic ||
1193 (*MI.memoperands_begin())->getSuccessOrdering() ==
1194 AtomicOrdering::Unordered);
1195 assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
1196 LLT s64 = LLT::scalar(64);
1197 MachineInstrBuilder NewI;
1198 if (MI.getOpcode() == TargetOpcode::G_LOAD) {
1199 NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
1200 MIRBuilder.buildMergeLikeInstr(
1201 ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
1202 } else {
1203 auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
1204 NewI = MIRBuilder.buildInstr(
1205 AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
1206 }
1207 Register Base;
1208 int Offset;
1209 matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
1210 NewI.addUse(Base);
1211 NewI.addImm(Offset / 8);
1212
1213 NewI.cloneMemRefs(MI);
1214 constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
1215 *MRI.getTargetRegisterInfo(),
1216 *ST->getRegBankInfo());
1217 MI.eraseFromParent();
1218 return true;
1219 }
1220
1221 if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
1222 ValTy.getElementType().getAddressSpace() != 0) {
1223 LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
1224 return false;
1225 }
1226
1227 unsigned PtrSize = ValTy.getElementType().getSizeInBits();
1228 const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
1229 auto &MMO = **MI.memoperands_begin();
1230 MMO.setType(NewTy);
1231
1232 if (MI.getOpcode() == TargetOpcode::G_STORE) {
1233 auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
1234 MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
1235 } else {
1236 auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
1237 MIRBuilder.buildBitcast(ValReg, NewLoad);
1238 }
1239 MI.eraseFromParent();
1240 return true;
1241 }
1242
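// Lower G_VAARG by treating the va_list as a pointer to the next argument slot:
// load the current pointer, realign it if the requested alignment exceeds the
// pointer alignment, load the value, and store the advanced pointer back.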
1243 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1244 MachineRegisterInfo &MRI,
1245 MachineIRBuilder &MIRBuilder) const {
1246 MachineFunction &MF = MIRBuilder.getMF();
1247 Align Alignment(MI.getOperand(2).getImm());
1248 Register Dst = MI.getOperand(0).getReg();
1249 Register ListPtr = MI.getOperand(1).getReg();
1250
1251 LLT PtrTy = MRI.getType(ListPtr);
1252 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1253
1254 const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1255 const Align PtrAlign = Align(PtrSize);
1256 auto List = MIRBuilder.buildLoad(
1257 PtrTy, ListPtr,
1258 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1259 PtrTy, PtrAlign));
1260
1261 MachineInstrBuilder DstPtr;
1262 if (Alignment > PtrAlign) {
1263 // Realign the list to the actual required alignment.
1264 auto AlignMinus1 =
1265 MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1266 auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1267 DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1268 } else
1269 DstPtr = List;
1270
1271 LLT ValTy = MRI.getType(Dst);
1272 uint64_t ValSize = ValTy.getSizeInBits() / 8;
1273 MIRBuilder.buildLoad(
1274 Dst, DstPtr,
1275 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1276 ValTy, std::max(Alignment, PtrAlign)));
1277
1278 auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1279
1280 auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
1281
1282 MIRBuilder.buildStore(NewList, ListPtr,
1283 *MF.getMachineMemOperand(MachinePointerInfo(),
1284 MachineMemOperand::MOStore,
1285 PtrTy, PtrAlign));
1286
1287 MI.eraseFromParent();
1288 return true;
1289 }
1290
1291 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
1292 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1293 // Only legal if we can select immediate forms.
1294 // TODO: Lower this otherwise.
1295 return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
1296 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
1297 }
1298
1299 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
1300 MachineRegisterInfo &MRI,
1301 LegalizerHelper &Helper) const {
1302 // When there is no integer popcount instruction (FEAT_CSSC isn't available),
1303 // it can be more efficiently lowered to the following sequence that uses
1304 // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
1305 // registers are cheap.
1306 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
1307 // CNT V0.8B, V0.8B // 8xbyte pop-counts
1308 // ADDV B0, V0.8B // sum 8xbyte pop-counts
1309 // UMOV X0, V0.B[0] // copy byte result back to integer reg
1310 //
1311 // For 128 bit vector popcounts, we lower to the following sequence:
1312 // cnt.16b v0, v0 // v8s16, v4s32, v2s64
1313 // uaddlp.8h v0, v0 // v8s16, v4s32, v2s64
1314 // uaddlp.4s v0, v0 // v4s32, v2s64
1315 // uaddlp.2d v0, v0 // v2s64
1316 //
1317 // For 64 bit vector popcounts, we lower to the following sequence:
1318 // cnt.8b v0, v0 // v4s16, v2s32
1319 // uaddlp.4h v0, v0 // v4s16, v2s32
1320 // uaddlp.2s v0, v0 // v2s32
1321
1322 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1323 Register Dst = MI.getOperand(0).getReg();
1324 Register Val = MI.getOperand(1).getReg();
1325 LLT Ty = MRI.getType(Val);
1326 unsigned Size = Ty.getSizeInBits();
1327
1328 assert(Ty == MRI.getType(Dst) &&
1329 "Expected src and dst to have the same type!");
1330
1331 if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
1332 LLT s64 = LLT::scalar(64);
1333
1334 auto Split = MIRBuilder.buildUnmerge(s64, Val);
1335 auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
1336 auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
1337 auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);
1338
1339 MIRBuilder.buildZExt(Dst, Add);
1340 MI.eraseFromParent();
1341 return true;
1342 }
1343
1344 if (!ST->hasNEON() ||
1345 MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
1346 // Use generic lowering when custom lowering is not possible.
1347 return Ty.isScalar() && (Size == 32 || Size == 64) &&
1348 Helper.lowerBitCount(MI) ==
1349 LegalizerHelper::LegalizeResult::Legalized;
1350 }
1351
1352 // Pre-conditioning: widen Val up to the nearest vector type.
1353 // s32,s64,v4s16,v2s32 -> v8i8
1354 // v8s16,v4s32,v2s64 -> v16i8
1355 LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
1356 if (Ty.isScalar()) {
1357 assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!");
1358 if (Size == 32) {
1359 Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
1360 }
1361 }
1362 Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
1363
1364 // Count bits in each byte-sized lane.
1365 auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
1366
1367 // Sum across lanes.
1368 Register HSum = CTPOP.getReg(0);
1369 unsigned Opc;
1370 SmallVector<LLT> HAddTys;
1371 if (Ty.isScalar()) {
1372 Opc = Intrinsic::aarch64_neon_uaddlv;
1373 HAddTys.push_back(LLT::scalar(32));
1374 } else if (Ty == LLT::fixed_vector(8, 16)) {
1375 Opc = Intrinsic::aarch64_neon_uaddlp;
1376 HAddTys.push_back(LLT::fixed_vector(8, 16));
1377 } else if (Ty == LLT::fixed_vector(4, 32)) {
1378 Opc = Intrinsic::aarch64_neon_uaddlp;
1379 HAddTys.push_back(LLT::fixed_vector(8, 16));
1380 HAddTys.push_back(LLT::fixed_vector(4, 32));
1381 } else if (Ty == LLT::fixed_vector(2, 64)) {
1382 Opc = Intrinsic::aarch64_neon_uaddlp;
1383 HAddTys.push_back(LLT::fixed_vector(8, 16));
1384 HAddTys.push_back(LLT::fixed_vector(4, 32));
1385 HAddTys.push_back(LLT::fixed_vector(2, 64));
1386 } else if (Ty == LLT::fixed_vector(4, 16)) {
1387 Opc = Intrinsic::aarch64_neon_uaddlp;
1388 HAddTys.push_back(LLT::fixed_vector(4, 16));
1389 } else if (Ty == LLT::fixed_vector(2, 32)) {
1390 Opc = Intrinsic::aarch64_neon_uaddlp;
1391 HAddTys.push_back(LLT::fixed_vector(4, 16));
1392 HAddTys.push_back(LLT::fixed_vector(2, 32));
1393 } else
1394 llvm_unreachable("unexpected vector shape");
1395 MachineInstrBuilder UADD;
1396 for (LLT HTy : HAddTys) {
1397 UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects =*/false)
1398 .addUse(HSum);
1399 HSum = UADD.getReg(0);
1400 }
1401
1402 // Post-conditioning.
1403 if (Ty.isScalar() && (Size == 64 || Size == 128))
1404 MIRBuilder.buildZExt(Dst, UADD);
1405 else
1406 UADD->getOperand(0).setReg(Dst);
1407 MI.eraseFromParent();
1408 return true;
1409 }
1410
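// Expand a 128-bit G_ATOMIC_CMPXCHG into either a CASP instruction (when LSE is
// available) or a CMP_SWAP_128* pseudo, operating on the value as two 64-bit
// halves.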
1411 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
1412 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1413 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1414 LLT s64 = LLT::scalar(64);
1415 auto Addr = MI.getOperand(1).getReg();
1416 auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
1417 auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
1418 auto DstLo = MRI.createGenericVirtualRegister(s64);
1419 auto DstHi = MRI.createGenericVirtualRegister(s64);
1420
1421 MachineInstrBuilder CAS;
1422 if (ST->hasLSE()) {
1423     // We have 128-bit CASP instructions taking XSeqPair registers, which are
1424     // s128. The merge/unmerge brackets the expansion and pairs it up with the
1425     // rest of the MIR, so we must reassemble the extracted registers into a
1426     // 128-bit known-regclass value with code like this:
1427 //
1428 // %in1 = REG_SEQUENCE Lo, Hi ; One for each input
1429 // %out = CASP %in1, ...
1430 // %OldLo = G_EXTRACT %out, 0
1431 // %OldHi = G_EXTRACT %out, 64
1432 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
1433 unsigned Opcode;
1434 switch (Ordering) {
1435 case AtomicOrdering::Acquire:
1436 Opcode = AArch64::CASPAX;
1437 break;
1438 case AtomicOrdering::Release:
1439 Opcode = AArch64::CASPLX;
1440 break;
1441 case AtomicOrdering::AcquireRelease:
1442 case AtomicOrdering::SequentiallyConsistent:
1443 Opcode = AArch64::CASPALX;
1444 break;
1445 default:
1446 Opcode = AArch64::CASPX;
1447 break;
1448 }
1449
1450 LLT s128 = LLT::scalar(128);
1451 auto CASDst = MRI.createGenericVirtualRegister(s128);
1452 auto CASDesired = MRI.createGenericVirtualRegister(s128);
1453 auto CASNew = MRI.createGenericVirtualRegister(s128);
1454 MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
1455 .addUse(DesiredI->getOperand(0).getReg())
1456 .addImm(AArch64::sube64)
1457 .addUse(DesiredI->getOperand(1).getReg())
1458 .addImm(AArch64::subo64);
1459 MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
1460 .addUse(NewI->getOperand(0).getReg())
1461 .addImm(AArch64::sube64)
1462 .addUse(NewI->getOperand(1).getReg())
1463 .addImm(AArch64::subo64);
1464
1465 CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
1466
1467 MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
1468 MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
1469 } else {
1470     // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
1471     // can take arbitrary registers, so it just has the normal GPR64 operands that
1472     // the rest of AArch64 expects.
1473 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
1474 unsigned Opcode;
1475 switch (Ordering) {
1476 case AtomicOrdering::Acquire:
1477 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
1478 break;
1479 case AtomicOrdering::Release:
1480 Opcode = AArch64::CMP_SWAP_128_RELEASE;
1481 break;
1482 case AtomicOrdering::AcquireRelease:
1483 case AtomicOrdering::SequentiallyConsistent:
1484 Opcode = AArch64::CMP_SWAP_128;
1485 break;
1486 default:
1487 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
1488 break;
1489 }
1490
1491 auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1492 CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
1493 {Addr, DesiredI->getOperand(0),
1494 DesiredI->getOperand(1), NewI->getOperand(0),
1495 NewI->getOperand(1)});
1496 }
1497
1498 CAS.cloneMemRefs(MI);
1499 constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
1500 *MRI.getTargetRegisterInfo(),
1501 *ST->getRegBankInfo());
1502
1503 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
1504 MI.eraseFromParent();
1505 return true;
1506 }
1507
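// Without CSSC there is no direct count-trailing-zeros instruction, so compute
// CTTZ(x) as CTLZ(bit-reverse(x)), which selects to RBIT + CLZ.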
1508 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
1509 LegalizerHelper &Helper) const {
1510 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1511 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1512 LLT Ty = MRI.getType(MI.getOperand(1).getReg());
1513 auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
1514 MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
1515 MI.eraseFromParent();
1516 return true;
1517 }
1518
1519 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
1520 LegalizerHelper &Helper) const {
1521 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1522
1523 // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
1524 if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
1525     // Any-extend the value operand to 64 bits.
1526 auto &Value = MI.getOperand(1);
1527 Register ZExtValueReg =
1528 MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1529 Value.setReg(ZExtValueReg);
1530 return true;
1531 }
1532
1533 return false;
1534 }
1535
1536 bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI,
1537 LegalizerHelper &Helper) const {
1538 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1539 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1540 Register Dst = MI.getOperand(0).getReg();
1541 LLT DstTy = MRI.getType(Dst);
1542 assert(DstTy.isScalar() && "Only expected scalars right now!");
1543 const unsigned DstSize = DstTy.getSizeInBits();
1544 assert((DstSize == 32 || DstSize == 64) && "Unexpected dst type!");
1545 assert(MRI.getType(MI.getOperand(2).getReg()) == DstTy &&
1546 "Expected homogeneous types!");
1547
1548 // We want to materialize a mask with the high bit set.
1549 uint64_t EltMask;
1550 LLT VecTy;
1551
1552 // TODO: s16 support.
1553 switch (DstSize) {
1554 default:
1555 llvm_unreachable("Unexpected type for G_FCOPYSIGN!");
1556 case 64: {
1557     // AdvSIMD immediate moves cannot materialize our mask in a single
1558 // instruction for 64-bit elements. Instead, materialize zero and then
1559 // negate it.
1560 EltMask = 0;
1561 VecTy = LLT::fixed_vector(2, DstTy);
1562 break;
1563 }
1564 case 32:
1565 EltMask = 0x80000000ULL;
1566 VecTy = LLT::fixed_vector(4, DstTy);
1567 break;
1568 }
1569
1570 // Widen In1 and In2 to 128 bits. We want these to eventually become
1571 // INSERT_SUBREGs.
1572 auto Undef = MIRBuilder.buildUndef(VecTy);
1573 auto Zero = MIRBuilder.buildConstant(DstTy, 0);
1574 auto Ins1 = MIRBuilder.buildInsertVectorElement(
1575 VecTy, Undef, MI.getOperand(1).getReg(), Zero);
1576 auto Ins2 = MIRBuilder.buildInsertVectorElement(
1577 VecTy, Undef, MI.getOperand(2).getReg(), Zero);
1578
1579 // Construct the mask.
1580 auto Mask = MIRBuilder.buildConstant(VecTy, EltMask);
1581 if (DstSize == 64)
1582 Mask = MIRBuilder.buildFNeg(VecTy, Mask);
1583
1584 auto Sel = MIRBuilder.buildInstr(AArch64::G_BIT, {VecTy}, {Ins1, Ins2, Mask});
1585
1586 // Build an unmerge whose 0th elt is the original G_FCOPYSIGN destination. We
1587 // want this to eventually become an EXTRACT_SUBREG.
1588 SmallVector<Register, 2> DstRegs(1, Dst);
1589 for (unsigned I = 1, E = VecTy.getNumElements(); I < E; ++I)
1590 DstRegs.push_back(MRI.createGenericVirtualRegister(DstTy));
1591 MIRBuilder.buildUnmerge(DstRegs, Sel);
1592 MI.eraseFromParent();
1593 return true;
1594 }
1595