// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/aarch64-assembler.h>
#include <xnnpack/common.h>
#include <xnnpack/math.h>

#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>

namespace xnnpack {
namespace aarch64 {
// Min and max byte offsets encodable in the imm7 field of ldp/stp; the
// offset is shifted right by 3 when encoded.
constexpr int32_t kImm7Min = -512;
constexpr int32_t kImm7Max = 504;
constexpr uint32_t kImm7Mask = 0x7F;
// Max byte offset encodable in the imm12 field; the offset is shifted right
// by 3 when encoded.
constexpr int32_t kImm12Max = 32760;
constexpr uint32_t kUint12Max = 4095;

constexpr int32_t kInt9Max = 255;
constexpr int32_t kInt9Min = -256;
constexpr uint32_t kImm9Mask = 0x1FF;

// Constants used for checking branch offset bounds.
// Conditional bounds are +/-1MB.
constexpr ptrdiff_t kConditionalBranchImmMax = 1048572;
constexpr ptrdiff_t kConditionalBranchImmMin = -1048576;
// TBZ and TBNZ bounds are +/-32KB.
constexpr ptrdiff_t kTbxzImmMax = 32764;
constexpr ptrdiff_t kTbxzImmMin = -32768;
// Unconditional bounds are +/-128MB.
constexpr ptrdiff_t kUnconditionalBranchImmMax = 134217727;
constexpr ptrdiff_t kUnconditionalBranchImmMin = -134217728;

constexpr uint32_t kConditionalImmMask = 0x0007FFFF;
constexpr uint32_t kTbxzImmMask = 0x3FFF;
constexpr uint32_t kUnconditionalImmMask = 0x03FFFFFF;

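// Illustrative sanity checks (not in the original source): each branch bound
// above is the extreme byte offset representable by that encoding's signed
// word immediate (19 bits for conditional, 14 for tbz/tbnz, 26 for
// unconditional), scaled by the 4-byte instruction size.
static_assert(kConditionalBranchImmMax == (1 << 18) * 4 - 4, "19-bit signed word offset");
static_assert(kTbxzImmMax == (1 << 13) * 4 - 4, "14-bit signed word offset");
static_assert(kUnconditionalBranchImmMax == (1 << 25) * 4 - 4, "26-bit signed word offset");

// The helpers below shift a register code or flag into its bit field within
// a 32-bit instruction word.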
template <typename Reg> inline uint32_t rd(Reg rn) { return rn.code; }
template <typename Reg> inline uint32_t rt(Reg rn) { return rn.code; }
template <typename Reg> inline uint32_t rt2(Reg rn) { return rn.code << 10; }
template <typename Reg> inline uint32_t rm(Reg rn) { return rn.code << 16; }
template <typename Reg> inline uint32_t rn(Reg rn) { return rn.code << 5; }
inline uint32_t q(VRegister vt) { return vt.q << 30; }
inline uint32_t size(VRegister vt) { return vt.size << 10; }
inline uint32_t fp_sz(VRegister vn) { return vn.is_s() ? 0 : 1 << 22; }
inline uint32_t postindex(MemOperand op) { return (op.mode == AddressingMode::kPostIndex) ? 0 : 1 << 24; }
inline uint32_t wb(MemOperand op) { return op.mode == AddressingMode::kOffset ? 0 : 1 << 23; }

inline uint32_t imm9(int32_t imm) {
  assert(imm >= kInt9Min && imm <= kInt9Max);
  return (imm & kImm9Mask) << 12;
}

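// Example (illustrative): imm9(-8) places the 9-bit two's-complement value
// 0x1F8 at bits 20:12, i.e. (-8 & 0x1FF) << 12 == 0x1F8000.
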
inline bool is_same_shape(VRegister vt1, VRegister vt2) {
  return vt1.size == vt2.size && vt1.q == vt2.q;
}

template <typename Reg, typename... Regs>
inline bool is_same_shape(Reg reg1, Reg reg2, Regs... regs) {
  return is_same_shape(reg1, reg2) && is_same_shape(reg2, regs...);
}

inline bool is_same_shape(VRegisterList vs) {
  switch (vs.length) {
    case 1:
      return true;
    case 2:
      return is_same_shape(vs.vt1, vs.vt2);
    case 3:
      return is_same_shape(vs.vt1, vs.vt2, vs.vt3);
    case 4:
      return is_same_shape(vs.vt1, vs.vt2, vs.vt3, vs.vt4);
    default:
      XNN_UNREACHABLE;
  }
}

inline bool is_same_data_type(VRegister vt1, VRegisterLane vt2) {
  return vt1.size == vt2.size;
}

inline bool is_consecutive(VRegister vt1, VRegister vt2) {
  return (vt1.code + 1) % 32 == vt2.code;
}

template <typename Reg, typename... Regs>
inline bool is_consecutive(Reg reg1, Reg reg2, Regs... regs) {
  return is_consecutive(reg1, reg2) && is_consecutive(reg2, regs...);
}

inline bool is_consecutive(VRegisterList vs) {
  switch (vs.length) {
    case 1:
      return true;
    case 2:
      return is_consecutive(vs.vt1, vs.vt2);
    case 3:
      return is_consecutive(vs.vt1, vs.vt2, vs.vt3);
    case 4:
      return is_consecutive(vs.vt1, vs.vt2, vs.vt3, vs.vt4);
    default:
      XNN_UNREACHABLE;
  }
}

// Check that a branch offset is within the encodable range for the given
// branch type.
inline bool branch_offset_valid(ptrdiff_t offset, BranchType branch_type) {
  switch (branch_type) {
    case BranchType::kConditional:
      return offset < kConditionalBranchImmMax && offset > kConditionalBranchImmMin;
    case BranchType::kTbxz:
      return offset < kTbxzImmMax && offset > kTbxzImmMin;
    case BranchType::kUnconditional:
      return offset < kUnconditionalBranchImmMax && offset > kUnconditionalBranchImmMin;
    default:
      XNN_UNREACHABLE;
  }
  return false;
}

inline BranchType instruction_branch_type(uint32_t instr) {
  const uint32_t masked = instr & 0xFE000000;
  switch (masked) {
    case 0xB6000000:
    case 0x36000000:
      return BranchType::kTbxz;
    case 0x54000000:
      return BranchType::kConditional;
    case 0x14000000:
    case 0x16000000:
      return BranchType::kUnconditional;
    default:
      XNN_UNREACHABLE;
  }
}

inline uint32_t mask(BranchType branch_type) {
  switch (branch_type) {
    case BranchType::kConditional:
      return kConditionalImmMask;
    case BranchType::kTbxz:
      return kTbxzImmMask;
    case BranchType::kUnconditional:
      return kUnconditionalImmMask;
    default:
      XNN_UNREACHABLE;
  }
}

inline uint8_t shift(BranchType branch_type) {
  switch (branch_type) {
    case BranchType::kConditional:
      return 5;
    case BranchType::kTbxz:
      return 5;
    case BranchType::kUnconditional:
      return 0;
    default:
      XNN_UNREACHABLE;
  }
}

inline uint32_t branch_imm(ptrdiff_t offset, BranchType bt) {
  return ((offset >> kInstructionSizeInBytesLog2) & mask(bt)) << shift(bt);
}

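// Example (illustrative): for a conditional branch 8 bytes forward,
// branch_imm(8, BranchType::kConditional) == ((8 >> 2) & 0x7FFFF) << 5 ==
// 0x40, which is OR-ed into the b.cond opcode.
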
inline uint32_t hl(VRegisterLane vl) {
  if (vl.is_s()) {
    return (vl.lane & 1) << 21 | ((vl.lane & 2) << 10);
  } else {
    return (vl.lane & 1) << 11;
  }
}

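// In the by-element encodings, a single-precision lane index is split as H:L
// with H at bit 11 and L at bit 21; a double-precision index uses only H.
// E.g. lane 3 of an S register yields (3 & 1) << 21 | (3 & 2) << 10 ==
// 0x200800.
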
inline bool lane_index_valid(uint8_t q, uint8_t size, uint8_t lane) {
  // The logic here is something like:
  // if (q && size == 0) {
  //   return lane < 16;
  // } else if (q && size == 1) {
  //   return lane < 8;
  // } else if (q && size == 2) {
  //   return lane < 4;
  // } else if (q && size == 3) {
  //   return lane < 2;
  // }
  // then repeat for !q with the maximum lane count halved, which collapses
  // into this formula.
  return lane < ((q + 1) << (3 - size));
}

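// Example (illustrative): a 128-bit register (q == 1) of 32-bit lanes
// (size == 2) has (1 + 1) << (3 - 2) == 4 lanes, so indices 0-3 are valid.
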
inline uint8_t load_store_opcode(uint8_t register_length) {
  switch (register_length) {
    case 1:
      return 0x7;
    case 2:
      return 0xA;
    case 3:
      return 0x6;
    case 4:
      return 0x2;
    default:
      XNN_UNREACHABLE;
  }
}

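// The returned values are the LD1/ST1 (multiple structures) opcode field:
// 0b0111 for one register, 0b1010 for two, 0b0110 for three, and 0b0010 for
// four.
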
inline bool imm7_offset_valid(int32_t imm, XRegister) {
  return imm >= kImm7Min && imm <= kImm7Max && (imm & 0x7) == 0;
}

inline bool imm7_offset_valid(int32_t imm, DRegister) {
  return imm >= kImm7Min && imm <= kImm7Max && (imm & 0x7) == 0;
}

inline bool imm7_offset_valid(int32_t imm, QRegister) {
  return imm >= (kImm7Min * 2) && imm <= (kImm7Max * 2) && (imm & 0xF) == 0;
}

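// Note: Q registers are 16 bytes, so their ldp/stp imm7 offset is scaled by
// 16 rather than 8, e.g. an offset of 32 encodes as 32 >> 4 == 2.
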
// Base instructions.

void Assembler::add(XRegister xd, XRegister xn, uint16_t imm12) {
  // The instruction supports larger immediates via an optional left shift by
  // 12, but the kernels don't use that.
  if (imm12 > kUint12Max) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x91000000 | imm12 << 10 | rn(xn) | rd(xd));
}

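// Example (illustrative, assuming the x0/x1 register constants from the
// header): add(x0, x1, 16) emits
// 0x91000000 | 16 << 10 | 1 << 5 | 0 == 0x91004020.
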
void Assembler::add(XRegister xd, XRegister xn, XRegister xm) {
  emit32(0x8B000000 | rd(xd) | rn(xn) | rm(xm));
}

void Assembler::b(Label& l) {
  return branch_to_label(0x14000000, BranchType::kUnconditional, l);
}

void Assembler::cmp(XRegister xn, uint16_t imm12) {
  if (imm12 > kUint12Max) {
    error_ = Error::kInvalidOperand;
    return;
  }
  emit32(0xF100001F | imm12 << 10 | rn(xn));
}

void Assembler::cmp(XRegister xn, XRegister xm) {
  emit32(0xEB00001F | rm(xm) | rn(xn));
}

void Assembler::csel(XRegister xd, XRegister xn, XRegister xm, Condition c) {
  emit32(0x9A800000 | rm(xm) | c << 12 | rn(xn) | rd(xd));
}

void Assembler::hlt() {
  emit32(0xD4400000);
}

void Assembler::ldp(XRegister xt1, XRegister xt2, MemOperand xn) {
  if (!imm7_offset_valid(xn.offset, xt1)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  const uint32_t offset = (xn.offset >> 3) & kImm7Mask;

  emit32(0xA8400000 | postindex(xn) | wb(xn) | offset << 15 | rt2(xt2) | rn(xn.base) | xt1.code);
}

void Assembler::ldp(XRegister xt1, XRegister xt2, MemOperand xn, int32_t imm) {
  if (xn.offset != 0) {
    error_ = Error::kInvalidOperand;
    return;
  }
  return ldp(xt1, xt2, {xn.base, imm, AddressingMode::kPostIndex});
}

void Assembler::ldr(XRegister xt, MemOperand xn) {
  const int32_t imm = xn.offset;
  if (xn.mode != AddressingMode::kOffset || imm < 0 || imm > (kUint12Max << 3) || (imm & 7) != 0) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0xF9400000 | imm >> 3 << 10 | rn(xn.base) | xt.code);
}

void Assembler::ldr(XRegister xt, MemOperand xn, int32_t imm) {
  if (imm < kInt9Min || imm > kInt9Max) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0xF8400400 | imm9(imm) | rn(xn.base) | rt(xt));
}

void Assembler::mov(XRegister xd, XRegister xn) {
  emit32(0xAA0003E0 | rm(xn) | rd(xd));
}

void Assembler::nop() {
  emit32(0xD503201F);
}

void Assembler::prfm(PrefetchOp prfop, MemOperand xn) {
  if (xn.offset < 0 || xn.offset > kImm12Max) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0xF9800000 | xn.offset >> 3 << 10 | rn(xn.base) | prfop);
}

void Assembler::ret() {
  emit32(0xD65F0000 | rn(x30));
}

void Assembler::stp(XRegister xt1, XRegister xt2, MemOperand xn) {
  if (!imm7_offset_valid(xn.offset, xt1)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  const uint32_t offset = (xn.offset >> 3) & kImm7Mask;
  emit32(0xA9000000 | wb(xn) | offset << 15 | rt2(xt2) | rn(xn.base) | rt(xt1));
}

void Assembler::str(XRegister xt1, MemOperand xn) {
  const int32_t offset = xn.offset;
  if (xn.mode == AddressingMode::kPreIndex) {
    if (offset < kInt9Min || offset > kInt9Max) {
      error_ = Error::kInvalidOperand;
      return;
    }
    emit32(0xF8000C00 | imm9(offset) | rn(xn.base) | rt(xt1));
  } else if (xn.mode == AddressingMode::kOffset) {
    if (offset < 0 || offset > kImm12Max || offset % 8 != 0) {
      error_ = Error::kInvalidOperand;
      return;
    }
    emit32(0xF9000000 | offset >> 3 << 10 | rn(xn.base) | rt(xt1));
  } else {
    XNN_UNREACHABLE;
  }
}

void Assembler::sub(XRegister xd, XRegister xn, XRegister xm) {
  emit32(0xCB000000 | rm(xm) | rn(xn) | rd(xd));
}

void Assembler::subs(XRegister xd, XRegister xn, uint16_t imm12) {
  if (imm12 > kUint12Max) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0xF1000000 | imm12 << 10 | rn(xn) | rd(xd));
}

void Assembler::tbnz(XRegister xd, uint8_t bit, Label& l) {
  return tb_helper(0x37000000, xd, bit, l);
}

void Assembler::tbz(XRegister xd, uint8_t bit, Label& l) {
  return tb_helper(0x36000000, xd, bit, l);
}

void Assembler::tst(XRegister xn, uint8_t imm) {
  // The bitmask-immediate encoding is quite complicated; we only support
  // immediates of the form 2^n - 1, which is all the assembly microkernels
  // use.
  uint32_t imm_po2 = imm + 1;
  if (!is_po2(imm_po2)) {
    error_ = Error::kUnimplemented;
    return;
  }

  const uint32_t imm_s = (math_ctz_u32(imm_po2) - 1) << 10;
  emit32(0xF240001F | imm_s | rn(xn));
}

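// Example (illustrative, assuming the x0 register constant from the header):
// tst(x0, 3) tests the low two bits; imm_po2 == 4, so
// imm_s == (2 - 1) << 10 and the emitted word is 0xF240041F.
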
// SIMD instructions.

void Assembler::dup(DRegister dd, VRegisterLane vn) {
  if (vn.size != 3 || vn.lane > 1) {
    error_ = Error::kInvalidOperand;
    return;
  }
  const uint8_t imm5 = 0b1000 | (vn.lane & 1) << 4;
  emit32(0x5E000400 | imm5 << 16 | rn(vn) | rd(dd));
}

void Assembler::fabs(VRegister vd, VRegister vn) {
  if (!is_same_shape(vd, vn)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x0EA0F800 | q(vd) | fp_sz(vn) | rn(vn) | rd(vd));
}

void Assembler::fadd(VRegister vd, VRegister vn, VRegister vm) {
  if (!is_same_shape(vd, vn, vm)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x0E20D400 | q(vd) | fp_sz(vn) | rm(vm) | rn(vn) | rd(vd));
}

void Assembler::fmax(VRegister vd, VRegister vn, VRegister vm) {
  if (!is_same_shape(vd, vn, vm)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x0E20F400 | q(vd) | fp_sz(vn) | rm(vm) | rn(vn) | rd(vd));
}

void Assembler::fmin(VRegister vd, VRegister vn, VRegister vm) {
  if (!is_same_shape(vd, vn, vm)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x0EA0F400 | q(vd) | fp_sz(vn) | rm(vm) | rn(vn) | rd(vd));
}

void Assembler::fmla(VRegister vd, VRegister vn, VRegisterLane vm) {
  if (!is_same_shape(vd, vn) || !is_same_data_type(vd, vm)) {
    error_ = Error::kInvalidOperand;
    return;
  }
  if (!lane_index_valid(vd.q, vm.size, vm.lane)) {
    error_ = Error::kInvalidLaneIndex;
    return;
  }

  emit32(0x0F801000 | q(vd) | fp_sz(vd) | hl(vm) | rm(vm) | rn(vn) | rd(vd));
}

void Assembler::fmul(VRegister vd, VRegister vn, VRegister vm) {
  if (!is_same_shape(vd, vn, vm)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x2E20DC00 | q(vd) | fp_sz(vn) | rm(vm) | rn(vn) | rd(vd));
}

void Assembler::fneg(VRegister vd, VRegister vn) {
  if (!is_same_shape(vd, vn)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x2EA0F800 | q(vd) | fp_sz(vn) | rn(vn) | rd(vd));
}

void Assembler::ld1(VRegisterList vs, MemOperand xn, int32_t imm) {
  VRegister vt = vs.vt1;

  if (!is_same_shape(vs) || !is_consecutive(vs)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  // imm must match number of bytes loaded.
  if ((vt.q + 1) * 8 * vs.length != imm) {
    error_ = Error::kInvalidOperand;
    return;
  }

  const uint8_t opcode = load_store_opcode(vs.length);

  emit32(0x0CDF0000 | q(vt) | opcode << 12 | size(vt) | rn(xn.base) | rt(vt));
}

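// Example (illustrative): loading two 128-bit registers post-increments the
// base by (1 + 1) * 8 * 2 == 32 bytes, so imm must be 32.
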
void Assembler::ld1r(VRegisterList xs, MemOperand xn) {
  if (xs.length != 1 || xn.offset != 0) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x0D40C000 | q(xs.vt1) | size(xs.vt1) | rn(xn.base) | xs.vt1.code);
}

void Assembler::ld2r(VRegisterList xs, MemOperand xn) {
  if (xs.length != 2 || !is_same_shape(xs.vt1, xs.vt2) || xn.offset != 0 || !is_consecutive(xs)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x0D60C000 | q(xs.vt1) | size(xs.vt1) | rn(xn.base) | xs.vt1.code);
}

void Assembler::ld3r(VRegisterList xs, MemOperand xn) {
  if (xs.length != 3 || !is_same_shape(xs.vt1, xs.vt2, xs.vt3) || xn.offset != 0 || !is_consecutive(xs)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x0D40E000 | q(xs.vt1) | size(xs.vt1) | rn(xn.base) | xs.vt1.code);
}

void Assembler::ldp(DRegister dt1, DRegister dt2, MemOperand xn) {
  if (!imm7_offset_valid(xn.offset, dt1)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  const uint32_t offset = (xn.offset >> 3) & kImm7Mask;
  emit32(0x6C400000 | postindex(xn) | wb(xn) | offset << 15 | rt2(dt2) | rn(xn.base) | rt(dt1));
}

void Assembler::ldp(DRegister dt1, DRegister dt2, MemOperand xn, int32_t imm) {
  return ldp(dt1, dt2, {xn.base, imm, AddressingMode::kPostIndex});
}

void Assembler::ldp(QRegister qt1, QRegister qt2, MemOperand xn, int32_t imm) {
  if (!imm7_offset_valid(imm, qt1)) {
    error_ = Error::kInvalidOperand;
    return;
  }
  const uint32_t offset = (imm >> 4) & kImm7Mask;

  emit32(0xACC00000 | offset << 15 | rt2(qt2) | rn(xn.base) | qt1.code);
}

void Assembler::ldr(DRegister dt, MemOperand xn, int32_t imm) {
  return ldr(/*size=*/3, /*opc=*/1, xn, imm, dt.code);
}

void Assembler::ldr(QRegister qt, MemOperand xn, int32_t imm) {
  return ldr(/*size=*/0, /*opc=*/3, xn, imm, qt.code);
}

void Assembler::ldr(SRegister st, MemOperand xn, int32_t imm) {
  return ldr(/*size=*/2, /*opc=*/1, xn, imm, st.code);
}

void Assembler::mov(VRegister vd, VRegister vn) {
  if (!is_same_shape(vd, vn)) {
    error_ = Error::kInvalidOperand;
    return;
  }
  emit32(0x0EA01C00 | q(vd) | rm(vn) | rn(vn) | rd(vd));
}

void Assembler::movi(VRegister vd, uint8_t imm) {
  if (imm != 0) {
    error_ = Error::kUnimplemented;
    return;
  }

  uint32_t cmode = 0;
  switch (vd.size) {
    case 0:
      cmode = 0xE;
      break;
    case 1:
      cmode = 0x8;
      break;
    case 2:
      cmode = 0x0;
      break;
    default:
      error_ = Error::kUnimplemented;
      return;
  }

  emit32(0x0F000400 | q(vd) | cmode << 12 | vd.code);
}

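// The cmode values above come from the AdvSIMD modified-immediate encoding:
// 0xE replicates the byte immediate into 8-bit lanes, 0x8 into 16-bit lanes,
// and 0x0 into 32-bit lanes; only imm == 0 is supported here.
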
void Assembler::st1(VRegisterList vs, MemOperand xn, XRegister xm) {
  if (!is_same_shape(vs) || !is_consecutive(vs)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  VRegister vt = vs.vt1;

  const uint8_t opcode = load_store_opcode(vs.length);
  emit32(0x0C800000 | q(vt) | rm(xm) | opcode << 12 | size(vt) | rn(xn.base) | rt(vt));
}

void Assembler::stp(DRegister dt1, DRegister dt2, MemOperand xn) {
  if (!imm7_offset_valid(xn.offset, dt1)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  const uint32_t offset = (xn.offset >> 3) & kImm7Mask;
  emit32(0x6D000000 | wb(xn) | offset << 15 | rt2(dt2) | rn(xn.base) | rt(dt1));
}

void Assembler::stp(QRegister qt1, QRegister qt2, MemOperand xn) {
  if (!imm7_offset_valid(xn.offset, qt1)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  const uint32_t offset = (xn.offset >> 4) & kImm7Mask;
  emit32(0xAD000000 | wb(xn) | offset << 15 | rt2(qt2) | rn(xn.base) | rt(qt1));
}

void Assembler::stp(QRegister qt1, QRegister qt2, MemOperand xn, int32_t imm) {
  if (!imm7_offset_valid(imm, qt1)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  const uint32_t offset = (imm >> 4) & kImm7Mask;
  emit32(0xAC800000 | offset << 15 | rt2(qt2) | rn(xn.base) | rt(qt1));
}

void Assembler::str(DRegister dt, MemOperand xn, int32_t imm) {
  return str(/*size=*/3, /*opc=*/0, xn, imm, dt.code);
}

void Assembler::str(QRegister qt, MemOperand xn, int32_t imm) {
  return str(/*size=*/0, /*opc=*/2, xn, imm, qt.code);
}

void Assembler::str(SRegister st, MemOperand xn) {
  const int32_t imm = xn.offset;
  if (imm < 0 || imm > (kUint12Max << 2) || (imm & 0x3) != 0) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0xBD000000 | imm >> 2 << 10 | rn(xn.base) | rt(st));
}

void Assembler::str(SRegister st, MemOperand xn, int32_t imm) {
  return str(/*size=*/2, /*opc=*/0, xn, imm, st.code);
}

void Assembler::align(uint8_t n, AlignInstruction instr) {
  if (!is_po2(n) || (n % kInstructionSizeInBytes != 0)) {
    error_ = Error::kInvalidOperand;
    return;
  }

  uintptr_t cursor = reinterpret_cast<uintptr_t>(cursor_);
  const uintptr_t target = round_up_po2(cursor, n);
  while (cursor < target) {
    switch (instr) {
      case AlignInstruction::kHlt:
        hlt();
        break;
      case AlignInstruction::kNop:
        nop();
        break;
      default:
        XNN_UNREACHABLE;
    }
    cursor += kInstructionSizeInBytes;
  }
}

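// Example (illustrative): with the cursor 4 bytes past a 16-byte boundary,
// align(16, AlignInstruction::kNop) emits three nops to advance it to the
// next boundary.
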
void Assembler::bind(Label& l) {
  if (error_ != Error::kNoError) {
    return;
  }

  if (l.bound) {
    error_ = Error::kLabelAlreadyBound;
    return;
  }

  l.bound = true;
  l.offset = cursor_;

  // Patch all users.
  for (size_t i = 0; i < l.num_users; i++) {
    byte* user = l.users[i];
    const ptrdiff_t offset = l.offset - user;
    uint32_t* instr = reinterpret_cast<uint32_t*>(user);

    const BranchType bt = instruction_branch_type(*instr);
    if (!branch_offset_valid(offset, bt)) {
      error_ = Error::kLabelOffsetOutOfBounds;
      return;
    }

    *instr |= branch_imm(offset, bt);
  }
}

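// Forward branches to an unbound label are emitted with a zero immediate
// field (see branch_to_label below); bind() OR-s the now-known offset into
// each recorded user instruction.
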
void Assembler::b(Condition c, Label& l) {
  return branch_to_label(0x54000000 | c, BranchType::kConditional, l);
}

void Assembler::branch_to_label(uint32_t opcode, BranchType bt, Label& l) {
  if (l.bound) {
    const ptrdiff_t offset = l.offset - cursor_;
    if (!branch_offset_valid(offset, bt)) {
      error_ = Error::kLabelOffsetOutOfBounds;
      return;
    }
    emit32(opcode | branch_imm(offset, bt));
  } else {
    if (!l.add_use(cursor_)) {
      error_ = Error::kLabelHasTooManyUsers;
      return;
    }
    emit32(opcode);
  }
}

void Assembler::ldr(uint32_t size, uint32_t opc, MemOperand xn, int32_t imm, uint8_t rt_code) {
  if (xn.mode != AddressingMode::kOffset || xn.offset != 0 || imm < kInt9Min || imm > kInt9Max) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x3C400400 | size << 30 | opc << 22 | imm9(imm) | rn(xn.base) | rt_code);
}

void Assembler::str(uint32_t size, uint32_t opc, MemOperand xn, int32_t imm, uint8_t rt_code) {
  if (imm < kInt9Min || imm > kInt9Max) {
    error_ = Error::kInvalidOperand;
    return;
  }

  emit32(0x3C000400 | size << 30 | opc << 22 | imm9(imm) | rn(xn.base) | rt_code);
}

void Assembler::tb_helper(uint32_t op, XRegister xd, uint8_t bit, Label& l) {
  if (bit > 63) {
    error_ = Error::kInvalidOperand;
    return;
  }

  const uint32_t bit_pos = (bit & 0x20) >> 5 << 31 | (bit & 0x1F) << 19;
  return branch_to_label(op | bit_pos | xd.code, BranchType::kTbxz, l);
}

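// In tbz/tbnz, bit 5 of the tested bit number is encoded at instruction bit
// 31 (b5) and the low five bits at bits 23:19 (b40), which is what bit_pos
// computes above.
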
}  // namespace aarch64
}  // namespace xnnpack