xref: /aosp_15_r20/external/XNNPACK/src/jit/aarch32-assembler.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include "xnnpack/aarch32-assembler.h"
7 #include "xnnpack/assembler.h"
8 #include "xnnpack/math.h"
9 
10 #include <cmath>
11 #include <cstddef>
12 
13 namespace xnnpack {
14 namespace aarch32 {
// Largest offset magnitude accepted by vldr/vstr: the instruction holds an imm8, and the offset is shifted right by 2
// when it is encoded.
constexpr int32_t kUint10Max = (1 << 10) - 1;
// Largest offset magnitude accepted by ldr/str: the instruction holds an imm12, with a separate bit for the sign.
constexpr int32_t kUint12Max = (1 << 12) - 1;
19 
// PC register contains current address of instruction + 8 (2 instructions).
constexpr ptrdiff_t kPCDelta = 8;
// Inclusive bounds of a signed 24-bit value, used for checking branch offsets.
constexpr ptrdiff_t kInt24Max = 8388607;
constexpr ptrdiff_t kInt24Min = -8388608;

// Returns true when a branch offset fits in a signed 24-bit value, i.e. when
// it lies in [kInt24Min, kInt24Max]. Both end points are representable in 24
// bits, so the comparison is inclusive.
bool branch_offset_valid(ptrdiff_t offset) {
  return offset >= kInt24Min && offset <= kInt24Max;
}
30 
invalid_register_list(DRegisterList regs)31 bool invalid_register_list(DRegisterList regs) {
32   return regs.length == 0 || regs.length > 16 || regs.start.code + regs.length > 32;
33 }
34 
invalid_register_list(SRegisterList regs)35 bool invalid_register_list(SRegisterList regs) {
36   return regs.length == 0 || regs.start.code + regs.length > 32;
37 }
38 
encode(SRegister r,uint32_t single_bit_pos,uint32_t four_bits_pos)39 uint32_t encode(SRegister r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
40   return r.d() << single_bit_pos | r.vd() << four_bits_pos;
41 }
42 
encode(DRegister r,uint32_t single_bit_pos,uint32_t four_bits_pos)43 uint32_t encode(DRegister r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
44   return r.d() << single_bit_pos | r.vd() << four_bits_pos;
45 }
46 
encode(DRegisterLane r,uint32_t single_bit_pos,uint32_t four_bits_pos)47 uint32_t encode(DRegisterLane r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
48   return r.d() << single_bit_pos | r.vd() << four_bits_pos;
49 }
50 
encode(QRegister r,uint32_t single_bit_pos,uint32_t four_bits_pos)51 uint32_t encode(QRegister r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
52   return r.d() << single_bit_pos | r.vd() << four_bits_pos;
53 }
54 
encode(SRegisterList regs,uint32_t single_bit_pos,uint32_t four_bits_pos)55 uint32_t encode(SRegisterList regs, uint32_t single_bit_pos, uint32_t four_bits_pos) {
56   const SRegister r = regs.start;
57   return r.d() << single_bit_pos | r.vd() << four_bits_pos | regs.length;
58 }
59 
encode(DRegisterList regs,uint32_t single_bit_pos,uint32_t four_bits_pos)60 uint32_t encode(DRegisterList regs, uint32_t single_bit_pos, uint32_t four_bits_pos) {
61   const DRegister r = regs.start;
62   return r.d() << single_bit_pos | r.vd() << four_bits_pos | regs.length * 2;
63 }
64 
encode_mem_puw(MemOperand op)65 uint32_t encode_mem_puw(MemOperand op) {
66   return op.p() << 24 | op.u() << 23 | op.w() << 21 | op.base().code << 16;
67 }
68 
69 // Return value of 0 is invalid, indicates error.
encode_regs_length_to_type(DRegisterList regs)70 uint32_t encode_regs_length_to_type(DRegisterList regs) {
71   switch (regs.length) {
72     case 1:
73       return 0x7;
74     case 2:
75       return 0xA;
76     case 3:
77       return 0x6;
78     case 4:
79       return 0x2;
80   }
81   return 0;
82 }
83 
add(CoreRegister rd,CoreRegister rn,CoreRegister rm)84 void Assembler::add(CoreRegister rd, CoreRegister rn, CoreRegister rm) {
85   emit32(kAL | 0x8 << 20 | rn.code << 16 | rd.code << 12 | rm.code);
86 }
87 
add(CoreRegister rd,CoreRegister rn,uint8_t imm)88 void Assembler::add(CoreRegister rd, CoreRegister rn, uint8_t imm) {
89   // Rotation = 0, since imm is limited to 8 bits and fits in encoding.
90   emit32(kAL | 0x28 << 20 | rn.code << 16 | rd.code << 12 | imm);
91 }
92 
adds(CoreRegister rd,CoreRegister rn,uint8_t imm)93 void Assembler::adds(CoreRegister rd, CoreRegister rn, uint8_t imm) {
94   // Rotation = 0, since imm is limited to 8 bits and fits in encoding.
95   emit32(kAL | 0x29 << 20 | rn.code << 16 | rd.code << 12 | imm);
96 }
97 
and_(CoreRegister rd,CoreRegister rn,uint8_t imm)98 void Assembler::and_(CoreRegister rd, CoreRegister rn, uint8_t imm) {
99   // Rotation = 0, since imm is limited to 8 bits and fits in encoding.
100   emit32(kAL | 1 << 25 | rn.code << 16 | rd.code << 12 | imm);
101 }
102 
b(Condition c,Label & l)103 void Assembler::b(Condition c, Label& l) {
104   if (l.bound) {
105     // Offset is relative to after this b instruction + kPCDelta.
106     const ptrdiff_t offset = l.offset - cursor_ - kPCDelta;
107     if (!branch_offset_valid(offset)) {
108       error_ = Error::kLabelOffsetOutOfBounds;
109       return;
110     }
111 
112     // No need to shift by 2 since our offset is already in terms of uint32_t.
113     emit32(c | 0xA << 24 | ((offset >> kInstructionSizeInBytesLog2) & 0x00FFFFFF));
114   } else {
115     if (!l.add_use(cursor_)) {
116       error_ = Error::kLabelHasTooManyUsers;
117       return;
118     }
119     // Emit 0 offset first, will patch it up when label is bound later.
120     emit32(c | 0xA << 24);
121   }
122 }
123 
bind(Label & l)124 void Assembler::bind(Label& l) {
125   if (error_ != Error::kNoError) {
126     return;
127   }
128 
129   if (l.bound) {
130     error_ = Error::kLabelAlreadyBound;
131     return;
132   }
133 
134   l.bound = true;
135   l.offset = cursor_;
136 
137   // Patch all users.
138   for (size_t i = 0; i < l.num_users; i++) {
139     byte* user = l.users[i];
140     const ptrdiff_t offset = l.offset - user - kPCDelta;
141     uint32_t* instr = reinterpret_cast<uint32_t*>(user);
142 
143     if (!branch_offset_valid(offset)) {
144       error_ = Error::kLabelOffsetOutOfBounds;
145       return;
146     }
147 
148     *instr |= (offset >> kInstructionSizeInBytesLog2) & 0x00FFFFFF;
149   }
150 }
151 
bic(CoreRegister rd,CoreRegister rn,uint8_t imm)152 void Assembler::bic(CoreRegister rd, CoreRegister rn, uint8_t imm) {
153   emit32(kAL | 0x03C00000 | rn.code << 16 | rd.code << 12 | imm);
154 }
155 
bx(CoreRegister rm)156 void Assembler::bx(CoreRegister rm) {
157   emit32(kAL | 0x12fff10 | rm.code);
158 }
159 
cmp(CoreRegister rn,uint8_t imm)160 void Assembler::cmp(CoreRegister rn, uint8_t imm) {
161   emit32(kAL | 0x35 << 20 | rn.code << 16 | imm);
162 }
163 
cmp(CoreRegister rn,CoreRegister rm)164 void Assembler::cmp(CoreRegister rn, CoreRegister rm) {
165   emit32(kAL | 0x01500000 | rn.code << 16 | rm.code);
166 }
167 
ldr(CoreRegister rt,MemOperand op,int32_t offset)168 void Assembler::ldr(CoreRegister rt, MemOperand op, int32_t offset) {
169   ldr(rt, MemOperand(op.base(), offset, AddressingMode::kPostIndexed));
170 }
171 
ldr(CoreRegister rt,MemOperand op)172 void Assembler::ldr(CoreRegister rt, MemOperand op) {
173   const int32_t offset = op.offset();
174   if (std::abs(offset) > kUint12Max) {
175     error_ = Error::kInvalidOperand;
176     return;
177   }
178 
179   emit32(kAL | 0x41 << 20 | encode_mem_puw(op) | rt.code << 12 | offset);
180 }
181 
ldrd(CoreRegister rt,CoreRegister rt2,MemOperand op)182 void Assembler::ldrd(CoreRegister rt, CoreRegister rt2, MemOperand op) {
183   const int32_t offset = op.offset();
184   if ((std::abs(op.offset()) > UINT8_MAX) || (rt.code + 1 != rt2.code)) {
185     error_ = Error::kInvalidOperand;
186     return;
187   }
188   const uint32_t offset_top = (offset & 0xF0) << 4;
189   const uint32_t offset_bot = (offset & 0xF);
190 
191   emit32(kAL | 0x004000D0 | encode_mem_puw(op) | rt.code << 12 | offset_top | offset_bot);
192 }
193 
mov(CoreRegister rd,CoreRegister rm)194 void Assembler::mov(CoreRegister rd, CoreRegister rm) {
195   mov(kAL, rd, rm);
196 }
197 
mov(Condition c,CoreRegister Rd,CoreRegister Rm)198 void Assembler::mov(Condition c, CoreRegister Rd, CoreRegister Rm) {
199   emit32(c | 0x1A << 20 | Rd.code << 12 | Rm.code);
200 }
201 
nop()202 void Assembler::nop() {
203   emit32(kAL | 0x0320F000);
204 }
205 
pld(MemOperand op)206 void Assembler::pld(MemOperand op) {
207   emit32(0xF550F000 | op.u() << 23 | op.base().code << 16 | op.offset());
208 }
209 
pop(CoreRegisterList regs)210 void Assembler::pop(CoreRegisterList regs) {
211   if (!regs.has_more_than_one_register()) {
212     // TODO(zhin): there is a different valid encoding for single register.
213     error_ = Error::kInvalidOperand;
214     return;
215   }
216 
217   emit32(kAL | 0x8BD << 16 | regs.list);
218 }
219 
push(CoreRegisterList regs)220 void Assembler::push(CoreRegisterList regs) {
221   if (!regs.has_more_than_one_register()) {
222     // TODO(zhin): there is a different valid encoding for single register.
223     error_ = Error::kInvalidOperand;
224     return;
225   }
226 
227   emit32(kAL | 0x92D << 16 | regs.list);
228 }
229 
str(CoreRegister rt,MemOperand op)230 void Assembler::str(CoreRegister rt, MemOperand op) {
231   const int32_t offset = op.offset();
232   if (std::abs(offset) > kUint12Max) {
233     error_ = Error::kInvalidOperand;
234     return;
235   }
236   emit32(kAL | 1 << 26 | encode_mem_puw(op) | rt.code << 12 | offset);
237 }
238 
sub(CoreRegister rd,CoreRegister rn,uint8_t imm)239 void Assembler::sub(CoreRegister rd, CoreRegister rn, uint8_t imm) {
240   emit32(kAL | 0x24 << 20 | rn.code << 16 | rd.code << 12 | imm);
241 }
242 
sub(CoreRegister rd,CoreRegister rn,CoreRegister rm)243 void Assembler::sub(CoreRegister rd, CoreRegister rn, CoreRegister rm) {
244   emit32(kAL | 0x4 << 20 | rn.code << 16 | rd.code << 12 | rm.code);
245 }
246 
subs(CoreRegister rd,CoreRegister rn,uint8_t imm)247 void Assembler::subs(CoreRegister rd, CoreRegister rn, uint8_t imm) {
248   // Rotation = 0, since imm is limited to 8 bits and fits in encoding.
249   emit32(kAL | 0x25 << 20 | rn.code << 16 | rd.code << 12 | imm);
250 }
251 
tst(CoreRegister rn,uint8_t imm)252 void Assembler::tst(CoreRegister rn, uint8_t imm) {
253   // Rotation = 0, since imm is limited to 8 bits and fits in encoding.
254   emit32(kAL | 0x31 << 20 | rn.code << 16 | imm);
255 }
256 
vabs_f32(QRegister qd,QRegister qm)257 void Assembler::vabs_f32(QRegister qd, QRegister qm) {
258   emit32(0xF3B90740 | encode(qd, 22, 12) | encode(qm, 5, 0));
259 }
260 
vadd_f32(QRegister qd,QRegister qn,QRegister qm)261 void Assembler::vadd_f32(QRegister qd, QRegister qn, QRegister qm) {
262   emit32(0xF2000D40 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
263 }
264 
vcmpe_f32(SRegister sd,SRegister sm)265 void Assembler::vcmpe_f32(SRegister sd, SRegister sm) {
266   emit32(kAL | 0x0EB40AC0 | encode(sd, 22, 12) | encode(sm, 5, 0));
267 }
268 
vcvt_f32_s32(QRegister qd,QRegister qm)269 void Assembler::vcvt_f32_s32(QRegister qd, QRegister qm) {
270   emit32(0xF3BB0640 | encode(qd, 22, 12) | encode(qm, 5, 0));
271 }
272 
vcvt_s32_f32(QRegister qd,QRegister qm)273 void Assembler::vcvt_s32_f32(QRegister qd, QRegister qm) {
274   emit32(0xF3BB0740 | encode(qd, 22, 12) | encode(qm, 5, 0));
275 }
276 
vcvtn_s32_f32(QRegister qd,QRegister qm)277 void Assembler::vcvtn_s32_f32(QRegister qd, QRegister qm) {
278   emit32(0xF3BB0140 | encode(qd, 22, 12) | encode(qm, 5, 0));
279 }
280 
vdup(DataSize size,QRegister qd,DRegisterLane dm)281 void Assembler::vdup(DataSize size, QRegister qd, DRegisterLane dm) {
282   uint8_t imm4 = 0;
283   switch (size) {
284     case k8:
285       if (dm.lane > 7) {
286         error_ = Error::kInvalidLaneIndex;
287         return;
288       }
289       imm4 = 1 | ((dm.lane & 0x7) << 1);
290       break;
291     case k16:
292       if (dm.lane > 3) {
293         error_ = Error::kInvalidLaneIndex;
294         return;
295       }
296       imm4 = 2 | ((dm.lane & 0x3) << 2);
297       break;
298     case k32:
299       if (dm.lane > 1) {
300         error_ = Error::kInvalidLaneIndex;
301         return;
302       }
303       imm4 = 4 | ((dm.lane & 0x1) << 3);
304       break;
305   }
306   emit32(0xF3B00C40 | imm4 << 16 | encode(qd, 22, 12) | encode(dm, 5, 0));
307 }
308 
vext_8(QRegister qd,QRegister qn,QRegister qm,uint8_t imm4)309 void Assembler::vext_8(QRegister qd, QRegister qn, QRegister qm, uint8_t imm4) {
310   if (imm4 > 15) {
311     error_ = Error::kInvalidOperand;
312     return;
313   }
314   emit32(0xF2B00040 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0) | imm4 << 8);
315 }
316 
vld1(DataSize size,DRegisterList regs,MemOperand op)317 void Assembler::vld1(DataSize size, DRegisterList regs, MemOperand op) {
318   const uint8_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
319   vld1(size, regs, op, CoreRegister{rm});
320 }
321 
vld1(DataSize size,DRegisterList regs,MemOperand op,CoreRegister rm)322 void Assembler::vld1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm) {
323   const uint8_t type = encode_regs_length_to_type(regs);
324   if (!type) {
325     error_ = Error::kInvalidRegisterListLength;
326     return;
327   }
328 
329   emit32(0xF4200000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm.code);
330 }
331 
vld1_32(DRegisterLane dd,MemOperand op)332 void Assembler::vld1_32(DRegisterLane dd, MemOperand op) {
333   if (dd.lane > 1) {
334     error_ = Error::kInvalidLaneIndex;
335     return;
336   }
337   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
338   emit32(kAL | 0xF4A00800 | dd.lane << 7 | encode(dd, 22, 12) | op.base().code << 16 | rm);
339 }
340 
vld1r_32(DRegisterList regs,MemOperand op)341 void Assembler::vld1r_32(DRegisterList regs, MemOperand op) {
342   if ((op.mode() == AddressingMode::kOffset && op.offset() != 0) || regs.length > 2) {
343     error_ = Error::kInvalidOperand;
344     return;
345   }
346 
347   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
348   emit32(0xF4A00C80 | encode(regs.start, 22, 12) | op.base().code << 16 | (regs.length - 1) << 5 | rm);
349 }
350 
vld2r_32(VLoadStoreRegList regs,MemOperand op)351 void Assembler::vld2r_32(VLoadStoreRegList regs, MemOperand op) {
352   if ((op.mode() == AddressingMode::kOffset && op.offset() != 0)) {
353     error_ = Error::kInvalidOperand;
354     return;
355   }
356   uint8_t spacing = regs.double_spaced ? 2 : 1;
357   if (regs.reg1.code != regs.reg2.code - spacing) {
358     error_ = Error::kInvalidOperand;
359     return;
360   }
361 
362   size_t t = spacing - 1;
363   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? op.base().code : 0xF;
364   emit32(0xF4A00D80 | encode(regs.reg1, 22, 12) | op.base().code << 16 | t << 5 | rm);
365 }
366 
367 
vld3r_32(VLoadStoreRegList regs,MemOperand op)368 void Assembler::vld3r_32(VLoadStoreRegList regs, MemOperand op) {
369   if ((op.mode() == AddressingMode::kOffset && op.offset() != 0)) {
370     error_ = Error::kInvalidOperand;
371     return;
372   }
373   uint8_t spacing = regs.double_spaced ? 2 : 1;
374   if (regs.reg1.code != regs.reg2.code - spacing || regs.reg2.code != regs.reg3.code - spacing) {
375     error_ = Error::kInvalidOperand;
376     return;
377   }
378 
379   size_t t = spacing - 1;
380   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? op.base().code : 0xF;
381   emit32(0xF4A00E80 | encode(regs.reg1, 22, 12) | op.base().code << 16 | t << 5 | rm);
382 }
383 
vldm(MemOperand rn,SRegisterList regs)384 void Assembler::vldm(MemOperand rn, SRegisterList regs) {
385   if (invalid_register_list(regs)) {
386     error_ = Error::kInvalidRegisterListLength;
387     return;
388   }
389   uint32_t w = (rn.mode() == AddressingMode::kOffset ? 0 : 1) << 21;
390   emit32(kAL | 0x0C900A00 | w | rn.base().code << 16 | encode(regs, 22, 12));
391 }
392 
vldm(MemOperand rn,DRegisterList regs)393 void Assembler::vldm(MemOperand rn, DRegisterList regs) {
394   if (invalid_register_list(regs)) {
395     error_ = Error::kInvalidRegisterListLength;
396     return;
397   }
398   uint32_t w = (rn.mode() == AddressingMode::kOffset ? 0 : 1) << 21;
399   emit32(kAL | 0x0C900B00 | w | rn.base().code << 16 | encode(regs, 22, 12));
400 }
401 
vldr(SRegister sd,MemOperand op)402 void Assembler::vldr(SRegister sd, MemOperand op) {
403   const uint32_t offset = std::abs(op.offset());
404   if (op.mode() != AddressingMode::kOffset || offset > kUint10Max || offset % 4 != 0) {
405     error_ = Error::kInvalidOperand;
406     return;
407   }
408 
409   emit32(kAL | 0x0D100A00 | op.u() << 23 | encode(sd, 22, 12) | op.base().code << 16 | offset >> 2);
410 }
411 
vldr(DRegister dd,MemOperand op)412 void Assembler::vldr(DRegister dd, MemOperand op) {
413   const uint32_t offset = std::abs(op.offset());
414   if (op.mode() != AddressingMode::kOffset || offset > kUint10Max || offset % 4 != 0) {
415     error_ = Error::kInvalidOperand;
416     return;
417   }
418 
419   emit32(kAL | 0x0D100B00 | op.u() << 23 | encode(dd, 22, 12) | op.base().code << 16 | offset >> 2);
420 }
421 
vmax_f32(QRegister qd,QRegister qn,QRegister qm)422 void Assembler::vmax_f32(QRegister qd, QRegister qn, QRegister qm) {
423   emit32(0xF2000F40 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
424 }
425 
vmax_s8(QRegister qd,QRegister qn,QRegister qm)426 void Assembler::vmax_s8(QRegister qd, QRegister qn, QRegister qm) {
427  emit32(0xF2000640 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
428 }
429 
vmin_f32(QRegister qd,QRegister qn,QRegister qm)430 void Assembler::vmin_f32(QRegister qd, QRegister qn, QRegister qm) {
431   emit32(0xF2200F40 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
432 }
433 
vmin_s8(QRegister qd,QRegister qn,QRegister qm)434 void Assembler::vmin_s8(QRegister qd, QRegister qn, QRegister qm) {
435  emit32(0xF2000650 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
436 }
437 
vmla_f32(SRegister sd,SRegister sn,SRegister sm)438 void Assembler::vmla_f32(SRegister sd, SRegister sn, SRegister sm) {
439   emit32(kAL | 0x0E000A00 | encode(sd, 22, 12) | encode (sn, 7, 16) | encode(sm, 5, 0));
440 }
441 
vmla_f32(QRegister qd,QRegister qn,DRegisterLane dm)442 void Assembler::vmla_f32(QRegister qd, QRegister qn, DRegisterLane dm) {
443   if (dm.lane > 1) {
444     error_ = Error::kInvalidLaneIndex;
445     return;
446   }
447   emit32(0xF3A00140 | encode(qd, 22, 12) | encode(qn, 7, 16) | dm.lane << 5 | dm.code);
448 }
449 
vmlal_s16(QRegister qd,DRegister dn,DRegisterLane dm)450 void Assembler::vmlal_s16(QRegister qd, DRegister dn, DRegisterLane dm) {
451   if (dm.lane > 3) {
452     error_ = Error::kInvalidLaneIndex;
453     return;
454   }
455   if (dm.code > 7) {
456     error_ = Error::kInvalidOperand;
457     return;
458   }
459 
460   uint8_t lane_top = dm.lane >> 1;
461   uint8_t lane_bot = dm.lane & 1;
462   emit32(0xF2900240 | encode(qd, 22, 12) | encode(dn, 7, 16) | lane_top << 5 | lane_bot << 3 | dm.code);
463 }
464 
vmov(QRegister qd,uint8_t imm)465 void Assembler::vmov(QRegister qd, uint8_t imm) {
466   if (imm != 0) {
467     error_ = Error::kInvalidOperand;
468     return;
469   }
470 
471   emit32(0xF2800050 | encode(qd, 22, 12));
472 }
473 
vmov(SRegister sd,SRegister sm)474 void Assembler::vmov(SRegister sd, SRegister sm) {
475   emit32(kAL | 0x0EB00A40 | encode(sd, 22, 12) | encode(sm, 5, 0));
476 }
477 
vmov(DRegister dm,CoreRegister rt,CoreRegister rt2)478 void Assembler::vmov(DRegister dm, CoreRegister rt, CoreRegister rt2) {
479   emit32(kAL | 0x0C400B10 | rt2.code << 16 | rt.code << 12 | encode(dm, 5, 0));
480 }
481 
vmov(DRegister dd,DRegister dm)482 void Assembler::vmov(DRegister dd, DRegister dm) {
483   emit32(0xF2200110 | encode(dd, 22, 12) | encode(dm, 7, 16) | encode(dm, 5, 0));
484 }
485 
vmov(QRegister qd,QRegister qm)486 void Assembler::vmov(QRegister qd, QRegister qm) {
487   emit32(0xF2200150 | encode(qd, 22, 12) | encode(qm, 7, 16) | encode(qm, 5, 0));
488 }
489 
vmov_f32(Condition c,SRegister sd,SRegister sm)490 void Assembler::vmov_f32(Condition c, SRegister sd, SRegister sm) {
491   emit32(c | 0x0EB00A40 | encode(sd, 22, 12) | encode(sm, 5, 0));
492 }
493 
vmov_f64(DRegister dd,DRegister dm)494 void Assembler::vmov_f64(DRegister dd, DRegister dm) {
495   emit32(kAL | 0x0EB00B40 | encode(dd, 22, 12) | encode(dm, 5, 0));
496 }
497 
vmovl_s8(QRegister qd,DRegister dm)498 void Assembler::vmovl_s8(QRegister qd, DRegister dm) {
499   emit32(0xF2880A10 | encode(qd, 22, 12) | encode(dm, 5, 0));
500 }
501 
vmrs(CoreRegister rt,SpecialFPRegister spec_reg)502 void Assembler::vmrs(CoreRegister rt, SpecialFPRegister spec_reg) {
503   emit32(kAL | 0x0EF00A10 | static_cast<uint32_t>(spec_reg) << 16 | rt.code << 12);
504 }
505 
vmul_f32(QRegister qd,QRegister qn,QRegister qm)506 void Assembler::vmul_f32(QRegister qd, QRegister qn, QRegister qm) {
507   emit32(0xF3000D50 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
508 }
509 
vneg_f32(QRegister qd,QRegister qm)510 void Assembler::vneg_f32(QRegister qd, QRegister qm) {
511   emit32(0xF3B907C0 | encode(qd, 22, 12) | encode(qm, 5, 0));
512 }
513 
vpop(DRegisterList regs)514 void Assembler::vpop(DRegisterList regs) {
515   if (invalid_register_list(regs)) {
516     error_ = Error::kInvalidRegisterListLength;
517     return;
518   }
519   emit32(kAL | encode(regs, 22, 12) | 0xCBD << 16 | 0xB << 8);
520 }
521 
vpush(DRegisterList regs)522 void Assembler::vpush(DRegisterList regs) {
523   if (invalid_register_list(regs)) {
524     error_ = Error::kInvalidRegisterListLength;
525     return;
526   }
527   emit32(kAL | encode(regs, 22, 12) | 0xD2D << 16 | 0xB << 8);
528 }
529 
vpush(SRegisterList regs)530 void Assembler::vpush(SRegisterList regs) {
531   if (invalid_register_list(regs)) {
532     error_ = Error::kInvalidRegisterListLength;
533     return;
534   }
535   emit32(kAL | encode(regs, 22, 12) | 0xD2D << 16 | 0xA << 8);
536 }
537 
vqadd_s16(QRegister qd,QRegister qn,QRegister qm)538 void Assembler::vqadd_s16(QRegister qd, QRegister qn, QRegister qm) {
539   emit32(0xF2100050 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
540 }
541 
vqdmulh_s32(QRegister qd,QRegister qn,DRegisterLane dm)542 void Assembler::vqdmulh_s32(QRegister qd, QRegister qn, DRegisterLane dm) {
543   if (dm.code > 15) {
544     error_ = Error::kInvalidOperand;
545     return;
546   }
547   if (dm.lane > 1) {
548     error_ = Error::kInvalidLaneIndex;
549     return;
550   }
551   emit32(0xF3A00C40 | encode(qd, 22, 12) | encode(qn, 7, 16) | dm.lane << 5 | dm.code);
552 }
553 
vqmovn_s16(DRegister dd,QRegister qm)554 void Assembler::vqmovn_s16(DRegister dd, QRegister qm) {
555   emit32(0xF3B20280 | encode(dd, 22, 12) | encode(qm, 5, 0));
556 }
557 
vqmovn_s32(DRegister dd,QRegister qm)558 void Assembler::vqmovn_s32(DRegister dd, QRegister qm) {
559   emit32(0xF3B60280 | encode(dd, 22, 12) | encode(qm, 5, 0));
560 }
561 
vqshl_s32(QRegister qd,QRegister qm,QRegister qn)562 void Assembler::vqshl_s32(QRegister qd, QRegister qm, QRegister qn) {
563   emit32(0xF2200450 | encode(qd, 22, 12) | encode(qm, 5, 0) | encode(qn, 7, 16));
564 }
565 
vrshl_s32(QRegister qd,QRegister qm,QRegister qn)566 void Assembler::vrshl_s32(QRegister qd, QRegister qm, QRegister qn) {
567   emit32(0xF2200540 | encode(qd, 22, 12) | encode(qm, 5, 0) | encode(qn, 7, 16));
568 }
569 
vsdot_s8(QRegister qd,QRegister qn,DRegisterLane dm)570 void Assembler::vsdot_s8(QRegister qd, QRegister qn, DRegisterLane dm) {
571   if (dm.lane > 1) {
572     error_ = Error::kInvalidLaneIndex;
573     return;
574   }
575   emit32(0xFE200D40 | encode(qd, 22, 12) | encode(qn, 7, 16) | dm.lane << 5 | dm.code);
576 }
577 
vst1(DataSize size,DRegisterList regs,MemOperand op)578 void Assembler::vst1(DataSize size, DRegisterList regs, MemOperand op) {
579   const uint8_t type = encode_regs_length_to_type(regs);
580   if (!type) {
581     error_ = Error::kInvalidRegisterListLength;
582     return;
583   }
584 
585   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
586   emit32(0xF4000000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm);
587 }
588 
vst1(DataSize size,DRegisterList regs,MemOperand op,CoreRegister rm)589 void Assembler::vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm) {
590   if (rm.code == 0b1101 || rm.code == 0b1111) {
591     error_ = Error::kInvalidOperand;
592     return;
593   }
594 
595   const uint8_t type = encode_regs_length_to_type(regs);
596   if (!type) {
597     error_ = Error::kInvalidRegisterListLength;
598     return;
599   }
600 
601   emit32(0xF4000000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm.code);
602 }
603 
vst1(DataSize size,DRegisterLane dd,MemOperand op)604 void Assembler::vst1(DataSize size, DRegisterLane dd, MemOperand op) {
605   if ((size == k8 && dd.lane > 7) || (size == k16 && dd.lane > 3) || (size == k32 && dd.lane > 1)) {
606     error_ = Error::kInvalidLaneIndex;
607     return;
608   }
609 
610   const uint8_t shift = size == k8 ? 5 : size == k16 ? 6 : 7;
611   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
612   emit32(0xF4800000 | encode(dd, 22, 12) | op.base().code << 16 | size << 10 | dd.lane << shift | rm);
613 }
614 
vstm(MemOperand rn,DRegisterList regs)615 void Assembler::vstm(MemOperand rn, DRegisterList regs) {
616   if (invalid_register_list(regs)) {
617     error_ = Error::kInvalidRegisterListLength;
618     return;
619   }
620   uint32_t w = (rn.mode() == AddressingMode::kOffset ? 0 : 1) << 21;
621   emit32(kAL | 0x0C800B00 | w | rn.base().code << 16 |  encode(regs.start, 22, 12) | regs.length << 1);
622 }
623 
vstr(SRegister rn,MemOperand op)624 void Assembler::vstr(SRegister rn, MemOperand op) {
625   const uint32_t offset = std::abs(op.offset());
626   if (op.mode() != AddressingMode::kOffset || offset > kUint10Max || offset % 4 != 0) {
627     error_ = Error::kInvalidOperand;
628     return;
629   }
630   emit32(kAL | 0x0D000A00 | op.u() << 23 | op.base().code << 16 | encode(rn, 22, 12) | offset >> 2);
631 }
632 
align(uint8_t n)633 void Assembler::align(uint8_t n) {
634   if (!is_po2(n) || (n % kInstructionSizeInBytes != 0)) {
635     error_ = Error::kInvalidOperand;
636     return;
637   }
638 
639   uintptr_t cursor = reinterpret_cast<uintptr_t>(cursor_);
640   const uintptr_t target = round_up_po2(cursor, n);
641   while (cursor < target) {
642     nop();
643     cursor += kInstructionSizeInBytes;
644   }
645 }
646 
647 }  // namespace aarch32
648 }  // namespace xnnpack
649