xref: /aosp_15_r20/external/mesa3d/src/nouveau/compiler/nak/sm50.rs (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 // Copyright © 2023 Collabora, Ltd.
2 // SPDX-License-Identifier: MIT
3 
4 use crate::ir::*;
5 use crate::legalize::{
6     src_is_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers, LegalizeBuilder,
7 };
8 use bitview::*;
9 
10 use std::collections::HashMap;
11 use std::ops::Range;
12 
/// Shader model implementation covering the SM50 family of encodings.
pub struct ShaderModel50 {
    /// Shader model number; `ShaderModel50::new` asserts it is in `50..70`.
    sm: u8,
}
16 
17 impl ShaderModel50 {
new(sm: u8) -> Self18     pub fn new(sm: u8) -> Self {
19         assert!(sm >= 50 && sm < 70);
20         Self { sm }
21     }
22 }
23 
24 impl ShaderModel for ShaderModel50 {
sm(&self) -> u825     fn sm(&self) -> u8 {
26         self.sm
27     }
28 
num_regs(&self, file: RegFile) -> u3229     fn num_regs(&self, file: RegFile) -> u32 {
30         match file {
31             RegFile::GPR => 255,
32             RegFile::UGPR => 0,
33             RegFile::Pred => 7,
34             RegFile::UPred => 0,
35             RegFile::Carry => 1,
36             RegFile::Bar => 0,
37             RegFile::Mem => RegRef::MAX_IDX + 1,
38         }
39     }
40 
crs_size(&self, max_crs_depth: u32) -> u3241     fn crs_size(&self, max_crs_depth: u32) -> u32 {
42         if max_crs_depth <= 16 {
43             0
44         } else if max_crs_depth <= 32 {
45             1024
46         } else {
47             ((max_crs_depth + 32) * 16).next_multiple_of(512)
48         }
49     }
50 
op_can_be_uniform(&self, _op: &Op) -> bool51     fn op_can_be_uniform(&self, _op: &Op) -> bool {
52         false
53     }
54 
legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op)55     fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op) {
56         as_sm50_op_mut(op).legalize(b);
57     }
58 
encode_shader(&self, s: &Shader<'_>) -> Vec<u32>59     fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32> {
60         encode_sm50_shader(self, s)
61     }
62 }
63 
64 trait SM50Op {
legalize(&mut self, b: &mut LegalizeBuilder)65     fn legalize(&mut self, b: &mut LegalizeBuilder);
encode(&self, e: &mut SM50Encoder<'_>)66     fn encode(&self, e: &mut SM50Encoder<'_>);
67 }
68 
69 struct SM50Encoder<'a> {
70     sm: &'a ShaderModel50,
71     ip: usize,
72     labels: &'a HashMap<Label, usize>,
73     inst: [u32; 2],
74     sched: u32,
75 }
76 
77 impl BitViewable for SM50Encoder<'_> {
bits(&self) -> usize78     fn bits(&self) -> usize {
79         BitView::new(&self.inst).bits()
80     }
81 
get_bit_range_u64(&self, range: Range<usize>) -> u6482     fn get_bit_range_u64(&self, range: Range<usize>) -> u64 {
83         BitView::new(&self.inst).get_bit_range_u64(range)
84     }
85 }
86 
87 impl BitMutViewable for SM50Encoder<'_> {
set_bit_range_u64(&mut self, range: Range<usize>, val: u64)88     fn set_bit_range_u64(&mut self, range: Range<usize>, val: u64) {
89         BitMutView::new(&mut self.inst).set_bit_range_u64(range, val);
90     }
91 }
92 
93 impl SetFieldU64 for SM50Encoder<'_> {
set_field_u64(&mut self, range: Range<usize>, val: u64)94     fn set_field_u64(&mut self, range: Range<usize>, val: u64) {
95         BitMutView::new(&mut self.inst).set_field_u64(range, val);
96     }
97 }
98 
99 impl SM50Encoder<'_> {
set_opcode(&mut self, opcode: u16)100     fn set_opcode(&mut self, opcode: u16) {
101         self.set_field(48..64, opcode);
102     }
103 
set_pred_reg(&mut self, range: Range<usize>, reg: RegRef)104     fn set_pred_reg(&mut self, range: Range<usize>, reg: RegRef) {
105         assert!(range.len() == 3);
106         assert!(reg.file() == RegFile::Pred);
107         assert!(reg.base_idx() <= 7);
108         assert!(reg.comps() == 1);
109         self.set_field(range, reg.base_idx());
110     }
111 
set_pred(&mut self, pred: &Pred)112     fn set_pred(&mut self, pred: &Pred) {
113         assert!(!pred.is_false());
114         self.set_pred_reg(
115             16..19,
116             match pred.pred_ref {
117                 PredRef::None => RegRef::zero(RegFile::Pred, 1),
118                 PredRef::Reg(reg) => reg,
119                 PredRef::SSA(_) => panic!("SSA values must be lowered"),
120             },
121         );
122         self.set_bit(19, pred.pred_inv);
123     }
124 
set_instr_deps(&mut self, deps: &InstrDeps)125     fn set_instr_deps(&mut self, deps: &InstrDeps) {
126         let mut sched = BitMutView::new(&mut self.sched);
127 
128         sched.set_field(0..4, deps.delay);
129         sched.set_bit(4, deps.yld);
130         sched.set_field(5..8, deps.wr_bar().unwrap_or(7));
131         sched.set_field(8..11, deps.rd_bar().unwrap_or(7));
132         sched.set_field(11..17, deps.wt_bar_mask);
133         sched.set_field(17..21, deps.reuse_mask);
134     }
135 
set_reg(&mut self, range: Range<usize>, reg: RegRef)136     fn set_reg(&mut self, range: Range<usize>, reg: RegRef) {
137         assert!(range.len() == 8);
138         assert!(reg.file() == RegFile::GPR);
139         self.set_field(range, reg.base_idx());
140     }
141 
set_reg_src_ref(&mut self, range: Range<usize>, src_ref: SrcRef)142     fn set_reg_src_ref(&mut self, range: Range<usize>, src_ref: SrcRef) {
143         match src_ref {
144             SrcRef::Zero => self.set_reg(range, RegRef::zero(RegFile::GPR, 1)),
145             SrcRef::Reg(reg) => self.set_reg(range, reg),
146             _ => panic!("Not a register"),
147         }
148     }
149 
set_reg_src(&mut self, range: Range<usize>, src: Src)150     fn set_reg_src(&mut self, range: Range<usize>, src: Src) {
151         assert!(src.src_mod.is_none());
152         self.set_reg_src_ref(range, src.src_ref);
153     }
154 
set_reg_fmod_src( &mut self, range: Range<usize>, abs_bit: usize, neg_bit: usize, src: Src, )155     fn set_reg_fmod_src(
156         &mut self,
157         range: Range<usize>,
158         abs_bit: usize,
159         neg_bit: usize,
160         src: Src,
161     ) {
162         self.set_reg_src_ref(range, src.src_ref);
163         self.set_bit(abs_bit, src.src_mod.has_fabs());
164         self.set_bit(neg_bit, src.src_mod.has_fneg());
165     }
166 
set_reg_ineg_src( &mut self, range: Range<usize>, neg_bit: usize, src: Src, )167     fn set_reg_ineg_src(
168         &mut self,
169         range: Range<usize>,
170         neg_bit: usize,
171         src: Src,
172     ) {
173         self.set_reg_src_ref(range, src.src_ref);
174         self.set_bit(neg_bit, src.src_mod.is_ineg());
175     }
176 
set_reg_bnot_src( &mut self, range: Range<usize>, not_bit: usize, src: Src, )177     fn set_reg_bnot_src(
178         &mut self,
179         range: Range<usize>,
180         not_bit: usize,
181         src: Src,
182     ) {
183         self.set_reg_src_ref(range, src.src_ref);
184         self.set_bit(not_bit, src.src_mod.is_bnot());
185     }
186 
set_pred_dst(&mut self, range: Range<usize>, dst: Dst)187     fn set_pred_dst(&mut self, range: Range<usize>, dst: Dst) {
188         match dst {
189             Dst::None => {
190                 self.set_pred_reg(range, RegRef::zero(RegFile::Pred, 1));
191             }
192             Dst::Reg(reg) => self.set_pred_reg(range, reg),
193             _ => panic!("Not a register"),
194         }
195     }
196 
set_pred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src)197     fn set_pred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src) {
198         // The default for predicates is true
199         let true_reg = RegRef::new(RegFile::Pred, 7, 1);
200 
201         let (not, reg) = match src.src_ref {
202             SrcRef::True => (false, true_reg),
203             SrcRef::False => (true, true_reg),
204             SrcRef::Reg(reg) => (false, reg),
205             _ => panic!("Not a register"),
206         };
207         self.set_pred_reg(range, reg);
208         self.set_bit(not_bit, not ^ src.src_mod.is_bnot());
209     }
210 
set_dst(&mut self, dst: Dst)211     fn set_dst(&mut self, dst: Dst) {
212         let reg = match dst {
213             Dst::None => RegRef::zero(RegFile::GPR, 1),
214             Dst::Reg(reg) => reg,
215             _ => panic!("invalid dst {dst}"),
216         };
217         self.set_reg(0..8, reg);
218     }
219 
set_src_imm32(&mut self, range: Range<usize>, u: u32)220     fn set_src_imm32(&mut self, range: Range<usize>, u: u32) {
221         assert!(range.len() == 32);
222         self.set_field(range, u);
223     }
224 
set_src_imm_i20( &mut self, range: Range<usize>, sign_bit: usize, i: u32, )225     fn set_src_imm_i20(
226         &mut self,
227         range: Range<usize>,
228         sign_bit: usize,
229         i: u32,
230     ) {
231         assert!(range.len() == 19);
232         assert!((i & 0xfff80000) == 0 || (i & 0xfff80000) == 0xfff80000);
233 
234         self.set_field(range, i & 0x7ffff);
235         self.set_field(sign_bit..sign_bit + 1, (i & 0x80000) >> 19);
236     }
237 
set_src_imm_f20( &mut self, range: Range<usize>, sign_bit: usize, f: u32, )238     fn set_src_imm_f20(
239         &mut self,
240         range: Range<usize>,
241         sign_bit: usize,
242         f: u32,
243     ) {
244         assert!(range.len() == 19);
245         assert!((f & 0x00000fff) == 0);
246 
247         self.set_field(range, (f >> 12) & 0x7ffff);
248         self.set_field(sign_bit..sign_bit + 1, f >> 31);
249     }
250 
set_src_cb(&mut self, range: Range<usize>, cb: &CBufRef)251     fn set_src_cb(&mut self, range: Range<usize>, cb: &CBufRef) {
252         let mut v = BitMutView::new_subset(self, range);
253 
254         assert!(cb.offset % 4 == 0);
255 
256         v.set_field(0..14, cb.offset >> 2);
257         if let CBuf::Binding(idx) = cb.buf {
258             v.set_field(14..19, idx);
259         } else {
260             panic!("Must be a bound constant buffer");
261         }
262     }
263 
set_cb_fmod_src( &mut self, range: Range<usize>, abs_bit: usize, neg_bit: usize, src: Src, )264     fn set_cb_fmod_src(
265         &mut self,
266         range: Range<usize>,
267         abs_bit: usize,
268         neg_bit: usize,
269         src: Src,
270     ) {
271         if let SrcRef::CBuf(cb) = &src.src_ref {
272             self.set_src_cb(range, cb);
273         } else {
274             panic!("Not a CBuf source");
275         }
276 
277         self.set_bit(abs_bit, src.src_mod.has_fabs());
278         self.set_bit(neg_bit, src.src_mod.has_fneg());
279     }
280 
set_cb_ineg_src( &mut self, range: Range<usize>, neg_bit: usize, src: Src, )281     fn set_cb_ineg_src(
282         &mut self,
283         range: Range<usize>,
284         neg_bit: usize,
285         src: Src,
286     ) {
287         if let SrcRef::CBuf(cb) = &src.src_ref {
288             self.set_src_cb(range, cb);
289         } else {
290             panic!("Not a CBuf source");
291         }
292 
293         self.set_bit(neg_bit, src.src_mod.is_ineg());
294     }
295 
set_cb_bnot_src( &mut self, range: Range<usize>, not_bit: usize, src: Src, )296     fn set_cb_bnot_src(
297         &mut self,
298         range: Range<usize>,
299         not_bit: usize,
300         src: Src,
301     ) {
302         if let SrcRef::CBuf(cb) = &src.src_ref {
303             self.set_src_cb(range, cb);
304         } else {
305             panic!("Not a CBuf source");
306         }
307 
308         self.set_bit(not_bit, src.src_mod.is_bnot());
309     }
310 }
311 
312 //
313 // Legalization helpers
314 //
315 
316 pub trait SM50LegalizeBuildHelpers: LegalizeBuildHelpers {
copy_alu_src_if_fabs(&mut self, src: &mut Src, src_type: SrcType)317     fn copy_alu_src_if_fabs(&mut self, src: &mut Src, src_type: SrcType) {
318         if src.src_mod.has_fabs() {
319             self.copy_alu_src_and_lower_fmod(src, src_type);
320         }
321     }
322 
copy_alu_src_if_i20_overflow( &mut self, src: &mut Src, reg_file: RegFile, src_type: SrcType, )323     fn copy_alu_src_if_i20_overflow(
324         &mut self,
325         src: &mut Src,
326         reg_file: RegFile,
327         src_type: SrcType,
328     ) {
329         if src.as_imm_not_i20().is_some() {
330             self.copy_alu_src(src, reg_file, src_type);
331         }
332     }
333 
copy_alu_src_if_f20_overflow( &mut self, src: &mut Src, reg_file: RegFile, src_type: SrcType, )334     fn copy_alu_src_if_f20_overflow(
335         &mut self,
336         src: &mut Src,
337         reg_file: RegFile,
338         src_type: SrcType,
339     ) {
340         if src.as_imm_not_f20().is_some() {
341             self.copy_alu_src(src, reg_file, src_type);
342         }
343     }
344 }
345 
346 impl SM50LegalizeBuildHelpers for LegalizeBuilder<'_> {}
347 
348 /// Helper to legalize extended or external instructions
349 ///
350 /// These are instructions which reach out external units such as load/store
351 /// and texture ops.  They typically can't take anything but GPRs and are the
352 /// only types of instructions that support vectors.
353 ///
legalize_ext_instr(op: &mut impl SrcsAsSlice, _b: &mut LegalizeBuilder)354 fn legalize_ext_instr(op: &mut impl SrcsAsSlice, _b: &mut LegalizeBuilder) {
355     let src_types = op.src_types();
356     for (i, src) in op.srcs_as_mut_slice().iter_mut().enumerate() {
357         match src_types[i] {
358             SrcType::SSA => {
359                 assert!(src.as_ssa().is_some());
360             }
361             SrcType::GPR => {
362                 assert!(src_is_reg(src, RegFile::GPR));
363             }
364             SrcType::ALU
365             | SrcType::F16
366             | SrcType::F16v2
367             | SrcType::F32
368             | SrcType::F64
369             | SrcType::I32
370             | SrcType::B32 => {
371                 panic!("ALU srcs must be legalized explicitly");
372             }
373             SrcType::Pred => {
374                 panic!("Predicates must be legalized explicitly");
375             }
376             SrcType::Carry => {
377                 panic!("Carry values must be legalized explicitly");
378             }
379             SrcType::Bar => panic!("Barrier regs are Volta+"),
380         }
381     }
382 }
383 
384 //
385 // Implementations of SM50Op for each op we support on Maxwell/Pascal
386 //
387 
388 impl SM50Encoder<'_> {
set_rnd_mode(&mut self, range: Range<usize>, rnd_mode: FRndMode)389     fn set_rnd_mode(&mut self, range: Range<usize>, rnd_mode: FRndMode) {
390         assert!(range.len() == 2);
391         self.set_field(
392             range,
393             match rnd_mode {
394                 FRndMode::NearestEven => 0_u8,
395                 FRndMode::NegInf => 1_u8,
396                 FRndMode::PosInf => 2_u8,
397                 FRndMode::Zero => 3_u8,
398             },
399         );
400     }
401 }
402 
403 impl SM50Op for OpFAdd {
legalize(&mut self, b: &mut LegalizeBuilder)404     fn legalize(&mut self, b: &mut LegalizeBuilder) {
405         use RegFile::GPR;
406         let [src0, src1] = &mut self.srcs;
407         swap_srcs_if_not_reg(src0, src1, GPR);
408         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32);
409     }
410 
encode(&self, e: &mut SM50Encoder<'_>)411     fn encode(&self, e: &mut SM50Encoder<'_>) {
412         if let Some(imm32) = self.srcs[1].as_imm_not_f20() {
413             e.set_opcode(0x0800);
414             e.set_dst(self.dst);
415             e.set_reg_fmod_src(8..16, 54, 56, self.srcs[0]);
416             e.set_src_imm32(20..52, imm32);
417             e.set_bit(55, self.ftz);
418         } else {
419             match &self.srcs[1].src_ref {
420                 SrcRef::Zero | SrcRef::Reg(_) => {
421                     e.set_opcode(0x5c58);
422                     e.set_reg_fmod_src(20..28, 49, 45, self.srcs[1]);
423                 }
424                 SrcRef::Imm32(imm32) => {
425                     e.set_opcode(0x3858);
426                     e.set_src_imm_f20(20..39, 56, *imm32);
427                     assert!(self.srcs[1].src_mod.is_none());
428                 }
429                 SrcRef::CBuf(_) => {
430                     e.set_opcode(0x4c58);
431                     e.set_cb_fmod_src(20..39, 49, 45, self.srcs[1]);
432                 }
433                 src => panic!("Invalid fadd src1: {src}"),
434             }
435 
436             e.set_dst(self.dst);
437             e.set_reg_fmod_src(8..16, 46, 48, self.srcs[0]);
438 
439             e.set_rnd_mode(39..41, self.rnd_mode);
440             e.set_bit(44, self.ftz);
441             e.set_bit(50, self.saturate);
442         }
443     }
444 }
445 
446 impl SM50Op for OpFFma {
legalize(&mut self, b: &mut LegalizeBuilder)447     fn legalize(&mut self, b: &mut LegalizeBuilder) {
448         use RegFile::GPR;
449         let [src0, src1, src2] = &mut self.srcs;
450         b.copy_alu_src_if_fabs(src0, SrcType::F32);
451         b.copy_alu_src_if_fabs(src1, SrcType::F32);
452         b.copy_alu_src_if_fabs(src2, SrcType::F32);
453         swap_srcs_if_not_reg(src0, src1, GPR);
454         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32);
455         b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F32);
456         if src_is_reg(src1, GPR) {
457             b.copy_alu_src_if_imm(src2, GPR, SrcType::F32);
458         } else {
459             b.copy_alu_src_if_not_reg(src2, GPR, SrcType::F32);
460         }
461     }
462 
encode(&self, e: &mut SM50Encoder<'_>)463     fn encode(&self, e: &mut SM50Encoder<'_>) {
464         // ffma doesn't have any abs flags.
465         assert!(!self.srcs[0].src_mod.has_fabs());
466         assert!(!self.srcs[1].src_mod.has_fabs());
467         assert!(!self.srcs[2].src_mod.has_fabs());
468 
469         // There is one fneg bit shared by the two fmul sources
470         let fneg_fmul =
471             self.srcs[0].src_mod.has_fneg() ^ self.srcs[1].src_mod.has_fneg();
472         let fneg_src2 = self.srcs[2].src_mod.has_fneg();
473 
474         match &self.srcs[2].src_ref {
475             SrcRef::Zero | SrcRef::Reg(_) => {
476                 match &self.srcs[1].src_ref {
477                     SrcRef::Zero | SrcRef::Reg(_) => {
478                         e.set_opcode(0x5980);
479                         e.set_reg_src_ref(20..28, self.srcs[1].src_ref);
480                     }
481                     SrcRef::Imm32(imm32) => {
482                         e.set_opcode(0x3280);
483 
484                         // Technically, ffma also supports a 32-bit immediate,
485                         // but only in the case where the destination is the
486                         // same as src2.  We don't support that right now.
487                         e.set_src_imm_f20(20..39, 56, *imm32);
488                     }
489                     SrcRef::CBuf(cb) => {
490                         e.set_opcode(0x4980);
491                         e.set_src_cb(20..39, cb);
492                     }
493                     src => panic!("Invalid ffma src1: {src}"),
494                 }
495 
496                 e.set_reg_src_ref(39..47, self.srcs[2].src_ref);
497             }
498             SrcRef::CBuf(cb) => {
499                 e.set_opcode(0x5180);
500                 e.set_src_cb(20..39, cb);
501                 e.set_reg_src_ref(39..47, self.srcs[1].src_ref);
502             }
503             src => panic!("Invalid ffma src2: {src}"),
504         }
505 
506         e.set_dst(self.dst);
507         e.set_reg_src_ref(8..16, self.srcs[0].src_ref);
508 
509         e.set_bit(48, fneg_fmul);
510         e.set_bit(49, fneg_src2);
511         e.set_bit(50, self.saturate);
512         e.set_rnd_mode(51..53, self.rnd_mode);
513 
514         e.set_bit(53, self.ftz);
515         e.set_bit(54, self.dnz);
516     }
517 }
518 
519 impl SM50Op for OpFMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)520     fn legalize(&mut self, b: &mut LegalizeBuilder) {
521         use RegFile::GPR;
522         let [src0, src1] = &mut self.srcs;
523         swap_srcs_if_not_reg(src0, src1, GPR);
524         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32);
525         b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F32);
526     }
527 
encode(&self, e: &mut SM50Encoder<'_>)528     fn encode(&self, e: &mut SM50Encoder<'_>) {
529         match &self.srcs[1].src_ref {
530             SrcRef::Zero | SrcRef::Reg(_) => {
531                 e.set_opcode(0x5c60);
532                 e.set_reg_fmod_src(20..28, 49, 45, self.srcs[1]);
533             }
534             SrcRef::Imm32(imm32) => {
535                 e.set_opcode(0x3860);
536                 e.set_src_imm_f20(20..39, 56, *imm32);
537                 assert!(self.srcs[1].src_mod.is_none());
538             }
539             SrcRef::CBuf(_) => {
540                 e.set_opcode(0x4c60);
541                 e.set_cb_fmod_src(20..39, 49, 45, self.srcs[1]);
542             }
543             src => panic!("Invalid fmnmx src2: {src}"),
544         }
545 
546         e.set_reg_fmod_src(8..16, 46, 48, self.srcs[0]);
547         e.set_dst(self.dst);
548         e.set_pred_src(39..42, 42, self.min);
549         e.set_bit(44, self.ftz);
550     }
551 }
552 
553 impl SM50Op for OpFMul {
legalize(&mut self, b: &mut LegalizeBuilder)554     fn legalize(&mut self, b: &mut LegalizeBuilder) {
555         use RegFile::GPR;
556         let [src0, src1] = &mut self.srcs;
557         b.copy_alu_src_if_fabs(src0, SrcType::F32);
558         b.copy_alu_src_if_fabs(src1, SrcType::F32);
559         swap_srcs_if_not_reg(src0, src1, GPR);
560         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32);
561     }
562 
encode(&self, e: &mut SM50Encoder<'_>)563     fn encode(&self, e: &mut SM50Encoder<'_>) {
564         // fmul doesn't have any abs flags.
565         assert!(!self.srcs[0].src_mod.has_fabs());
566         assert!(!self.srcs[1].src_mod.has_fabs());
567 
568         // There is one fneg bit shared by both sources
569         let fneg =
570             self.srcs[0].src_mod.has_fneg() ^ self.srcs[1].src_mod.has_fneg();
571 
572         if let Some(mut imm32) = self.srcs[1].as_imm_not_f20() {
573             e.set_opcode(0x1e00);
574 
575             e.set_bit(53, self.ftz);
576             e.set_bit(54, self.dnz);
577             e.set_bit(55, self.saturate);
578 
579             if fneg {
580                 // Flip the immediate sign bit
581                 imm32 ^= 0x80000000;
582             }
583             e.set_src_imm32(20..52, imm32);
584         } else {
585             match &self.srcs[1].src_ref {
586                 SrcRef::Zero | SrcRef::Reg(_) => {
587                     e.set_opcode(0x5c68);
588                     e.set_reg_src(20..28, self.srcs[1]);
589                 }
590                 SrcRef::Imm32(imm32) => {
591                     e.set_opcode(0x3868);
592                     e.set_src_imm_f20(20..39, 56, *imm32);
593                 }
594                 SrcRef::CBuf(cbuf) => {
595                     e.set_opcode(0x4c68);
596                     e.set_src_cb(20..39, cbuf);
597                 }
598                 src => panic!("Invalid fmul src1: {src}"),
599             }
600 
601             e.set_rnd_mode(39..41, self.rnd_mode);
602             e.set_field(41..44, 0x0_u8); // TODO: PDIV
603             e.set_bit(44, self.ftz);
604             e.set_bit(45, self.dnz);
605             e.set_bit(48, fneg);
606             e.set_bit(50, self.saturate);
607         }
608 
609         e.set_reg_src_ref(8..16, self.srcs[0].src_ref);
610         e.set_dst(self.dst);
611     }
612 }
613 
614 impl SM50Op for OpRro {
legalize(&mut self, b: &mut LegalizeBuilder)615     fn legalize(&mut self, b: &mut LegalizeBuilder) {
616         use RegFile::GPR;
617         b.copy_alu_src_if_f20_overflow(&mut self.src, GPR, SrcType::F32);
618     }
619 
encode(&self, e: &mut SM50Encoder<'_>)620     fn encode(&self, e: &mut SM50Encoder<'_>) {
621         match &self.src.src_ref {
622             SrcRef::Zero | SrcRef::Reg(_) => {
623                 e.set_opcode(0x5c90);
624                 e.set_reg_fmod_src(20..28, 49, 45, self.src);
625             }
626             SrcRef::Imm32(imm32) => {
627                 e.set_opcode(0x3890);
628                 e.set_src_imm_f20(20..39, 56, *imm32);
629                 assert!(self.src.src_mod.is_none());
630             }
631             SrcRef::CBuf(_) => {
632                 e.set_opcode(0x4c90);
633                 e.set_cb_fmod_src(20..39, 49, 45, self.src);
634             }
635             src => panic!("Invalid rro src: {src}"),
636         }
637 
638         e.set_dst(self.dst);
639         e.set_field(
640             39..40,
641             match self.op {
642                 RroOp::SinCos => 0u8,
643                 RroOp::Exp2 => 1u8,
644             },
645         );
646     }
647 }
648 
649 impl SM50Op for OpMuFu {
legalize(&mut self, b: &mut LegalizeBuilder)650     fn legalize(&mut self, b: &mut LegalizeBuilder) {
651         b.copy_alu_src_if_not_reg(&mut self.src, RegFile::GPR, SrcType::GPR);
652     }
653 
encode(&self, e: &mut SM50Encoder<'_>)654     fn encode(&self, e: &mut SM50Encoder<'_>) {
655         e.set_opcode(0x5080);
656 
657         e.set_dst(self.dst);
658         e.set_reg_fmod_src(8..16, 46, 48, self.src);
659 
660         e.set_field(
661             20..24,
662             match self.op {
663                 MuFuOp::Cos => 0_u8,
664                 MuFuOp::Sin => 1_u8,
665                 MuFuOp::Exp2 => 2_u8,
666                 MuFuOp::Log2 => 3_u8,
667                 MuFuOp::Rcp => 4_u8,
668                 MuFuOp::Rsq => 5_u8,
669                 MuFuOp::Rcp64H => 6_u8,
670                 MuFuOp::Rsq64H => 7_u8,
671                 // SQRT is only on SM52 and later
672                 MuFuOp::Sqrt if e.sm.sm >= 52 => 8_u8,
673                 MuFuOp::Sqrt => panic!("MUFU.SQRT not supported on SM50"),
674                 MuFuOp::Tanh => panic!("MUFU.TANH not supported on SM50"),
675             },
676         );
677     }
678 }
679 
680 impl SM50Encoder<'_> {
set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp)681     fn set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp) {
682         assert!(range.len() == 4);
683         self.set_field(
684             range,
685             match op {
686                 FloatCmpOp::OrdLt => 0x01_u8,
687                 FloatCmpOp::OrdEq => 0x02_u8,
688                 FloatCmpOp::OrdLe => 0x03_u8,
689                 FloatCmpOp::OrdGt => 0x04_u8,
690                 FloatCmpOp::OrdNe => 0x05_u8,
691                 FloatCmpOp::OrdGe => 0x06_u8,
692                 FloatCmpOp::UnordLt => 0x09_u8,
693                 FloatCmpOp::UnordEq => 0x0a_u8,
694                 FloatCmpOp::UnordLe => 0x0b_u8,
695                 FloatCmpOp::UnordGt => 0x0c_u8,
696                 FloatCmpOp::UnordNe => 0x0d_u8,
697                 FloatCmpOp::UnordGe => 0x0e_u8,
698                 FloatCmpOp::IsNum => 0x07_u8,
699                 FloatCmpOp::IsNan => 0x08_u8,
700             },
701         );
702     }
703 
set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp)704     fn set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp) {
705         assert!(range.len() == 2);
706         self.set_field(
707             range,
708             match op {
709                 PredSetOp::And => 0_u8,
710                 PredSetOp::Or => 1_u8,
711                 PredSetOp::Xor => 2_u8,
712             },
713         );
714     }
715 
set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp)716     fn set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp) {
717         assert!(range.len() == 3);
718         self.set_field(
719             range,
720             match op {
721                 IntCmpOp::Eq => 2_u8,
722                 IntCmpOp::Ne => 5_u8,
723                 IntCmpOp::Lt => 1_u8,
724                 IntCmpOp::Le => 3_u8,
725                 IntCmpOp::Gt => 4_u8,
726                 IntCmpOp::Ge => 6_u8,
727             },
728         );
729     }
730 }
731 
732 impl SM50Op for OpFSet {
legalize(&mut self, b: &mut LegalizeBuilder)733     fn legalize(&mut self, b: &mut LegalizeBuilder) {
734         use RegFile::GPR;
735         let [src0, src1] = &mut self.srcs;
736         if swap_srcs_if_not_reg(src0, src1, GPR) {
737             self.cmp_op = self.cmp_op.flip();
738         }
739         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32);
740         b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F32);
741     }
742 
encode(&self, e: &mut SM50Encoder<'_>)743     fn encode(&self, e: &mut SM50Encoder<'_>) {
744         match &self.srcs[1].src_ref {
745             SrcRef::Zero | SrcRef::Reg(_) => {
746                 e.set_opcode(0x5800);
747                 e.set_reg_fmod_src(20..28, 44, 53, self.srcs[1]);
748             }
749             SrcRef::Imm32(imm32) => {
750                 e.set_opcode(0x3000);
751                 e.set_src_imm_f20(20..39, 56, *imm32);
752                 assert!(self.srcs[1].src_mod.is_none());
753             }
754             SrcRef::CBuf(_) => {
755                 e.set_opcode(0x4800);
756                 e.set_cb_fmod_src(20..39, 44, 6, self.srcs[1]);
757             }
758             src => panic!("Invalid fset src1: {src}"),
759         }
760 
761         e.set_reg_fmod_src(8..16, 54, 43, self.srcs[0]);
762         e.set_pred_src(39..42, 42, SrcRef::True.into());
763         e.set_float_cmp_op(48..52, self.cmp_op);
764         e.set_bit(52, true); // bool float
765         e.set_bit(55, self.ftz);
766         e.set_dst(self.dst);
767     }
768 }
769 
770 impl SM50Op for OpFSetP {
legalize(&mut self, b: &mut LegalizeBuilder)771     fn legalize(&mut self, b: &mut LegalizeBuilder) {
772         use RegFile::GPR;
773         let [src0, src1] = &mut self.srcs;
774         if swap_srcs_if_not_reg(src0, src1, GPR) {
775             self.cmp_op = self.cmp_op.flip();
776         }
777         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32);
778         b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F32);
779     }
780 
encode(&self, e: &mut SM50Encoder<'_>)781     fn encode(&self, e: &mut SM50Encoder<'_>) {
782         match &self.srcs[1].src_ref {
783             SrcRef::Zero | SrcRef::Reg(_) => {
784                 e.set_opcode(0x5bb0);
785                 e.set_reg_fmod_src(20..28, 44, 6, self.srcs[1]);
786             }
787             SrcRef::Imm32(imm32) => {
788                 e.set_opcode(0x36b0);
789                 e.set_src_imm_f20(20..39, 56, *imm32);
790                 assert!(self.srcs[1].src_mod.is_none());
791             }
792             SrcRef::CBuf(_) => {
793                 e.set_opcode(0x4bb0);
794                 e.set_cb_fmod_src(20..39, 44, 6, self.srcs[1]);
795             }
796             src => panic!("Invalid fsetp src1: {src}"),
797         }
798 
799         e.set_pred_dst(3..6, self.dst);
800         e.set_pred_dst(0..3, Dst::None); // dst1
801         e.set_reg_fmod_src(8..16, 7, 43, self.srcs[0]);
802         e.set_pred_src(39..42, 42, self.accum);
803         e.set_pred_set_op(45..47, self.set_op);
804         e.set_bit(47, self.ftz);
805         e.set_float_cmp_op(48..52, self.cmp_op);
806     }
807 }
808 
809 impl SM50Op for OpFSwzAdd {
legalize(&mut self, b: &mut LegalizeBuilder)810     fn legalize(&mut self, b: &mut LegalizeBuilder) {
811         use RegFile::GPR;
812         b.copy_alu_src_if_not_reg(&mut self.srcs[0], GPR, SrcType::GPR);
813         b.copy_alu_src_if_not_reg(&mut self.srcs[1], GPR, SrcType::GPR);
814     }
815 
encode(&self, e: &mut SM50Encoder<'_>)816     fn encode(&self, e: &mut SM50Encoder<'_>) {
817         e.set_opcode(0x50f8);
818 
819         e.set_dst(self.dst);
820         e.set_reg_src(8..16, self.srcs[0]);
821         e.set_reg_src(20..28, self.srcs[1]);
822 
823         e.set_field(
824             39..41,
825             match self.rnd_mode {
826                 FRndMode::NearestEven => 0u8,
827                 FRndMode::NegInf => 1u8,
828                 FRndMode::PosInf => 2u8,
829                 FRndMode::Zero => 3u8,
830             },
831         );
832 
833         for (i, op) in self.ops.iter().enumerate() {
834             e.set_field(
835                 28 + i * 2..28 + (i + 1) * 2,
836                 match op {
837                     FSwzAddOp::Add => 0u8,
838                     FSwzAddOp::SubLeft => 1u8,
839                     FSwzAddOp::SubRight => 2u8,
840                     FSwzAddOp::MoveLeft => 3u8,
841                 },
842             );
843         }
844 
845         e.set_bit(38, false); /* .NDV */
846         e.set_bit(44, self.ftz);
847         e.set_bit(47, false); /* dst.CC */
848     }
849 }
850 
851 impl SM50Op for OpDAdd {
legalize(&mut self, b: &mut LegalizeBuilder)852     fn legalize(&mut self, b: &mut LegalizeBuilder) {
853         use RegFile::GPR;
854         let [src0, src1] = &mut self.srcs;
855         swap_srcs_if_not_reg(src0, src1, GPR);
856         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F64);
857         b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F64);
858     }
859 
encode(&self, e: &mut SM50Encoder<'_>)860     fn encode(&self, e: &mut SM50Encoder<'_>) {
861         match &self.srcs[1].src_ref {
862             SrcRef::Zero | SrcRef::Reg(_) => {
863                 e.set_opcode(0x5c70);
864                 e.set_reg_fmod_src(20..28, 49, 45, self.srcs[1]);
865             }
866             SrcRef::Imm32(imm32) => {
867                 e.set_opcode(0x3870);
868                 e.set_src_imm_f20(20..39, 56, *imm32);
869                 assert!(self.srcs[1].src_mod.is_none());
870             }
871             SrcRef::CBuf(_) => {
872                 e.set_opcode(0x4c70);
873                 e.set_cb_fmod_src(20..39, 49, 45, self.srcs[1]);
874             }
875             src => panic!("Invalid dadd src1: {src}"),
876         }
877 
878         e.set_dst(self.dst);
879         e.set_reg_fmod_src(8..16, 46, 48, self.srcs[0]);
880         e.set_rnd_mode(39..41, self.rnd_mode);
881     }
882 }
883 
884 impl SM50Op for OpDFma {
legalize(&mut self, b: &mut LegalizeBuilder)885     fn legalize(&mut self, b: &mut LegalizeBuilder) {
886         use RegFile::GPR;
887         let [src0, src1, src2] = &mut self.srcs;
888         b.copy_alu_src_if_fabs(src0, SrcType::F64);
889         b.copy_alu_src_if_fabs(src1, SrcType::F64);
890         b.copy_alu_src_if_fabs(src2, SrcType::F64);
891         swap_srcs_if_not_reg(src0, src1, GPR);
892         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F64);
893         b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F64);
894         if src_is_reg(src1, GPR) {
895             b.copy_alu_src_if_imm(src2, GPR, SrcType::F64);
896         } else {
897             b.copy_alu_src_if_not_reg(src2, GPR, SrcType::F64);
898         }
899     }
900 
encode(&self, e: &mut SM50Encoder<'_>)901     fn encode(&self, e: &mut SM50Encoder<'_>) {
902         // dfma doesn't have any abs flags.
903         assert!(!self.srcs[0].src_mod.has_fabs());
904         assert!(!self.srcs[1].src_mod.has_fabs());
905         assert!(!self.srcs[2].src_mod.has_fabs());
906 
907         // There is one fneg bit shared by the two fmul sources
908         let fneg_fmul =
909             self.srcs[0].src_mod.has_fneg() ^ self.srcs[1].src_mod.has_fneg();
910         let fneg_src2 = self.srcs[2].src_mod.has_fneg();
911 
912         match &self.srcs[2].src_ref {
913             SrcRef::Zero | SrcRef::Reg(_) => {
914                 match &self.srcs[1].src_ref {
915                     SrcRef::Zero | SrcRef::Reg(_) => {
916                         e.set_opcode(0x5b70);
917                         e.set_reg_src_ref(20..28, self.srcs[1].src_ref);
918                     }
919                     SrcRef::Imm32(imm32) => {
920                         e.set_opcode(0x3670);
921                         e.set_src_imm_f20(20..39, 56, *imm32);
922                     }
923                     SrcRef::CBuf(cb) => {
924                         e.set_opcode(0x4b70);
925                         e.set_src_cb(20..39, cb);
926                     }
927                     src => panic!("Invalid dfma src1: {src}"),
928                 }
929 
930                 e.set_reg_src_ref(39..47, self.srcs[2].src_ref);
931             }
932             SrcRef::CBuf(cb) => {
933                 e.set_opcode(0x5370);
934                 e.set_src_cb(20..39, cb);
935                 e.set_reg_src_ref(39..47, self.srcs[1].src_ref);
936             }
937             src => panic!("Invalid dfma src2: {src}"),
938         }
939 
940         e.set_dst(self.dst);
941         e.set_reg_src_ref(8..16, self.srcs[0].src_ref);
942 
943         e.set_bit(48, fneg_fmul);
944         e.set_bit(49, fneg_src2);
945 
946         e.set_rnd_mode(50..52, self.rnd_mode);
947     }
948 }
949 
950 impl SM50Op for OpDMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)951     fn legalize(&mut self, b: &mut LegalizeBuilder) {
952         use RegFile::GPR;
953         let [src0, src1] = &mut self.srcs;
954         swap_srcs_if_not_reg(src0, src1, GPR);
955         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F64);
956         b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F64);
957     }
958 
encode(&self, e: &mut SM50Encoder<'_>)959     fn encode(&self, e: &mut SM50Encoder<'_>) {
960         match &self.srcs[1].src_ref {
961             SrcRef::Zero | SrcRef::Reg(_) => {
962                 e.set_opcode(0x5c50);
963                 e.set_reg_fmod_src(20..28, 49, 45, self.srcs[1]);
964             }
965             SrcRef::Imm32(imm32) => {
966                 e.set_opcode(0x3850);
967                 e.set_src_imm_f20(20..39, 56, *imm32);
968                 assert!(self.srcs[1].src_mod.is_none());
969             }
970             SrcRef::CBuf(_) => {
971                 e.set_opcode(0x4c50);
972                 e.set_cb_fmod_src(20..39, 49, 45, self.srcs[1]);
973             }
974             src => panic!("Invalid dmnmx src1: {src}"),
975         }
976 
977         e.set_reg_fmod_src(8..16, 46, 48, self.srcs[0]);
978         e.set_dst(self.dst);
979         e.set_pred_src(39..42, 42, self.min);
980     }
981 }
982 
983 impl SM50Op for OpDMul {
legalize(&mut self, b: &mut LegalizeBuilder)984     fn legalize(&mut self, b: &mut LegalizeBuilder) {
985         use RegFile::GPR;
986         let [src0, src1] = &mut self.srcs;
987         b.copy_alu_src_if_fabs(src0, SrcType::F64);
988         b.copy_alu_src_if_fabs(src1, SrcType::F64);
989         swap_srcs_if_not_reg(src0, src1, GPR);
990         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F64);
991         b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F64);
992     }
993 
encode(&self, e: &mut SM50Encoder<'_>)994     fn encode(&self, e: &mut SM50Encoder<'_>) {
995         assert!(!self.srcs[0].src_mod.has_fabs());
996         assert!(!self.srcs[1].src_mod.has_fabs());
997 
998         // There is one fneg bit shared by both sources
999         let fneg =
1000             self.srcs[0].src_mod.has_fneg() ^ self.srcs[1].src_mod.has_fneg();
1001 
1002         match &self.srcs[1].src_ref {
1003             SrcRef::Zero | SrcRef::Reg(_) => {
1004                 e.set_opcode(0x5c80);
1005                 e.set_reg_src_ref(20..28, self.srcs[1].src_ref);
1006             }
1007             SrcRef::Imm32(imm32) => {
1008                 e.set_opcode(0x3880);
1009                 e.set_src_imm_f20(20..39, 56, *imm32);
1010             }
1011             SrcRef::CBuf(cb) => {
1012                 e.set_opcode(0x4c80);
1013                 e.set_src_cb(20..39, cb);
1014             }
1015             src => panic!("Invalid dmul src1: {src}"),
1016         }
1017 
1018         e.set_dst(self.dst);
1019         e.set_reg_src_ref(8..16, self.srcs[0].src_ref);
1020 
1021         e.set_rnd_mode(39..41, self.rnd_mode);
1022         e.set_bit(48, fneg);
1023     }
1024 }
1025 
1026 impl SM50Op for OpDSetP {
legalize(&mut self, b: &mut LegalizeBuilder)1027     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1028         use RegFile::GPR;
1029         let [src0, src1] = &mut self.srcs;
1030         if swap_srcs_if_not_reg(src0, src1, GPR) {
1031             self.cmp_op = self.cmp_op.flip();
1032         }
1033         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F64);
1034         b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F64);
1035     }
1036 
encode(&self, e: &mut SM50Encoder<'_>)1037     fn encode(&self, e: &mut SM50Encoder<'_>) {
1038         match &self.srcs[1].src_ref {
1039             SrcRef::Zero | SrcRef::Reg(_) => {
1040                 e.set_opcode(0x5b80);
1041                 e.set_reg_fmod_src(20..28, 44, 6, self.srcs[1]);
1042             }
1043             SrcRef::Imm32(imm32) => {
1044                 e.set_opcode(0x3680);
1045                 e.set_src_imm_f20(20..39, 56, *imm32);
1046                 assert!(self.srcs[1].src_mod.is_none());
1047             }
1048             SrcRef::CBuf(_) => {
1049                 e.set_opcode(0x4b80);
1050                 e.set_reg_fmod_src(20..39, 44, 6, self.srcs[1]);
1051             }
1052             src => panic!("Invalid dsetp src1: {src}"),
1053         }
1054 
1055         e.set_pred_dst(3..6, self.dst);
1056         e.set_pred_dst(0..3, Dst::None); // dst1
1057         e.set_pred_src(39..42, 42, self.accum);
1058         e.set_pred_set_op(45..47, self.set_op);
1059         e.set_float_cmp_op(48..52, self.cmp_op);
1060         e.set_reg_fmod_src(8..16, 7, 43, self.srcs[0]);
1061     }
1062 }
1063 
1064 impl SM50Op for OpBfe {
legalize(&mut self, b: &mut LegalizeBuilder)1065     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1066         use RegFile::GPR;
1067         b.copy_alu_src_if_not_reg(&mut self.base, GPR, SrcType::ALU);
1068     }
1069 
encode(&self, e: &mut SM50Encoder<'_>)1070     fn encode(&self, e: &mut SM50Encoder<'_>) {
1071         match &self.range.src_ref {
1072             SrcRef::Zero | SrcRef::Reg(_) => {
1073                 e.set_opcode(0x5c00);
1074                 e.set_reg_src(20..28, self.range);
1075             }
1076             SrcRef::Imm32(imm32) => {
1077                 e.set_opcode(0x3800);
1078                 // Only the bottom 16 bits of the immediate matter
1079                 e.set_src_imm_i20(20..39, 56, *imm32 & 0xffff);
1080             }
1081             SrcRef::CBuf(cbuf) => {
1082                 e.set_opcode(0x4c00);
1083                 e.set_src_cb(20..39, cbuf);
1084             }
1085             src => panic!("Invalid bfe range: {src}"),
1086         }
1087 
1088         if self.signed {
1089             e.set_bit(48, true);
1090         }
1091 
1092         if self.reverse {
1093             e.set_bit(40, true);
1094         }
1095 
1096         e.set_reg_src(8..16, self.base);
1097         e.set_dst(self.dst);
1098     }
1099 }
1100 
1101 impl SM50Op for OpFlo {
legalize(&mut self, b: &mut LegalizeBuilder)1102     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1103         use RegFile::GPR;
1104         b.copy_alu_src_if_i20_overflow(&mut self.src, GPR, SrcType::ALU);
1105     }
1106 
encode(&self, e: &mut SM50Encoder<'_>)1107     fn encode(&self, e: &mut SM50Encoder<'_>) {
1108         match &self.src.src_ref {
1109             SrcRef::Zero | SrcRef::Reg(_) => {
1110                 e.set_opcode(0x5c30);
1111                 e.set_reg_src_ref(20..28, self.src.src_ref);
1112             }
1113             SrcRef::Imm32(imm32) => {
1114                 e.set_opcode(0x3830);
1115                 e.set_src_imm_i20(20..39, 56, *imm32);
1116                 assert!(self.src.src_mod.is_none());
1117             }
1118             SrcRef::CBuf(cb) => {
1119                 e.set_opcode(0x4c30);
1120                 e.set_src_cb(20..39, cb);
1121             }
1122             src => panic!("Invalid flo src: {src}"),
1123         }
1124 
1125         e.set_dst(self.dst);
1126         e.set_bit(40, self.src.src_mod.is_bnot());
1127         e.set_bit(48, self.signed);
1128         e.set_bit(41, self.return_shift_amount);
1129         e.set_bit(47, false); /* dst.CC */
1130     }
1131 }
1132 
1133 impl SM50Op for OpIAdd2 {
legalize(&mut self, b: &mut LegalizeBuilder)1134     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1135         use RegFile::GPR;
1136         let [src0, src1] = &mut self.srcs;
1137         swap_srcs_if_not_reg(src0, src1, GPR);
1138         if src0.src_mod.is_ineg() && src1.src_mod.is_ineg() {
1139             assert!(self.carry_out.is_none());
1140             let val = b.alloc_ssa(GPR, 1);
1141             b.push_op(OpIAdd2 {
1142                 dst: val.into(),
1143                 carry_out: Dst::None,
1144                 srcs: [Src::new_zero(), *src0],
1145             });
1146             *src0 = val.into();
1147         }
1148         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::I32);
1149         if !self.carry_out.is_none() {
1150             b.copy_alu_src_if_ineg_imm(src1, GPR, SrcType::I32);
1151         }
1152     }
1153 
encode(&self, e: &mut SM50Encoder<'_>)1154     fn encode(&self, e: &mut SM50Encoder<'_>) {
1155         // Hardware requires at least one of these be unmodified.  Otherwise, it
1156         // encodes as iadd.po which isn't what we want.
1157         assert!(
1158             self.srcs[0].src_mod.is_none() || self.srcs[1].src_mod.is_none()
1159         );
1160 
1161         let carry_out = match self.carry_out {
1162             Dst::Reg(reg) if reg.file() == RegFile::Carry => true,
1163             Dst::None => false,
1164             dst => panic!("Invalid iadd carry_out: {dst}"),
1165         };
1166 
1167         if let Some(imm32) = self.srcs[1].as_imm_not_i20() {
1168             e.set_opcode(0x1c00);
1169 
1170             e.set_dst(self.dst);
1171             e.set_reg_ineg_src(8..16, 56, self.srcs[0]);
1172             e.set_src_imm32(20..52, imm32);
1173 
1174             e.set_bit(52, carry_out);
1175             e.set_bit(53, false); // .X
1176         } else {
1177             match &self.srcs[1].src_ref {
1178                 SrcRef::Zero | SrcRef::Reg(_) => {
1179                     e.set_opcode(0x5c10);
1180                     e.set_reg_ineg_src(20..28, 48, self.srcs[1]);
1181                 }
1182                 SrcRef::Imm32(imm32) => {
1183                     e.set_opcode(0x3810);
1184                     e.set_src_imm_i20(20..39, 56, *imm32);
1185                     assert!(self.srcs[1].src_mod.is_none());
1186                 }
1187                 SrcRef::CBuf(_) => {
1188                     e.set_opcode(0x4c10);
1189                     e.set_cb_ineg_src(20..39, 48, self.srcs[1]);
1190                 }
1191                 src => panic!("Invalid iadd src1: {src}"),
1192             }
1193 
1194             e.set_dst(self.dst);
1195             e.set_reg_ineg_src(8..16, 49, self.srcs[0]);
1196 
1197             e.set_bit(43, false); // .X
1198             e.set_bit(47, carry_out);
1199         }
1200     }
1201 }
1202 
1203 impl SM50Op for OpIAdd2X {
legalize(&mut self, b: &mut LegalizeBuilder)1204     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1205         use RegFile::GPR;
1206         let [src0, src1] = &mut self.srcs;
1207         swap_srcs_if_not_reg(src0, src1, GPR);
1208         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::I32);
1209     }
1210 
encode(&self, e: &mut SM50Encoder<'_>)1211     fn encode(&self, e: &mut SM50Encoder<'_>) {
1212         match self.carry_in.src_ref {
1213             SrcRef::Reg(reg) if reg.file() == RegFile::Carry => (),
1214             src => panic!("Invalid iadd.x carry_in: {src}"),
1215         }
1216 
1217         let carry_out = match self.carry_out {
1218             Dst::Reg(reg) if reg.file() == RegFile::Carry => true,
1219             Dst::None => false,
1220             dst => panic!("Invalid iadd.x carry_out: {dst}"),
1221         };
1222 
1223         if let Some(imm32) = self.srcs[1].as_imm_not_i20() {
1224             e.set_opcode(0x1c00);
1225 
1226             e.set_dst(self.dst);
1227             e.set_reg_bnot_src(8..16, 56, self.srcs[0]);
1228             e.set_src_imm32(20..52, imm32);
1229 
1230             e.set_bit(52, carry_out);
1231             e.set_bit(53, true); // .X
1232         } else {
1233             match &self.srcs[1].src_ref {
1234                 SrcRef::Zero | SrcRef::Reg(_) => {
1235                     e.set_opcode(0x5c10);
1236                     e.set_reg_bnot_src(20..28, 48, self.srcs[1]);
1237                 }
1238                 SrcRef::Imm32(imm32) => {
1239                     e.set_opcode(0x3810);
1240                     e.set_src_imm_i20(20..39, 56, *imm32);
1241                     assert!(self.srcs[1].src_mod.is_none());
1242                 }
1243                 SrcRef::CBuf(_) => {
1244                     e.set_opcode(0x4c10);
1245                     e.set_cb_bnot_src(20..39, 48, self.srcs[1]);
1246                 }
1247                 src => panic!("Invalid iadd.x src1: {src}"),
1248             }
1249 
1250             e.set_dst(self.dst);
1251             e.set_reg_bnot_src(8..16, 49, self.srcs[0]);
1252 
1253             e.set_bit(43, true); // .X
1254             e.set_bit(47, carry_out);
1255         }
1256     }
1257 }
1258 
1259 impl SM50Op for OpIMad {
legalize(&mut self, b: &mut LegalizeBuilder)1260     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1261         use RegFile::GPR;
1262         let [src0, src1, src2] = &mut self.srcs;
1263         swap_srcs_if_not_reg(src0, src1, GPR);
1264         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU);
1265         b.copy_alu_src_if_i20_overflow(src1, GPR, SrcType::ALU);
1266         if src_is_reg(src1, GPR) {
1267             b.copy_alu_src_if_imm(src2, GPR, SrcType::ALU);
1268         } else {
1269             b.copy_alu_src_if_not_reg(src2, GPR, SrcType::ALU);
1270         }
1271     }
1272 
encode(&self, e: &mut SM50Encoder<'_>)1273     fn encode(&self, e: &mut SM50Encoder<'_>) {
1274         // There is one ineg bit shared by the two imul sources
1275         let ineg_imul =
1276             self.srcs[0].src_mod.is_ineg() ^ self.srcs[1].src_mod.is_ineg();
1277         let ineg_src2 = self.srcs[2].src_mod.is_ineg();
1278 
1279         match &self.srcs[2].src_ref {
1280             SrcRef::Zero | SrcRef::Reg(_) => {
1281                 match &self.srcs[1].src_ref {
1282                     SrcRef::Zero | SrcRef::Reg(_) => {
1283                         e.set_opcode(0x5a00);
1284                         e.set_reg_src_ref(20..28, self.srcs[1].src_ref);
1285                     }
1286                     SrcRef::Imm32(imm32) => {
1287                         e.set_opcode(0x3400);
1288                         e.set_src_imm_i20(20..39, 56, *imm32);
1289                     }
1290                     SrcRef::CBuf(cb) => {
1291                         e.set_opcode(0x4a00);
1292                         e.set_src_cb(20..39, cb);
1293                     }
1294                     src => panic!("Invalid imad src1: {src}"),
1295                 }
1296 
1297                 e.set_reg_src_ref(39..47, self.srcs[2].src_ref);
1298             }
1299             SrcRef::CBuf(cb) => {
1300                 e.set_opcode(0x5200);
1301                 e.set_src_cb(20..39, cb);
1302                 e.set_reg_src_ref(39..47, self.srcs[1].src_ref);
1303             }
1304             src => panic!("Invalid imad src2: {src}"),
1305         }
1306 
1307         e.set_dst(self.dst);
1308         e.set_reg_src(8..16, self.srcs[0]);
1309 
1310         e.set_bit(48, self.signed); // src0 signed
1311         e.set_bit(51, ineg_imul);
1312         e.set_bit(52, ineg_src2);
1313         e.set_bit(53, self.signed); // src1 signed
1314     }
1315 }
1316 
1317 impl SM50Op for OpIMul {
legalize(&mut self, b: &mut LegalizeBuilder)1318     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1319         use RegFile::GPR;
1320         let [src0, src1] = &mut self.srcs;
1321         if swap_srcs_if_not_reg(src0, src1, GPR) {
1322             self.signed.swap(0, 1);
1323         }
1324         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU);
1325     }
1326 
encode(&self, e: &mut SM50Encoder<'_>)1327     fn encode(&self, e: &mut SM50Encoder<'_>) {
1328         assert!(self.srcs[0].src_mod.is_none());
1329         assert!(self.srcs[1].src_mod.is_none());
1330 
1331         if let Some(i) = self.srcs[1].as_imm_not_i20() {
1332             e.set_opcode(0x1fc0);
1333             e.set_src_imm32(20..52, i);
1334 
1335             e.set_bit(53, self.high);
1336             e.set_bit(54, self.signed[0]);
1337             e.set_bit(55, self.signed[1]);
1338         } else {
1339             match &self.srcs[1].src_ref {
1340                 SrcRef::Zero | SrcRef::Reg(_) => {
1341                     e.set_opcode(0x5c38);
1342                     e.set_reg_src(20..28, self.srcs[1]);
1343                 }
1344                 SrcRef::Imm32(imm32) => {
1345                     e.set_opcode(0x3838);
1346                     e.set_src_imm_i20(20..39, 56, *imm32);
1347                 }
1348                 SrcRef::CBuf(cb) => {
1349                     e.set_opcode(0x4c38);
1350                     e.set_src_cb(20..39, cb);
1351                 }
1352                 src => panic!("Invalid imul src1: {src}"),
1353             };
1354 
1355             e.set_bit(39, self.high);
1356             e.set_bit(40, self.signed[0]);
1357             e.set_bit(41, self.signed[1]);
1358         }
1359 
1360         e.set_dst(self.dst);
1361         e.set_reg_src(8..16, self.srcs[0]);
1362     }
1363 }
1364 
1365 impl SM50Op for OpIMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)1366     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1367         use RegFile::GPR;
1368         let [src0, src1] = &mut self.srcs;
1369         swap_srcs_if_not_reg(src0, src1, GPR);
1370         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU);
1371         b.copy_alu_src_if_i20_overflow(src1, GPR, SrcType::ALU);
1372     }
1373 
encode(&self, e: &mut SM50Encoder<'_>)1374     fn encode(&self, e: &mut SM50Encoder<'_>) {
1375         match &self.srcs[1].src_ref {
1376             SrcRef::Zero | SrcRef::Reg(_) => {
1377                 e.set_opcode(0x5c20);
1378                 e.set_reg_src(20..28, self.srcs[1]);
1379             }
1380             SrcRef::Imm32(imm32) => {
1381                 e.set_opcode(0x3820);
1382                 e.set_src_imm_i20(20..39, 56, *imm32);
1383                 assert!(self.srcs[1].src_mod.is_none());
1384             }
1385             SrcRef::CBuf(cb) => {
1386                 e.set_opcode(0x4c20);
1387                 e.set_src_cb(20..39, cb);
1388             }
1389             src => panic!("Invalid imnmx src1: {src}"),
1390         }
1391 
1392         e.set_dst(self.dst);
1393         e.set_reg_src(8..16, self.srcs[0]);
1394         e.set_pred_src(39..42, 42, self.min);
1395         e.set_bit(47, false); // .CC
1396         e.set_bit(
1397             48,
1398             match self.cmp_type {
1399                 IntCmpType::U32 => false,
1400                 IntCmpType::I32 => true,
1401             },
1402         );
1403     }
1404 }
1405 
1406 impl SM50Op for OpISetP {
legalize(&mut self, b: &mut LegalizeBuilder)1407     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1408         use RegFile::GPR;
1409         let [src0, src1] = &mut self.srcs;
1410         if swap_srcs_if_not_reg(src0, src1, GPR) {
1411             self.cmp_op = self.cmp_op.flip();
1412         }
1413         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU);
1414         b.copy_alu_src_if_i20_overflow(src1, GPR, SrcType::ALU);
1415     }
1416 
encode(&self, e: &mut SM50Encoder<'_>)1417     fn encode(&self, e: &mut SM50Encoder<'_>) {
1418         match &self.srcs[1].src_ref {
1419             SrcRef::Zero | SrcRef::Reg(_) => {
1420                 e.set_opcode(0x5b60);
1421                 e.set_reg_src(20..28, self.srcs[1]);
1422             }
1423             SrcRef::Imm32(imm32) => {
1424                 e.set_opcode(0x3660);
1425                 e.set_src_imm_i20(20..39, 56, *imm32);
1426                 assert!(self.srcs[1].src_mod.is_none());
1427             }
1428             SrcRef::CBuf(cb) => {
1429                 e.set_opcode(0x4b60);
1430                 e.set_src_cb(20..39, cb);
1431             }
1432             src => panic!("Invalid isetp src1: {src}"),
1433         }
1434 
1435         e.set_pred_dst(0..3, Dst::None); // dst1
1436         e.set_pred_dst(3..6, self.dst);
1437         e.set_reg_src(8..16, self.srcs[0]);
1438         e.set_pred_src(39..42, 42, self.accum);
1439 
1440         // isetp.x seems to take the accumulator into account and we don't fully
1441         // understand how.  Until we do, disallow it.
1442         assert!(!self.ex);
1443         e.set_bit(43, self.ex);
1444         e.set_pred_set_op(45..47, self.set_op);
1445 
1446         e.set_field(
1447             48..49,
1448             match self.cmp_type {
1449                 IntCmpType::U32 => 0_u32,
1450                 IntCmpType::I32 => 1_u32,
1451             },
1452         );
1453         e.set_int_cmp_op(49..52, self.cmp_op);
1454     }
1455 }
1456 
1457 impl SM50Op for OpLop2 {
legalize(&mut self, b: &mut LegalizeBuilder)1458     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1459         use RegFile::GPR;
1460         let [src0, src1] = &mut self.srcs;
1461         match self.op {
1462             LogicOp2::PassB => {
1463                 *src0 = 0.into();
1464                 b.copy_alu_src_if_i20_overflow(src1, GPR, SrcType::ALU);
1465             }
1466             LogicOp2::And | LogicOp2::Or | LogicOp2::Xor => {
1467                 swap_srcs_if_not_reg(src0, src1, GPR);
1468                 b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU);
1469             }
1470         }
1471     }
1472 
encode(&self, e: &mut SM50Encoder<'_>)1473     fn encode(&self, e: &mut SM50Encoder<'_>) {
1474         if let Some(imm32) = self.srcs[1].as_imm_not_i20() {
1475             e.set_opcode(0x0400);
1476 
1477             e.set_dst(self.dst);
1478             e.set_reg_bnot_src(8..16, 55, self.srcs[0]);
1479             e.set_src_imm32(20..52, imm32);
1480             e.set_field(
1481                 53..55,
1482                 match self.op {
1483                     LogicOp2::And => 0_u8,
1484                     LogicOp2::Or => 1_u8,
1485                     LogicOp2::Xor => 2_u8,
1486                     LogicOp2::PassB => {
1487                         panic!("PASS_B is not supported for LOP32I");
1488                     }
1489                 },
1490             );
1491             e.set_bit(56, self.srcs[1].src_mod.is_bnot());
1492         } else {
1493             match &self.srcs[1].src_ref {
1494                 SrcRef::Zero | SrcRef::Reg(_) => {
1495                     e.set_opcode(0x5c40);
1496                     e.set_reg_bnot_src(20..28, 40, self.srcs[1]);
1497                 }
1498                 SrcRef::Imm32(imm32) => {
1499                     e.set_opcode(0x3840);
1500                     e.set_src_imm_i20(20..39, 56, *imm32);
1501                     assert!(self.srcs[1].src_mod.is_none());
1502                 }
1503                 SrcRef::CBuf(_) => {
1504                     e.set_opcode(0x4c40);
1505                     e.set_cb_bnot_src(20..39, 40, self.srcs[1]);
1506                 }
1507                 src => panic!("Invalid lop2 src1: {src}"),
1508             }
1509 
1510             e.set_dst(self.dst);
1511             e.set_reg_bnot_src(8..16, 39, self.srcs[0]);
1512 
1513             e.set_field(
1514                 41..43,
1515                 match self.op {
1516                     LogicOp2::And => 0_u8,
1517                     LogicOp2::Or => 1_u8,
1518                     LogicOp2::Xor => 2_u8,
1519                     LogicOp2::PassB => 3_u8,
1520                 },
1521             );
1522 
1523             e.set_pred_dst(48..51, Dst::None);
1524         }
1525     }
1526 }
1527 
1528 impl SM50Op for OpPopC {
legalize(&mut self, b: &mut LegalizeBuilder)1529     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1530         use RegFile::GPR;
1531         b.copy_alu_src_if_i20_overflow(&mut self.src, GPR, SrcType::ALU);
1532     }
1533 
encode(&self, e: &mut SM50Encoder<'_>)1534     fn encode(&self, e: &mut SM50Encoder<'_>) {
1535         match &self.src.src_ref {
1536             SrcRef::Zero | SrcRef::Reg(_) => {
1537                 e.set_opcode(0x5c08);
1538                 e.set_reg_bnot_src(20..28, 40, self.src);
1539             }
1540             SrcRef::Imm32(imm32) => {
1541                 e.set_opcode(0x3808);
1542                 e.set_src_imm_i20(20..39, 56, *imm32);
1543                 e.set_bit(40, self.src.src_mod.is_bnot());
1544             }
1545             SrcRef::CBuf(_) => {
1546                 e.set_opcode(0x4c08);
1547                 e.set_cb_bnot_src(20..39, 40, self.src);
1548             }
1549             src => panic!("Invalid popc src1: {src}"),
1550         }
1551 
1552         e.set_dst(self.dst);
1553     }
1554 }
1555 
1556 impl SM50Op for OpShf {
legalize(&mut self, b: &mut LegalizeBuilder)1557     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1558         use RegFile::GPR;
1559         b.copy_alu_src_if_not_reg(&mut self.high, GPR, SrcType::ALU);
1560         b.copy_alu_src_if_not_reg(&mut self.low, GPR, SrcType::GPR);
1561         b.copy_alu_src_if_not_reg_or_imm(&mut self.shift, GPR, SrcType::GPR);
1562         b.copy_alu_src_if_i20_overflow(&mut self.shift, GPR, SrcType::GPR);
1563     }
1564 
encode(&self, e: &mut SM50Encoder<'_>)1565     fn encode(&self, e: &mut SM50Encoder<'_>) {
1566         match &self.shift.src_ref {
1567             SrcRef::Zero | SrcRef::Reg(_) => {
1568                 e.set_opcode(if self.right { 0x5cf8 } else { 0x5bf8 });
1569                 e.set_reg_src(20..28, self.shift);
1570             }
1571             SrcRef::Imm32(imm32) => {
1572                 e.set_opcode(if self.right { 0x38f8 } else { 0x36f8 });
1573                 e.set_src_imm_i20(20..39, 56, *imm32);
1574                 assert!(self.shift.src_mod.is_none());
1575             }
1576             src => panic!("Invalid shf shift: {src}"),
1577         }
1578 
1579         e.set_field(
1580             37..39,
1581             match self.data_type {
1582                 IntType::I32 => 0_u8,
1583                 IntType::U32 => 0_u8,
1584                 IntType::U64 => 2_u8,
1585                 IntType::I64 => 3_u8,
1586                 _ => panic!("Invalid shift data type"),
1587             },
1588         );
1589 
1590         e.set_dst(self.dst);
1591         e.set_reg_src(8..16, self.low);
1592         e.set_reg_src(39..47, self.high);
1593 
1594         e.set_bit(47, false); // .CC
1595 
1596         // If we're shifting left, the HW will throw an illegal instrucction
1597         // encoding error if we set .high and will give us the high part anyway
1598         // if we don't.  This makes everything a bit more consistent.
1599         assert!(self.right || self.dst_high);
1600         e.set_bit(48, self.dst_high && self.right); // .high
1601 
1602         e.set_bit(49, false); // .X
1603         e.set_bit(50, self.wrap);
1604     }
1605 }
1606 
1607 impl SM50Op for OpShl {
legalize(&mut self, b: &mut LegalizeBuilder)1608     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1609         use RegFile::GPR;
1610         b.copy_alu_src_if_not_reg(&mut self.src, GPR, SrcType::GPR);
1611         b.copy_alu_src_if_i20_overflow(&mut self.shift, GPR, SrcType::ALU);
1612     }
1613 
encode(&self, e: &mut SM50Encoder<'_>)1614     fn encode(&self, e: &mut SM50Encoder<'_>) {
1615         e.set_dst(self.dst);
1616         e.set_reg_src(8..16, self.src);
1617         match &self.shift.src_ref {
1618             SrcRef::Zero | SrcRef::Reg(_) => {
1619                 e.set_opcode(0x5c48);
1620                 e.set_reg_src(20..28, self.shift);
1621             }
1622             SrcRef::Imm32(imm32) => {
1623                 e.set_opcode(0x3848);
1624                 e.set_src_imm_i20(20..39, 56, *imm32);
1625             }
1626             SrcRef::CBuf(cb) => {
1627                 e.set_opcode(0x4c48);
1628                 e.set_src_cb(20..39, cb);
1629             }
1630             src => panic!("Invalid shl shift: {src}"),
1631         }
1632 
1633         e.set_bit(39, self.wrap);
1634     }
1635 }
1636 
1637 impl SM50Op for OpShr {
legalize(&mut self, b: &mut LegalizeBuilder)1638     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1639         use RegFile::GPR;
1640         b.copy_alu_src_if_not_reg(&mut self.src, GPR, SrcType::GPR);
1641         b.copy_alu_src_if_i20_overflow(&mut self.shift, GPR, SrcType::ALU);
1642     }
1643 
encode(&self, e: &mut SM50Encoder<'_>)1644     fn encode(&self, e: &mut SM50Encoder<'_>) {
1645         e.set_dst(self.dst);
1646         e.set_reg_src(8..16, self.src);
1647         match &self.shift.src_ref {
1648             SrcRef::Zero | SrcRef::Reg(_) => {
1649                 e.set_opcode(0x5c28);
1650                 e.set_reg_src(20..28, self.shift);
1651             }
1652             SrcRef::Imm32(imm32) => {
1653                 e.set_opcode(0x3828);
1654                 e.set_src_imm_i20(20..39, 56, *imm32);
1655             }
1656             SrcRef::CBuf(cb) => {
1657                 e.set_opcode(0x4c28);
1658                 e.set_src_cb(20..39, cb);
1659             }
1660             src => panic!("Invalid shr shift: {src}"),
1661         }
1662 
1663         e.set_bit(39, self.wrap);
1664         e.set_bit(48, self.signed);
1665     }
1666 }
1667 
1668 impl SM50Op for OpF2F {
legalize(&mut self, b: &mut LegalizeBuilder)1669     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1670         use RegFile::GPR;
1671         b.copy_alu_src_if_f20_overflow(&mut self.src, GPR, SrcType::ALU);
1672     }
1673 
encode(&self, e: &mut SM50Encoder<'_>)1674     fn encode(&self, e: &mut SM50Encoder<'_>) {
1675         match &self.src.src_ref {
1676             SrcRef::Zero | SrcRef::Reg(_) => {
1677                 e.set_opcode(0x5ca8);
1678                 e.set_reg_fmod_src(20..28, 49, 45, self.src);
1679             }
1680             SrcRef::Imm32(imm32) => {
1681                 e.set_opcode(0x38a8);
1682                 e.set_src_imm_i20(20..39, 56, *imm32);
1683                 assert!(self.src.src_mod.is_none());
1684             }
1685             SrcRef::CBuf(_) => {
1686                 e.set_opcode(0x4ca8);
1687                 e.set_cb_fmod_src(20..39, 49, 45, self.src);
1688             }
1689             src => panic!("Invalid f2f src: {src}"),
1690         }
1691 
1692         // We can't span 32 bits
1693         assert!(
1694             (self.dst_type.bits() <= 32 && self.src_type.bits() <= 32)
1695                 || (self.dst_type.bits() >= 32 && self.src_type.bits() >= 32)
1696         );
1697         e.set_field(8..10, (self.dst_type.bits() / 8).ilog2());
1698         e.set_field(10..12, (self.src_type.bits() / 8).ilog2());
1699 
1700         e.set_rnd_mode(39..41, self.rnd_mode);
1701         e.set_bit(41, self.high);
1702         e.set_bit(42, self.integer_rnd);
1703         e.set_bit(44, self.ftz);
1704         e.set_bit(50, false); // saturate
1705 
1706         e.set_dst(self.dst);
1707     }
1708 }
1709 
1710 impl SM50Op for OpF2I {
legalize(&mut self, b: &mut LegalizeBuilder)1711     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1712         use RegFile::GPR;
1713         b.copy_alu_src_if_f20_overflow(&mut self.src, GPR, SrcType::ALU);
1714     }
1715 
encode(&self, e: &mut SM50Encoder<'_>)1716     fn encode(&self, e: &mut SM50Encoder<'_>) {
1717         match &self.src.src_ref {
1718             SrcRef::Zero | SrcRef::Reg(_) => {
1719                 e.set_opcode(0x5cb0);
1720                 e.set_reg_fmod_src(20..28, 49, 45, self.src);
1721             }
1722             SrcRef::Imm32(imm32) => {
1723                 e.set_opcode(0x38b0);
1724                 e.set_src_imm_f20(20..39, 56, *imm32);
1725                 assert!(self.src.src_mod.is_none());
1726             }
1727             SrcRef::CBuf(_) => {
1728                 e.set_opcode(0x4cb0);
1729                 e.set_cb_fmod_src(20..39, 49, 45, self.src);
1730             }
1731             src => panic!("Invalid f2i src: {src}"),
1732         }
1733 
1734         e.set_dst(self.dst);
1735 
1736         // We can't span 32 bits
1737         assert!(
1738             (self.dst_type.bits() <= 32 && self.src_type.bits() <= 32)
1739                 || (self.dst_type.bits() >= 32 && self.src_type.bits() >= 32)
1740         );
1741         e.set_field(8..10, (self.dst_type.bits() / 8).ilog2());
1742         e.set_field(10..12, (self.src_type.bits() / 8).ilog2());
1743         e.set_bit(12, self.dst_type.is_signed());
1744 
1745         e.set_rnd_mode(39..41, self.rnd_mode);
1746         e.set_bit(44, self.ftz);
1747         e.set_bit(47, false); // .CC
1748     }
1749 }
1750 
1751 impl SM50Op for OpI2F {
legalize(&mut self, b: &mut LegalizeBuilder)1752     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1753         use RegFile::GPR;
1754         b.copy_alu_src_if_i20_overflow(&mut self.src, GPR, SrcType::ALU);
1755     }
1756 
encode(&self, e: &mut SM50Encoder<'_>)1757     fn encode(&self, e: &mut SM50Encoder<'_>) {
1758         match &self.src.src_ref {
1759             SrcRef::Zero | SrcRef::Reg(_) => {
1760                 e.set_opcode(0x5cb8);
1761                 e.set_reg_ineg_src(20..28, 45, self.src);
1762             }
1763             SrcRef::Imm32(imm32) => {
1764                 e.set_opcode(0x38b8);
1765                 e.set_src_imm_i20(20..39, 56, *imm32);
1766                 assert!(self.src.src_mod.is_none());
1767             }
1768             SrcRef::CBuf(_) => {
1769                 e.set_opcode(0x4cb8);
1770                 e.set_cb_ineg_src(20..39, 45, self.src);
1771             }
1772             src => panic!("Invalid i2f src: {src}"),
1773         }
1774 
1775         e.set_dst(self.dst);
1776 
1777         // We can't span 32 bits
1778         assert!(
1779             (self.dst_type.bits() <= 32 && self.src_type.bits() <= 32)
1780                 || (self.dst_type.bits() >= 32 && self.src_type.bits() >= 32)
1781         );
1782         e.set_field(8..10, (self.dst_type.bits() / 8).ilog2());
1783         e.set_field(10..12, (self.src_type.bits() / 8).ilog2());
1784         e.set_bit(13, self.src_type.is_signed());
1785 
1786         e.set_rnd_mode(39..41, self.rnd_mode);
1787         e.set_field(41..43, 0_u8); // TODO: subop
1788         e.set_bit(49, false); // iabs
1789     }
1790 }
1791 
1792 impl SM50Op for OpI2I {
legalize(&mut self, b: &mut LegalizeBuilder)1793     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1794         use RegFile::GPR;
1795         b.copy_alu_src_if_i20_overflow(&mut self.src, GPR, SrcType::ALU);
1796     }
1797 
encode(&self, e: &mut SM50Encoder<'_>)1798     fn encode(&self, e: &mut SM50Encoder<'_>) {
1799         match &self.src.src_ref {
1800             SrcRef::Zero | SrcRef::Reg(_) => {
1801                 e.set_opcode(0x5ce0);
1802                 e.set_reg_src(20..28, self.src);
1803             }
1804             SrcRef::Imm32(imm32) => {
1805                 e.set_opcode(0x38e0);
1806                 e.set_src_imm_i20(20..39, 56, *imm32);
1807             }
1808             SrcRef::CBuf(cbuf) => {
1809                 e.set_opcode(0x4ce0);
1810                 e.set_src_cb(20..39, cbuf);
1811             }
1812             src => panic!("Invalid i2i src: {src}"),
1813         }
1814 
1815         e.set_dst(self.dst);
1816 
1817         // We can't span 32 bits
1818         assert!(
1819             (self.dst_type.bits() <= 32 && self.src_type.bits() <= 32)
1820                 || (self.dst_type.bits() >= 32 && self.src_type.bits() >= 32)
1821         );
1822         e.set_field(8..10, (self.dst_type.bits() / 8).ilog2());
1823         e.set_field(10..12, (self.src_type.bits() / 8).ilog2());
1824         e.set_bit(12, self.dst_type.is_signed());
1825         e.set_bit(13, self.src_type.is_signed());
1826 
1827         e.set_field(41..43, 0u8); // src.B1-3
1828         e.set_bit(45, self.neg);
1829         e.set_bit(47, false); // dst.CC
1830         e.set_bit(49, self.abs);
1831         e.set_bit(50, self.saturate);
1832     }
1833 }
1834 
1835 impl SM50Op for OpMov {
legalize(&mut self, _b: &mut LegalizeBuilder)1836     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1837         // Nothing to do
1838     }
1839 
encode(&self, e: &mut SM50Encoder<'_>)1840     fn encode(&self, e: &mut SM50Encoder<'_>) {
1841         match &self.src.src_ref {
1842             SrcRef::Zero | SrcRef::Reg(_) => {
1843                 e.set_opcode(0x5c98);
1844                 e.set_reg_src(20..28, self.src);
1845                 e.set_field(39..43, self.quad_lanes);
1846             }
1847             SrcRef::Imm32(imm32) => {
1848                 e.set_opcode(0x0100);
1849                 e.set_src_imm32(20..52, *imm32);
1850                 e.set_field(12..16, self.quad_lanes);
1851             }
1852             SrcRef::CBuf(cb) => {
1853                 e.set_opcode(0x4c98);
1854                 e.set_src_cb(20..39, cb);
1855                 e.set_field(39..43, self.quad_lanes);
1856             }
1857             src => panic!("Invalid mov src: {src}"),
1858         }
1859 
1860         e.set_dst(self.dst);
1861     }
1862 }
1863 
1864 impl SM50Op for OpPrmt {
legalize(&mut self, b: &mut LegalizeBuilder)1865     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1866         use RegFile::GPR;
1867         b.copy_alu_src_if_not_reg(&mut self.srcs[0], GPR, SrcType::GPR);
1868         b.copy_alu_src_if_not_reg(&mut self.srcs[1], GPR, SrcType::GPR);
1869     }
1870 
encode(&self, e: &mut SM50Encoder<'_>)1871     fn encode(&self, e: &mut SM50Encoder<'_>) {
1872         match &self.sel.src_ref {
1873             SrcRef::Zero | SrcRef::Reg(_) => {
1874                 e.set_opcode(0x5bc0);
1875                 e.set_reg_src(20..28, self.sel);
1876             }
1877             SrcRef::Imm32(imm32) => {
1878                 e.set_opcode(0x36c0);
1879                 // Only the bottom 16 bits matter
1880                 e.set_src_imm_i20(20..39, 56, *imm32 & 0xffff);
1881             }
1882             SrcRef::CBuf(cb) => {
1883                 e.set_opcode(0x4bc0);
1884                 e.set_src_cb(20..39, cb);
1885             }
1886             src => panic!("Invalid prmt selector: {src}"),
1887         }
1888 
1889         e.set_dst(self.dst);
1890         e.set_reg_src(8..16, self.srcs[0]);
1891         e.set_reg_src(39..47, self.srcs[1]);
1892         e.set_field(
1893             48..51,
1894             match self.mode {
1895                 PrmtMode::Index => 0_u8,
1896                 PrmtMode::Forward4Extract => 1_u8,
1897                 PrmtMode::Backward4Extract => 2_u8,
1898                 PrmtMode::Replicate8 => 3_u8,
1899                 PrmtMode::EdgeClampLeft => 4_u8,
1900                 PrmtMode::EdgeClampRight => 5_u8,
1901                 PrmtMode::Replicate16 => 6_u8,
1902             },
1903         );
1904     }
1905 }
1906 
1907 impl SM50Op for OpSel {
legalize(&mut self, b: &mut LegalizeBuilder)1908     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1909         use RegFile::GPR;
1910         let [src0, src1] = &mut self.srcs;
1911         if swap_srcs_if_not_reg(src0, src1, GPR) {
1912             self.cond = self.cond.bnot();
1913         }
1914         b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU);
1915         b.copy_alu_src_if_i20_overflow(src1, GPR, SrcType::ALU);
1916     }
1917 
encode(&self, e: &mut SM50Encoder<'_>)1918     fn encode(&self, e: &mut SM50Encoder<'_>) {
1919         match &self.srcs[1].src_ref {
1920             SrcRef::Zero | SrcRef::Reg(_) => {
1921                 e.set_opcode(0x5ca0);
1922                 e.set_reg_src_ref(20..28, self.srcs[1].src_ref);
1923             }
1924             SrcRef::Imm32(imm32) => {
1925                 e.set_opcode(0x38a0);
1926                 e.set_src_imm_i20(20..39, 56, *imm32);
1927             }
1928             SrcRef::CBuf(cbuf) => {
1929                 e.set_opcode(0x4ca0);
1930                 e.set_src_cb(20..39, cbuf);
1931             }
1932             src => panic!("Invalid sel src1: {src}"),
1933         }
1934 
1935         e.set_dst(self.dst);
1936         e.set_reg_src(8..16, self.srcs[0]);
1937         e.set_pred_src(39..42, 42, self.cond);
1938     }
1939 }
1940 
1941 impl SM50Op for OpShfl {
legalize(&mut self, b: &mut LegalizeBuilder)1942     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1943         use RegFile::GPR;
1944         b.copy_alu_src_if_not_reg(&mut self.src, GPR, SrcType::GPR);
1945         b.copy_alu_src_if_not_reg_or_imm(&mut self.lane, GPR, SrcType::ALU);
1946         b.copy_alu_src_if_not_reg_or_imm(&mut self.c, GPR, SrcType::ALU);
1947     }
1948 
encode(&self, e: &mut SM50Encoder<'_>)1949     fn encode(&self, e: &mut SM50Encoder<'_>) {
1950         e.set_opcode(0xef10);
1951 
1952         e.set_dst(self.dst);
1953         e.set_pred_dst(48..51, self.in_bounds);
1954         e.set_reg_src(8..16, self.src);
1955 
1956         match &self.lane.src_ref {
1957             SrcRef::Zero | SrcRef::Reg(_) => {
1958                 e.set_bit(28, false);
1959                 e.set_reg_src(20..28, self.lane);
1960             }
1961             SrcRef::Imm32(imm32) => {
1962                 e.set_bit(28, true);
1963                 e.set_field(20..25, *imm32 & 0x1f);
1964             }
1965             src => panic!("Invalid shfl lane: {src}"),
1966         }
1967         match &self.c.src_ref {
1968             SrcRef::Zero | SrcRef::Reg(_) => {
1969                 e.set_bit(29, false);
1970                 e.set_reg_src(39..47, self.c);
1971             }
1972             SrcRef::Imm32(imm32) => {
1973                 e.set_bit(29, true);
1974                 e.set_field(34..47, *imm32 & 0x1f1f);
1975             }
1976             src => panic!("Invalid shfl c: {src}"),
1977         }
1978 
1979         e.set_field(
1980             30..32,
1981             match self.op {
1982                 ShflOp::Idx => 0u8,
1983                 ShflOp::Up => 1u8,
1984                 ShflOp::Down => 2u8,
1985                 ShflOp::Bfly => 3u8,
1986             },
1987         );
1988     }
1989 }
1990 
1991 impl SM50Op for OpPSetP {
legalize(&mut self, _b: &mut LegalizeBuilder)1992     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1993         // Nothing to do
1994     }
1995 
encode(&self, e: &mut SM50Encoder<'_>)1996     fn encode(&self, e: &mut SM50Encoder<'_>) {
1997         e.set_opcode(0x5090);
1998 
1999         e.set_pred_dst(3..6, self.dsts[0]);
2000         e.set_pred_dst(0..3, self.dsts[1]);
2001 
2002         e.set_pred_src(12..15, 15, self.srcs[0]);
2003         e.set_pred_src(29..32, 32, self.srcs[1]);
2004         e.set_pred_src(39..42, 42, self.srcs[2]);
2005 
2006         e.set_pred_set_op(24..26, self.ops[0]);
2007         e.set_pred_set_op(45..47, self.ops[1]);
2008     }
2009 }
2010 
2011 impl SM50Encoder<'_> {
set_tex_dim(&mut self, range: Range<usize>, dim: TexDim)2012     fn set_tex_dim(&mut self, range: Range<usize>, dim: TexDim) {
2013         assert!(range.len() == 3);
2014         self.set_field(
2015             range,
2016             match dim {
2017                 TexDim::_1D => 0_u8,
2018                 TexDim::Array1D => 1_u8,
2019                 TexDim::_2D => 2_u8,
2020                 TexDim::Array2D => 3_u8,
2021                 TexDim::_3D => 4_u8,
2022                 TexDim::Cube => 6_u8,
2023                 TexDim::ArrayCube => 7_u8,
2024             },
2025         );
2026     }
2027 
set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode)2028     fn set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode) {
2029         assert!(range.len() == 2);
2030         self.set_field(
2031             range,
2032             match lod_mode {
2033                 TexLodMode::Auto => 0_u8,
2034                 TexLodMode::Zero => 1_u8,
2035                 TexLodMode::Bias => 2_u8,
2036                 TexLodMode::Lod => 3_u8,
2037                 _ => panic!("Unknown LOD mode"),
2038             },
2039         );
2040     }
2041 }
2042 
2043 impl SM50Op for OpTex {
legalize(&mut self, b: &mut LegalizeBuilder)2044     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2045         legalize_ext_instr(self, b);
2046     }
2047 
encode(&self, e: &mut SM50Encoder<'_>)2048     fn encode(&self, e: &mut SM50Encoder<'_>) {
2049         e.set_opcode(0xdeb8);
2050 
2051         e.set_dst(self.dsts[0]);
2052         assert!(self.dsts[1].is_none());
2053         assert!(self.fault.is_none());
2054         e.set_reg_src(8..16, self.srcs[0]);
2055         e.set_reg_src(20..28, self.srcs[1]);
2056 
2057         e.set_tex_dim(28..31, self.dim);
2058         e.set_field(31..35, self.mask);
2059         e.set_bit(35, false); // ToDo: NDV
2060         e.set_bit(36, self.offset);
2061         e.set_tex_lod_mode(37..39, self.lod_mode);
2062         e.set_bit(49, false); // TODO: .NODEP
2063         e.set_bit(50, self.z_cmpr);
2064     }
2065 }
2066 
2067 impl SM50Op for OpTld {
legalize(&mut self, b: &mut LegalizeBuilder)2068     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2069         legalize_ext_instr(self, b);
2070     }
2071 
encode(&self, e: &mut SM50Encoder<'_>)2072     fn encode(&self, e: &mut SM50Encoder<'_>) {
2073         e.set_opcode(0xdd38);
2074 
2075         e.set_dst(self.dsts[0]);
2076         assert!(self.dsts[1].is_none());
2077         assert!(self.fault.is_none());
2078         e.set_reg_src(8..16, self.srcs[0]);
2079         e.set_reg_src(20..28, self.srcs[1]);
2080 
2081         e.set_tex_dim(28..31, self.dim);
2082         e.set_field(31..35, self.mask);
2083         e.set_bit(35, self.offset);
2084         e.set_bit(49, false); // TODO: .NODEP
2085         e.set_bit(50, self.is_ms);
2086 
2087         assert!(
2088             self.lod_mode == TexLodMode::Zero
2089                 || self.lod_mode == TexLodMode::Lod
2090         );
2091         e.set_bit(55, self.lod_mode == TexLodMode::Lod);
2092     }
2093 }
2094 
2095 impl SM50Op for OpTld4 {
legalize(&mut self, b: &mut LegalizeBuilder)2096     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2097         legalize_ext_instr(self, b);
2098     }
2099 
encode(&self, e: &mut SM50Encoder<'_>)2100     fn encode(&self, e: &mut SM50Encoder<'_>) {
2101         e.set_opcode(0xdef8);
2102 
2103         e.set_dst(self.dsts[0]);
2104         assert!(self.dsts[1].is_none());
2105         assert!(self.fault.is_none());
2106         e.set_reg_src(8..16, self.srcs[0]);
2107         e.set_reg_src(20..28, self.srcs[1]);
2108 
2109         e.set_tex_dim(28..31, self.dim);
2110         e.set_field(31..35, self.mask);
2111         e.set_bit(35, false); // ToDo: NDV
2112         e.set_field(
2113             36..38,
2114             match self.offset_mode {
2115                 Tld4OffsetMode::None => 0_u8,
2116                 Tld4OffsetMode::AddOffI => 1_u8,
2117                 Tld4OffsetMode::PerPx => 2_u8,
2118             },
2119         );
2120         e.set_field(38..40, self.comp);
2121         e.set_bit(49, false); // TODO: .NODEP
2122         e.set_bit(50, self.z_cmpr);
2123     }
2124 }
2125 
2126 impl SM50Op for OpTmml {
legalize(&mut self, b: &mut LegalizeBuilder)2127     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2128         legalize_ext_instr(self, b);
2129     }
2130 
encode(&self, e: &mut SM50Encoder<'_>)2131     fn encode(&self, e: &mut SM50Encoder<'_>) {
2132         e.set_opcode(0xdf60);
2133 
2134         e.set_dst(self.dsts[0]);
2135         assert!(self.dsts[1].is_none());
2136         e.set_reg_src(8..16, self.srcs[0]);
2137         e.set_reg_src(20..28, self.srcs[1]);
2138 
2139         e.set_tex_dim(28..31, self.dim);
2140         e.set_field(31..35, self.mask);
2141         e.set_bit(35, false); // ToDo: NDV
2142         e.set_bit(49, false); // TODO: .NODEP
2143     }
2144 }
2145 
2146 impl SM50Op for OpTxd {
legalize(&mut self, b: &mut LegalizeBuilder)2147     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2148         legalize_ext_instr(self, b);
2149     }
2150 
encode(&self, e: &mut SM50Encoder<'_>)2151     fn encode(&self, e: &mut SM50Encoder<'_>) {
2152         e.set_opcode(0xde78);
2153 
2154         e.set_dst(self.dsts[0]);
2155         assert!(self.dsts[1].is_none());
2156         assert!(self.fault.is_none());
2157         e.set_reg_src(8..16, self.srcs[0]);
2158         e.set_reg_src(20..28, self.srcs[1]);
2159 
2160         e.set_tex_dim(28..31, self.dim);
2161         e.set_field(31..35, self.mask);
2162         e.set_bit(35, self.offset);
2163         e.set_bit(49, false); // TODO: .NODEP
2164     }
2165 }
2166 
2167 impl SM50Op for OpTxq {
legalize(&mut self, b: &mut LegalizeBuilder)2168     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2169         legalize_ext_instr(self, b);
2170     }
2171 
encode(&self, e: &mut SM50Encoder<'_>)2172     fn encode(&self, e: &mut SM50Encoder<'_>) {
2173         e.set_opcode(0xdf50);
2174 
2175         e.set_dst(self.dsts[0]);
2176         assert!(self.dsts[1].is_none());
2177         e.set_reg_src(8..16, self.src);
2178 
2179         e.set_field(
2180             22..28,
2181             match self.query {
2182                 TexQuery::Dimension => 1_u8,
2183                 TexQuery::TextureType => 2_u8,
2184                 TexQuery::SamplerPos => 5_u8,
2185                 // TexQuery::Filter => 0x10_u8,
2186                 // TexQuery::Lod => 0x12_u8,
2187                 // TexQuery::Wrap => 0x14_u8,
2188                 // TexQuery::BorderColour => 0x16,
2189             },
2190         );
2191         e.set_field(31..35, self.mask);
2192         e.set_bit(49, false); // TODO: .NODEP
2193     }
2194 }
2195 
2196 impl SM50Encoder<'_> {
set_mem_type(&mut self, range: Range<usize>, mem_type: MemType)2197     fn set_mem_type(&mut self, range: Range<usize>, mem_type: MemType) {
2198         assert!(range.len() == 3);
2199         self.set_field(
2200             range,
2201             match mem_type {
2202                 MemType::U8 => 0_u8,
2203                 MemType::I8 => 1_u8,
2204                 MemType::U16 => 2_u8,
2205                 MemType::I16 => 3_u8,
2206                 MemType::B32 => 4_u8,
2207                 MemType::B64 => 5_u8,
2208                 MemType::B128 => 6_u8,
2209             },
2210         );
2211     }
2212 
set_mem_order(&mut self, _order: &MemOrder)2213     fn set_mem_order(&mut self, _order: &MemOrder) {
2214         // TODO: order and scope aren't present before SM70, what should we do?
2215     }
2216 
set_mem_access(&mut self, access: &MemAccess)2217     fn set_mem_access(&mut self, access: &MemAccess) {
2218         self.set_field(
2219             45..46,
2220             match access.space.addr_type() {
2221                 MemAddrType::A32 => 0_u8,
2222                 MemAddrType::A64 => 1_u8,
2223             },
2224         );
2225         self.set_mem_type(48..51, access.mem_type);
2226         self.set_mem_order(&access.order);
2227     }
2228 
set_image_dim(&mut self, range: Range<usize>, dim: ImageDim)2229     fn set_image_dim(&mut self, range: Range<usize>, dim: ImageDim) {
2230         assert!(range.len() == 3);
2231         self.set_field(
2232             range,
2233             match dim {
2234                 ImageDim::_1D => 0_u8,
2235                 ImageDim::_1DBuffer => 1_u8,
2236                 ImageDim::_1DArray => 2_u8,
2237                 ImageDim::_2D => 3_u8,
2238                 ImageDim::_2DArray => 4_u8,
2239                 ImageDim::_3D => 5_u8,
2240             },
2241         );
2242     }
2243 }
2244 
2245 impl SM50Op for OpSuLd {
legalize(&mut self, b: &mut LegalizeBuilder)2246     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2247         legalize_ext_instr(self, b);
2248     }
2249 
encode(&self, e: &mut SM50Encoder<'_>)2250     fn encode(&self, e: &mut SM50Encoder<'_>) {
2251         e.set_opcode(0xeb00);
2252 
2253         assert!(self.mask == 0x1 || self.mask == 0x3 || self.mask == 0xf);
2254         e.set_field(20..24, self.mask);
2255         e.set_image_dim(33..36, self.image_dim);
2256 
2257         // mem_eviction_policy not a thing for sm < 70
2258 
2259         let scope = match self.mem_order {
2260             MemOrder::Constant => MemScope::System,
2261             MemOrder::Weak => MemScope::CTA,
2262             MemOrder::Strong(s) => s,
2263         };
2264 
2265         e.set_field(
2266             24..26,
2267             match scope {
2268                 MemScope::CTA => 0_u8,
2269                 /* SM => 1_u8, */
2270                 MemScope::GPU => 2_u8,
2271                 MemScope::System => 3_u8,
2272             },
2273         );
2274 
2275         e.set_dst(self.dst);
2276 
2277         e.set_reg_src(8..16, self.coord);
2278         e.set_reg_src(39..47, self.handle);
2279     }
2280 }
2281 
2282 impl SM50Op for OpSuSt {
legalize(&mut self, b: &mut LegalizeBuilder)2283     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2284         legalize_ext_instr(self, b);
2285     }
2286 
encode(&self, e: &mut SM50Encoder<'_>)2287     fn encode(&self, e: &mut SM50Encoder<'_>) {
2288         e.set_opcode(0xeb20);
2289 
2290         e.set_reg_src(8..16, self.coord);
2291         e.set_reg_src(0..8, self.data);
2292         e.set_reg_src(39..47, self.handle);
2293 
2294         e.set_image_dim(33..36, self.image_dim);
2295         e.set_mem_order(&self.mem_order);
2296 
2297         assert!(self.mask == 0x1 || self.mask == 0x3 || self.mask == 0xf);
2298         e.set_field(20..24, self.mask);
2299     }
2300 }
2301 
2302 impl SM50Encoder<'_> {
set_atom_op(&mut self, range: Range<usize>, atom_op: AtomOp)2303     fn set_atom_op(&mut self, range: Range<usize>, atom_op: AtomOp) {
2304         self.set_field(
2305             range,
2306             match atom_op {
2307                 AtomOp::Add => 0_u8,
2308                 AtomOp::Min => 1_u8,
2309                 AtomOp::Max => 2_u8,
2310                 AtomOp::Inc => 3_u8,
2311                 AtomOp::Dec => 4_u8,
2312                 AtomOp::And => 5_u8,
2313                 AtomOp::Or => 6_u8,
2314                 AtomOp::Xor => 7_u8,
2315                 AtomOp::Exch => 8_u8,
2316                 AtomOp::CmpExch(_) => panic!("CmpExch is a separate opcode"),
2317             },
2318         );
2319     }
2320 }
2321 
2322 impl SM50Op for OpSuAtom {
legalize(&mut self, b: &mut LegalizeBuilder)2323     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2324         legalize_ext_instr(self, b);
2325     }
2326 
encode(&self, e: &mut SM50Encoder<'_>)2327     fn encode(&self, e: &mut SM50Encoder<'_>) {
2328         if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2329             e.set_opcode(0xeac0);
2330             assert!(cmp_src == AtomCmpSrc::Packed);
2331         } else {
2332             e.set_opcode(0xea60);
2333             e.set_atom_op(29..33, self.atom_op);
2334         }
2335 
2336         let atom_type: u8 = match self.atom_type {
2337             AtomType::U32 => 0,
2338             AtomType::I32 => 1,
2339             AtomType::F32 => 3,
2340             AtomType::U64 => 2,
2341             AtomType::I64 => 5,
2342             _ => panic!("Unsupported atom type {}", self.atom_type),
2343         };
2344 
2345         e.set_image_dim(33..36, self.image_dim);
2346         e.set_field(36..39, atom_type);
2347 
2348         // The hardware requires that we set .D on atomics.  This is safe to do
2349         // in in the emit code because it only affects format conversion, not
2350         // surface coordinates and atomics are required to be performed with
2351         // image formats that that exactly match the shader data type.  So, for
2352         // instance, a uint32_t atomic has to happen on an R32_UINT or R32_SINT
2353         // image.
2354         e.set_bit(52, true); // .D
2355 
2356         e.set_dst(self.dst);
2357 
2358         e.set_reg_src(20..28, self.data);
2359         e.set_reg_src(8..16, self.coord);
2360         e.set_reg_src(39..47, self.handle);
2361     }
2362 }
2363 
2364 impl SM50Op for OpLd {
legalize(&mut self, b: &mut LegalizeBuilder)2365     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2366         legalize_ext_instr(self, b);
2367     }
2368 
encode(&self, e: &mut SM50Encoder<'_>)2369     fn encode(&self, e: &mut SM50Encoder<'_>) {
2370         e.set_opcode(match self.access.space {
2371             MemSpace::Global(_) => 0xeed0,
2372             MemSpace::Local => 0xef40,
2373             MemSpace::Shared => 0xef48,
2374         });
2375 
2376         e.set_dst(self.dst);
2377         e.set_reg_src(8..16, self.addr);
2378         e.set_field(20..44, self.offset);
2379 
2380         e.set_mem_access(&self.access);
2381     }
2382 }
2383 
2384 impl SM50Op for OpLdc {
legalize(&mut self, b: &mut LegalizeBuilder)2385     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2386         use RegFile::GPR;
2387         b.copy_alu_src_if_not_reg(&mut self.offset, GPR, SrcType::GPR);
2388     }
2389 
encode(&self, e: &mut SM50Encoder<'_>)2390     fn encode(&self, e: &mut SM50Encoder<'_>) {
2391         assert!(self.cb.src_mod.is_none());
2392         let SrcRef::CBuf(cb) = &self.cb.src_ref else {
2393             panic!("Not a CBuf source");
2394         };
2395         let CBuf::Binding(cb_idx) = cb.buf else {
2396             panic!("Must be a bound constant buffer");
2397         };
2398 
2399         e.set_opcode(0xef90);
2400 
2401         e.set_dst(self.dst);
2402         e.set_reg_src(8..16, self.offset);
2403         e.set_field(20..36, cb.offset);
2404         e.set_field(36..41, cb_idx);
2405         e.set_field(
2406             44..46,
2407             match self.mode {
2408                 LdcMode::Indexed => 0_u8,
2409                 LdcMode::IndexedLinear => 1_u8,
2410                 LdcMode::IndexedSegmented => 2_u8,
2411                 LdcMode::IndexedSegmentedLinear => 3_u8,
2412             },
2413         );
2414         e.set_mem_type(48..51, self.mem_type);
2415     }
2416 }
2417 
2418 impl SM50Op for OpSt {
legalize(&mut self, b: &mut LegalizeBuilder)2419     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2420         legalize_ext_instr(self, b);
2421     }
2422 
encode(&self, e: &mut SM50Encoder<'_>)2423     fn encode(&self, e: &mut SM50Encoder<'_>) {
2424         e.set_opcode(match self.access.space {
2425             MemSpace::Global(_) => 0xeed8,
2426             MemSpace::Local => 0xef50,
2427             MemSpace::Shared => 0xef58,
2428         });
2429 
2430         e.set_reg_src(0..8, self.data);
2431         e.set_reg_src(8..16, self.addr);
2432         e.set_field(20..44, self.offset);
2433         e.set_mem_access(&self.access);
2434     }
2435 }
2436 
atom_src_as_ssa( b: &mut LegalizeBuilder, src: Src, atom_type: AtomType, ) -> SSARef2437 fn atom_src_as_ssa(
2438     b: &mut LegalizeBuilder,
2439     src: Src,
2440     atom_type: AtomType,
2441 ) -> SSARef {
2442     if let Some(ssa) = src.as_ssa() {
2443         return *ssa;
2444     }
2445 
2446     let tmp;
2447     if atom_type.bits() == 32 {
2448         tmp = b.alloc_ssa(RegFile::GPR, 1);
2449         b.copy_to(tmp.into(), 0.into());
2450     } else {
2451         debug_assert!(atom_type.bits() == 64);
2452         tmp = b.alloc_ssa(RegFile::GPR, 2);
2453         b.copy_to(tmp[0].into(), 0.into());
2454         b.copy_to(tmp[1].into(), 0.into());
2455     }
2456     tmp
2457 }
2458 
2459 impl SM50Op for OpAtom {
legalize(&mut self, b: &mut LegalizeBuilder)2460     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2461         if self.atom_op == AtomOp::CmpExch(AtomCmpSrc::Separate) {
2462             let cmpr = atom_src_as_ssa(b, self.cmpr, self.atom_type);
2463             let data = atom_src_as_ssa(b, self.data, self.atom_type);
2464 
2465             let mut cmpr_data = Vec::new();
2466             cmpr_data.extend_from_slice(&cmpr);
2467             cmpr_data.extend_from_slice(&data);
2468             let cmpr_data = SSARef::try_from(cmpr_data).unwrap();
2469 
2470             self.cmpr = 0.into();
2471             self.data = cmpr_data.into();
2472             self.atom_op = AtomOp::CmpExch(AtomCmpSrc::Packed);
2473         }
2474         legalize_ext_instr(self, b);
2475     }
2476 
encode(&self, e: &mut SM50Encoder<'_>)2477     fn encode(&self, e: &mut SM50Encoder<'_>) {
2478         match self.mem_space {
2479             MemSpace::Global(addr_type) => {
2480                 if self.dst.is_none() {
2481                     e.set_opcode(0xebf8);
2482 
2483                     e.set_reg_src(0..8, self.data);
2484 
2485                     let data_type = match self.atom_type {
2486                         AtomType::U32 => 0_u8,
2487                         AtomType::I32 => 1_u8,
2488                         AtomType::U64 => 2_u8,
2489                         AtomType::F32 => 3_u8,
2490                         // NOTE: U128 => 4_u8,
2491                         AtomType::I64 => 5_u8,
2492                         _ => panic!("Unsupported data type"),
2493                     };
2494                     e.set_field(20..23, data_type);
2495                     e.set_atom_op(23..26, self.atom_op);
2496                 } else if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2497                     e.set_opcode(0xee00);
2498 
2499                     e.set_dst(self.dst);
2500 
2501                     // TODO: These are all supported by the disassembler but
2502                     // only the packed layout appears to be supported by real
2503                     // hardware
2504                     let (data_src, data_layout) = match cmp_src {
2505                         AtomCmpSrc::Separate => {
2506                             if self.data.is_zero() {
2507                                 (self.cmpr, 1_u8)
2508                             } else {
2509                                 assert!(self.cmpr.is_zero());
2510                                 (self.data, 2_u8)
2511                             }
2512                         }
2513                         AtomCmpSrc::Packed => (self.data, 0_u8),
2514                     };
2515                     e.set_reg_src(20..28, data_src);
2516 
2517                     let data_type = match self.atom_type {
2518                         AtomType::U32 => 0_u8,
2519                         AtomType::U64 => 1_u8,
2520                         _ => panic!("Unsupported data type"),
2521                     };
2522                     e.set_field(49..50, data_type);
2523                     e.set_field(50..52, data_layout);
2524                     e.set_field(52..56, 15_u8); // subOp
2525                 } else {
2526                     e.set_opcode(0xed00);
2527 
2528                     e.set_dst(self.dst);
2529                     e.set_reg_src(20..28, self.data);
2530 
2531                     let data_type = match self.atom_type {
2532                         AtomType::U32 => 0_u8,
2533                         AtomType::I32 => 1_u8,
2534                         AtomType::U64 => 2_u8,
2535                         AtomType::F32 => 3_u8,
2536                         // NOTE: U128 => 4_u8,
2537                         AtomType::I64 => 5_u8,
2538                         _ => panic!("Unsupported data type"),
2539                     };
2540                     e.set_field(49..52, data_type);
2541                     e.set_atom_op(52..56, self.atom_op);
2542                 }
2543 
2544                 e.set_mem_order(&self.mem_order);
2545 
2546                 e.set_reg_src(8..16, self.addr);
2547                 e.set_field(28..48, self.addr_offset);
2548                 e.set_field(
2549                     48..49,
2550                     match addr_type {
2551                         MemAddrType::A32 => 0_u8,
2552                         MemAddrType::A64 => 1_u8,
2553                     },
2554                 );
2555             }
2556             MemSpace::Local => panic!("Atomics do not support local"),
2557             MemSpace::Shared => {
2558                 if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2559                     e.set_opcode(0xee00);
2560 
2561                     assert!(cmp_src == AtomCmpSrc::Packed);
2562                     assert!(self.cmpr.is_zero());
2563                     e.set_reg_src(20..28, self.data);
2564 
2565                     let subop = match self.atom_type {
2566                         AtomType::U32 => 4_u8,
2567                         AtomType::U64 => 5_u8,
2568                         _ => panic!("Unsupported data type"),
2569                     };
2570                     e.set_field(52..56, subop);
2571                 } else {
2572                     e.set_opcode(0xec00);
2573 
2574                     e.set_reg_src(20..28, self.data);
2575 
2576                     let data_type = match self.atom_type {
2577                         AtomType::U32 => 0_u8,
2578                         AtomType::I32 => 1_u8,
2579                         AtomType::U64 => 2_u8,
2580                         AtomType::I64 => 3_u8,
2581                         _ => panic!("Unsupported data type"),
2582                     };
2583                     e.set_field(28..30, data_type);
2584                     e.set_atom_op(52..56, self.atom_op);
2585                 }
2586 
2587                 e.set_dst(self.dst);
2588                 e.set_reg_src(8..16, self.addr);
2589                 assert_eq!(self.addr_offset % 4, 0);
2590                 e.set_field(30..52, self.addr_offset / 4);
2591             }
2592         }
2593     }
2594 }
2595 
2596 impl SM50Op for OpAL2P {
legalize(&mut self, b: &mut LegalizeBuilder)2597     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2598         legalize_ext_instr(self, b);
2599     }
2600 
encode(&self, e: &mut SM50Encoder<'_>)2601     fn encode(&self, e: &mut SM50Encoder<'_>) {
2602         e.set_opcode(0xefa0);
2603 
2604         e.set_dst(self.dst);
2605         e.set_reg_src(8..16, self.offset);
2606 
2607         e.set_field(20..31, self.access.addr);
2608         assert!(!self.access.patch);
2609         e.set_bit(32, self.access.output);
2610 
2611         e.set_field(47..49, 0_u8); // comps
2612         e.set_pred_dst(44..47, Dst::None);
2613     }
2614 }
2615 
2616 impl SM50Op for OpALd {
legalize(&mut self, b: &mut LegalizeBuilder)2617     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2618         legalize_ext_instr(self, b);
2619     }
2620 
encode(&self, e: &mut SM50Encoder<'_>)2621     fn encode(&self, e: &mut SM50Encoder<'_>) {
2622         e.set_opcode(0xefd8);
2623 
2624         e.set_dst(self.dst);
2625         if self.access.phys {
2626             assert!(!self.access.patch);
2627             assert!(self.offset.src_ref.as_reg().is_some());
2628         } else if !self.access.patch {
2629             assert!(self.offset.is_zero());
2630         }
2631         e.set_reg_src(8..16, self.offset);
2632         e.set_reg_src(39..47, self.vtx);
2633 
2634         e.set_field(20..30, self.access.addr);
2635         e.set_bit(31, self.access.patch);
2636         e.set_bit(32, self.access.output);
2637         e.set_field(47..49, self.access.comps - 1);
2638     }
2639 }
2640 
2641 impl SM50Op for OpASt {
legalize(&mut self, b: &mut LegalizeBuilder)2642     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2643         legalize_ext_instr(self, b);
2644     }
2645 
encode(&self, e: &mut SM50Encoder<'_>)2646     fn encode(&self, e: &mut SM50Encoder<'_>) {
2647         e.set_opcode(0xeff0);
2648 
2649         e.set_reg_src(0..8, self.data);
2650         e.set_reg_src(8..16, self.offset);
2651         e.set_reg_src(39..47, self.vtx);
2652 
2653         assert!(!self.access.phys);
2654         assert!(self.access.output);
2655         e.set_field(20..30, self.access.addr);
2656         e.set_bit(31, self.access.patch);
2657         e.set_bit(32, self.access.output);
2658         e.set_field(47..49, self.access.comps - 1);
2659     }
2660 }
2661 
2662 impl SM50Op for OpIpa {
legalize(&mut self, b: &mut LegalizeBuilder)2663     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2664         legalize_ext_instr(self, b);
2665     }
2666 
encode(&self, e: &mut SM50Encoder<'_>)2667     fn encode(&self, e: &mut SM50Encoder<'_>) {
2668         e.set_opcode(0xe000);
2669 
2670         e.set_dst(self.dst);
2671         e.set_reg_src(8..16, 0.into()); // addr
2672         e.set_reg_src(20..28, self.inv_w);
2673         e.set_reg_src(39..47, self.offset);
2674 
2675         assert!(self.addr % 4 == 0);
2676         e.set_field(28..38, self.addr);
2677         e.set_bit(38, false); // .IDX
2678         e.set_pred_dst(47..50, Dst::None); // TODO: What is this for?
2679         e.set_bit(51, false); // .SAT
2680         e.set_field(
2681             52..54,
2682             match self.loc {
2683                 InterpLoc::Default => 0_u8,
2684                 InterpLoc::Centroid => 1_u8,
2685                 InterpLoc::Offset => 2_u8,
2686             },
2687         );
2688         e.set_field(
2689             54..56,
2690             match self.freq {
2691                 InterpFreq::Pass => 0_u8,
2692                 InterpFreq::PassMulW => 1_u8,
2693                 InterpFreq::Constant => 2_u8,
2694                 InterpFreq::State => 3_u8,
2695             },
2696         );
2697     }
2698 }
2699 
2700 impl SM50Op for OpCCtl {
legalize(&mut self, b: &mut LegalizeBuilder)2701     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2702         legalize_ext_instr(self, b);
2703     }
2704 
encode(&self, e: &mut SM50Encoder<'_>)2705     fn encode(&self, e: &mut SM50Encoder<'_>) {
2706         match self.mem_space {
2707             MemSpace::Global(addr_type) => {
2708                 e.set_opcode(0xef60);
2709 
2710                 assert!(self.addr_offset % 4 == 0);
2711                 e.set_field(22..52, self.addr_offset / 4);
2712                 e.set_field(
2713                     52..53,
2714                     match addr_type {
2715                         MemAddrType::A32 => 0_u8,
2716                         MemAddrType::A64 => 1_u8,
2717                     },
2718                 );
2719             }
2720             MemSpace::Local => panic!("cctl does not support local"),
2721             MemSpace::Shared => {
2722                 e.set_opcode(0xef80);
2723 
2724                 assert!(self.addr_offset % 4 == 0);
2725                 e.set_field(22..44, self.addr_offset / 4);
2726             }
2727         }
2728 
2729         e.set_field(
2730             0..4,
2731             match self.op {
2732                 CCtlOp::Qry1 => 0_u8,
2733                 CCtlOp::PF1 => 1_u8,
2734                 CCtlOp::PF1_5 => 2_u8,
2735                 CCtlOp::PF2 => 3_u8,
2736                 CCtlOp::WB => 4_u8,
2737                 CCtlOp::IV => 5_u8,
2738                 CCtlOp::IVAll => 6_u8,
2739                 CCtlOp::RS => 7_u8,
2740                 CCtlOp::RSLB => 7_u8,
2741                 op => panic!("Unsupported cache control {op:?}"),
2742             },
2743         );
2744         e.set_reg_src(8..16, self.addr);
2745     }
2746 }
2747 
2748 impl SM50Op for OpMemBar {
legalize(&mut self, _b: &mut LegalizeBuilder)2749     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2750         // Nothing to do
2751     }
2752 
encode(&self, e: &mut SM50Encoder<'_>)2753     fn encode(&self, e: &mut SM50Encoder<'_>) {
2754         e.set_opcode(0xef98);
2755 
2756         e.set_field(
2757             8..10,
2758             match self.scope {
2759                 MemScope::CTA => 0_u8,
2760                 MemScope::GPU => 1_u8,
2761                 MemScope::System => 2_u8,
2762             },
2763         );
2764     }
2765 }
2766 
2767 impl SM50Encoder<'_> {
set_rel_offset(&mut self, range: Range<usize>, label: &Label)2768     fn set_rel_offset(&mut self, range: Range<usize>, label: &Label) {
2769         let ip = u32::try_from(self.ip).unwrap();
2770         let ip = i32::try_from(ip).unwrap();
2771 
2772         let target_ip = *self.labels.get(label).unwrap();
2773         let target_ip = u32::try_from(target_ip).unwrap();
2774         let target_ip = i32::try_from(target_ip).unwrap();
2775 
2776         let rel_offset = target_ip - ip - 8;
2777 
2778         self.set_field(range, rel_offset);
2779     }
2780 }
2781 
2782 impl SM50Op for OpBra {
legalize(&mut self, _b: &mut LegalizeBuilder)2783     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2784         // Nothing to do
2785     }
2786 
encode(&self, e: &mut SM50Encoder<'_>)2787     fn encode(&self, e: &mut SM50Encoder<'_>) {
2788         e.set_opcode(0xe240);
2789         e.set_rel_offset(20..44, &self.target);
2790         e.set_field(0..5, 0xF_u8); // TODO: Pred?
2791     }
2792 }
2793 
2794 impl SM50Op for OpSSy {
legalize(&mut self, _b: &mut LegalizeBuilder)2795     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2796         // Nothing to do
2797     }
2798 
encode(&self, e: &mut SM50Encoder<'_>)2799     fn encode(&self, e: &mut SM50Encoder<'_>) {
2800         e.set_opcode(0xe290);
2801         e.set_rel_offset(20..44, &self.target);
2802         e.set_field(0..5, 0xF_u8); // TODO: Pred?
2803     }
2804 }
2805 
2806 impl SM50Op for OpSync {
legalize(&mut self, _b: &mut LegalizeBuilder)2807     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2808         // Nothing to do
2809     }
2810 
encode(&self, e: &mut SM50Encoder<'_>)2811     fn encode(&self, e: &mut SM50Encoder<'_>) {
2812         e.set_opcode(0xf0f8);
2813         e.set_field(0..5, 0xF_u8); // TODO: Pred?
2814     }
2815 }
2816 
2817 impl SM50Op for OpBrk {
legalize(&mut self, _b: &mut LegalizeBuilder)2818     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2819         // Nothing to do
2820     }
2821 
encode(&self, e: &mut SM50Encoder<'_>)2822     fn encode(&self, e: &mut SM50Encoder<'_>) {
2823         e.set_opcode(0xe340);
2824         e.set_field(0..5, 0xF_u8); // TODO: Pred?
2825     }
2826 }
2827 
2828 impl SM50Op for OpPBk {
legalize(&mut self, _b: &mut LegalizeBuilder)2829     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2830         // Nothing to do
2831     }
2832 
encode(&self, e: &mut SM50Encoder<'_>)2833     fn encode(&self, e: &mut SM50Encoder<'_>) {
2834         e.set_opcode(0xe2a0);
2835         e.set_rel_offset(20..44, &self.target);
2836         e.set_field(0..5, 0xF_u8); // TODO: Pred?
2837     }
2838 }
2839 
2840 impl SM50Op for OpCont {
legalize(&mut self, _b: &mut LegalizeBuilder)2841     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2842         // Nothing to do
2843     }
2844 
encode(&self, e: &mut SM50Encoder<'_>)2845     fn encode(&self, e: &mut SM50Encoder<'_>) {
2846         e.set_opcode(0xe350);
2847         e.set_field(0..5, 0xF_u8); // TODO: Pred?
2848     }
2849 }
2850 
2851 impl SM50Op for OpPCnt {
legalize(&mut self, _b: &mut LegalizeBuilder)2852     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2853         // Nothing to do
2854     }
2855 
encode(&self, e: &mut SM50Encoder<'_>)2856     fn encode(&self, e: &mut SM50Encoder<'_>) {
2857         e.set_opcode(0xe2b0);
2858         e.set_rel_offset(20..44, &self.target);
2859         e.set_field(0..5, 0xF_u8); // TODO: Pred?
2860     }
2861 }
2862 
2863 impl SM50Op for OpExit {
legalize(&mut self, _b: &mut LegalizeBuilder)2864     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2865         // Nothing to do
2866     }
2867 
encode(&self, e: &mut SM50Encoder<'_>)2868     fn encode(&self, e: &mut SM50Encoder<'_>) {
2869         e.set_opcode(0xe300);
2870 
2871         // TODO: CC flags
2872         e.set_field(0..4, 0xf_u8); // CC.T
2873     }
2874 }
2875 
2876 impl SM50Op for OpBar {
legalize(&mut self, _b: &mut LegalizeBuilder)2877     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2878         // Nothing to do
2879     }
2880 
encode(&self, e: &mut SM50Encoder<'_>)2881     fn encode(&self, e: &mut SM50Encoder<'_>) {
2882         e.set_opcode(0xf0a8);
2883 
2884         e.set_reg_src(8..16, SrcRef::Zero.into());
2885 
2886         // 00: RED.POPC
2887         // 01: RED.AND
2888         // 02: RED.OR
2889         e.set_field(35..37, 0_u8);
2890 
2891         // 00: SYNC
2892         // 01: ARV
2893         // 02: RED
2894         // 03: SCAN
2895         e.set_field(32..35, 0_u8);
2896 
2897         e.set_pred_src(39..42, 42, SrcRef::True.into());
2898     }
2899 }
2900 
2901 impl SM50Op for OpCS2R {
legalize(&mut self, _b: &mut LegalizeBuilder)2902     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2903         // Nothing to do
2904     }
2905 
encode(&self, e: &mut SM50Encoder<'_>)2906     fn encode(&self, e: &mut SM50Encoder<'_>) {
2907         e.set_opcode(0x50c8);
2908         e.set_dst(self.dst);
2909         e.set_field(20..28, self.idx);
2910     }
2911 }
2912 
2913 impl SM50Op for OpIsberd {
legalize(&mut self, _b: &mut LegalizeBuilder)2914     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2915         // Nothing to do
2916     }
2917 
encode(&self, e: &mut SM50Encoder<'_>)2918     fn encode(&self, e: &mut SM50Encoder<'_>) {
2919         e.set_opcode(0xefd0);
2920         e.set_dst(self.dst);
2921         e.set_reg_src(8..16, self.idx);
2922     }
2923 }
2924 
2925 impl SM50Op for OpKill {
legalize(&mut self, _b: &mut LegalizeBuilder)2926     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2927         // Nothing to do
2928     }
2929 
encode(&self, e: &mut SM50Encoder<'_>)2930     fn encode(&self, e: &mut SM50Encoder<'_>) {
2931         e.set_opcode(0xe330);
2932         e.set_field(0..5, 0x0f_u8);
2933     }
2934 }
2935 
2936 impl SM50Op for OpNop {
legalize(&mut self, _b: &mut LegalizeBuilder)2937     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2938         // Nothing to do
2939     }
2940 
encode(&self, e: &mut SM50Encoder<'_>)2941     fn encode(&self, e: &mut SM50Encoder<'_>) {
2942         e.set_opcode(0x50b0);
2943 
2944         // TODO: CC flags
2945         e.set_field(8..12, 0xf_u8); // CC.T
2946     }
2947 }
2948 
2949 impl SM50Op for OpPixLd {
legalize(&mut self, _b: &mut LegalizeBuilder)2950     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2951         // Nothing to do
2952     }
2953 
encode(&self, e: &mut SM50Encoder<'_>)2954     fn encode(&self, e: &mut SM50Encoder<'_>) {
2955         e.set_opcode(0xefe8);
2956         e.set_dst(self.dst);
2957         e.set_reg_src(8..16, 0.into());
2958         e.set_field(
2959             31..34,
2960             match &self.val {
2961                 PixVal::CovMask => 1_u8,
2962                 PixVal::Covered => 2_u8,
2963                 PixVal::Offset => 3_u8,
2964                 PixVal::CentroidOffset => 4_u8,
2965                 PixVal::MyIndex => 5_u8,
2966                 other => panic!("Unsupported PixVal: {other}"),
2967             },
2968         );
2969         e.set_pred_dst(45..48, Dst::None);
2970     }
2971 }
2972 
2973 impl SM50Op for OpS2R {
legalize(&mut self, _b: &mut LegalizeBuilder)2974     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2975         // Nothing to do
2976     }
2977 
encode(&self, e: &mut SM50Encoder<'_>)2978     fn encode(&self, e: &mut SM50Encoder<'_>) {
2979         e.set_opcode(0xf0c8);
2980         e.set_dst(self.dst);
2981         e.set_field(20..28, self.idx);
2982     }
2983 }
2984 
2985 impl SM50Op for OpVote {
legalize(&mut self, _b: &mut LegalizeBuilder)2986     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2987         // Nothing to do
2988     }
2989 
encode(&self, e: &mut SM50Encoder<'_>)2990     fn encode(&self, e: &mut SM50Encoder<'_>) {
2991         e.set_opcode(0x50d8);
2992 
2993         e.set_dst(self.ballot);
2994         e.set_pred_dst(45..48, self.vote);
2995         e.set_pred_src(39..42, 42, self.pred);
2996 
2997         e.set_field(
2998             48..50,
2999             match self.op {
3000                 VoteOp::All => 0u8,
3001                 VoteOp::Any => 1u8,
3002                 VoteOp::Eq => 2u8,
3003             },
3004         );
3005     }
3006 }
3007 
3008 impl SM50Op for OpOut {
legalize(&mut self, b: &mut LegalizeBuilder)3009     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3010         use RegFile::GPR;
3011         b.copy_alu_src_if_not_reg(&mut self.handle, GPR, SrcType::GPR);
3012         b.copy_alu_src_if_i20_overflow(&mut self.stream, GPR, SrcType::ALU);
3013     }
3014 
encode(&self, e: &mut SM50Encoder<'_>)3015     fn encode(&self, e: &mut SM50Encoder<'_>) {
3016         match &self.stream.src_ref {
3017             SrcRef::Zero | SrcRef::Reg(_) => {
3018                 e.set_opcode(0xfbe0);
3019                 e.set_reg_src(20..28, self.stream);
3020             }
3021             SrcRef::Imm32(imm32) => {
3022                 e.set_opcode(0xf6e0);
3023                 e.set_src_imm_i20(20..39, 56, *imm32);
3024             }
3025             SrcRef::CBuf(cbuf) => {
3026                 e.set_opcode(0xebe0);
3027                 e.set_src_cb(20..39, cbuf);
3028             }
3029             src => panic!("Invalid out stream: {src}"),
3030         }
3031 
3032         e.set_field(
3033             39..41,
3034             match self.out_type {
3035                 OutType::Emit => 1_u8,
3036                 OutType::Cut => 2_u8,
3037                 OutType::EmitThenCut => 3_u8,
3038             },
3039         );
3040 
3041         e.set_reg_src(8..16, self.handle);
3042         e.set_dst(self.dst);
3043     }
3044 }
3045 
/// Expands to a `match` that unwraps an `Op` into the op struct it holds,
/// for every variant handled in this file.
///
/// Panics with "Unhandled instruction" for any other variant.  The arms
/// are listed alphabetically; since every arm matches a distinct variant,
/// their order has no effect on the result.
macro_rules! as_sm50_op_match {
    ($instr: expr) => {
        match $instr {
            Op::AL2P(inner) => inner,
            Op::ALd(inner) => inner,
            Op::ASt(inner) => inner,
            Op::Atom(inner) => inner,
            Op::Bar(inner) => inner,
            Op::Bfe(inner) => inner,
            Op::Bra(inner) => inner,
            Op::Brk(inner) => inner,
            Op::CCtl(inner) => inner,
            Op::CS2R(inner) => inner,
            Op::Cont(inner) => inner,
            Op::DAdd(inner) => inner,
            Op::DFma(inner) => inner,
            Op::DMnMx(inner) => inner,
            Op::DMul(inner) => inner,
            Op::DSetP(inner) => inner,
            Op::Exit(inner) => inner,
            Op::F2F(inner) => inner,
            Op::F2I(inner) => inner,
            Op::FAdd(inner) => inner,
            Op::FFma(inner) => inner,
            Op::FMnMx(inner) => inner,
            Op::FMul(inner) => inner,
            Op::FSet(inner) => inner,
            Op::FSetP(inner) => inner,
            Op::FSwzAdd(inner) => inner,
            Op::Flo(inner) => inner,
            Op::I2F(inner) => inner,
            Op::I2I(inner) => inner,
            Op::IAdd2(inner) => inner,
            Op::IAdd2X(inner) => inner,
            Op::IMad(inner) => inner,
            Op::IMnMx(inner) => inner,
            Op::IMul(inner) => inner,
            Op::ISetP(inner) => inner,
            Op::Ipa(inner) => inner,
            Op::Isberd(inner) => inner,
            Op::Kill(inner) => inner,
            Op::Ld(inner) => inner,
            Op::Ldc(inner) => inner,
            Op::Lop2(inner) => inner,
            Op::MemBar(inner) => inner,
            Op::Mov(inner) => inner,
            Op::MuFu(inner) => inner,
            Op::Nop(inner) => inner,
            Op::Out(inner) => inner,
            Op::PBk(inner) => inner,
            Op::PCnt(inner) => inner,
            Op::PSetP(inner) => inner,
            Op::PixLd(inner) => inner,
            Op::PopC(inner) => inner,
            Op::Prmt(inner) => inner,
            Op::Rro(inner) => inner,
            Op::S2R(inner) => inner,
            Op::SSy(inner) => inner,
            Op::Sel(inner) => inner,
            Op::Shf(inner) => inner,
            Op::Shfl(inner) => inner,
            Op::Shl(inner) => inner,
            Op::Shr(inner) => inner,
            Op::St(inner) => inner,
            Op::SuAtom(inner) => inner,
            Op::SuLd(inner) => inner,
            Op::SuSt(inner) => inner,
            Op::Sync(inner) => inner,
            Op::Tex(inner) => inner,
            Op::Tld(inner) => inner,
            Op::Tld4(inner) => inner,
            Op::Tmml(inner) => inner,
            Op::Txd(inner) => inner,
            Op::Txq(inner) => inner,
            Op::Vote(inner) => inner,
            _ => panic!("Unhandled instruction {}", $instr),
        }
    };
}
3125 
as_sm50_op(op: &Op) -> &dyn SM50Op3126 fn as_sm50_op(op: &Op) -> &dyn SM50Op {
3127     as_sm50_op_match!(op)
3128 }
3129 
as_sm50_op_mut(op: &mut Op) -> &mut dyn SM50Op3130 fn as_sm50_op_mut(op: &mut Op) -> &mut dyn SM50Op {
3131     as_sm50_op_match!(op)
3132 }
3133 
encode_instr( instr_index: usize, instr: Option<&Box<Instr>>, sm: &ShaderModel50, labels: &HashMap<Label, usize>, ip: &mut usize, sched_instr: &mut [u32; 2], ) -> [u32; 2]3134 fn encode_instr(
3135     instr_index: usize,
3136     instr: Option<&Box<Instr>>,
3137     sm: &ShaderModel50,
3138     labels: &HashMap<Label, usize>,
3139     ip: &mut usize,
3140     sched_instr: &mut [u32; 2],
3141 ) -> [u32; 2] {
3142     let mut e = SM50Encoder {
3143         sm,
3144         ip: *ip,
3145         labels,
3146         inst: [0_u32; 2],
3147         sched: 0,
3148     };
3149 
3150     if let Some(instr) = instr {
3151         as_sm50_op(&instr.op).encode(&mut e);
3152         e.set_pred(&instr.pred);
3153         e.set_instr_deps(&instr.deps);
3154     } else {
3155         let nop = OpNop { label: None };
3156         nop.encode(&mut e);
3157         e.set_pred(&true.into());
3158         e.set_instr_deps(&InstrDeps::new());
3159     }
3160 
3161     *ip += 8;
3162 
3163     BitMutView::new(sched_instr)
3164         .set_field(21 * instr_index..21 * (instr_index + 1), e.sched);
3165 
3166     e.inst
3167 }
3168 
encode_sm50_shader(sm: &ShaderModel50, s: &Shader<'_>) -> Vec<u32>3169 fn encode_sm50_shader(sm: &ShaderModel50, s: &Shader<'_>) -> Vec<u32> {
3170     assert!(s.functions.len() == 1);
3171     let func = &s.functions[0];
3172 
3173     let mut num_instrs = 0_usize;
3174     let mut labels = HashMap::new();
3175     for b in &func.blocks {
3176         // We ensure blocks will have groups of 3 instructions with a
3177         // schedule instruction before each groups.  As we should never jump
3178         // to a schedule instruction, we account for that here.
3179         labels.insert(b.label, num_instrs + 8);
3180 
3181         let block_num_instrs = b.instrs.len().next_multiple_of(3);
3182 
3183         // Every 3 instructions, we have a new schedule instruction so we
3184         // need to account for that.
3185         num_instrs += (block_num_instrs + (block_num_instrs / 3)) * 8;
3186     }
3187 
3188     let mut encoded = Vec::new();
3189     for b in &func.blocks {
3190         // A block is composed of groups of 3 instructions.
3191         let block_num_instrs = b.instrs.len().next_multiple_of(3);
3192 
3193         let mut instrs_iter = b.instrs.iter();
3194 
3195         for _ in 0..(block_num_instrs / 3) {
3196             let mut ip = ((encoded.len() / 2) + 1) * 8;
3197 
3198             let mut sched_instr = [0x0; 2];
3199 
3200             let instr0 = encode_instr(
3201                 0,
3202                 instrs_iter.next(),
3203                 sm,
3204                 &labels,
3205                 &mut ip,
3206                 &mut sched_instr,
3207             );
3208             let instr1 = encode_instr(
3209                 1,
3210                 instrs_iter.next(),
3211                 sm,
3212                 &labels,
3213                 &mut ip,
3214                 &mut sched_instr,
3215             );
3216             let instr2 = encode_instr(
3217                 2,
3218                 instrs_iter.next(),
3219                 sm,
3220                 &labels,
3221                 &mut ip,
3222                 &mut sched_instr,
3223             );
3224 
3225             encoded.extend_from_slice(&sched_instr[..]);
3226             encoded.extend_from_slice(&instr0[..]);
3227             encoded.extend_from_slice(&instr1[..]);
3228             encoded.extend_from_slice(&instr2[..]);
3229         }
3230     }
3231 
3232     encoded
3233 }
3234