xref: /aosp_15_r20/external/mesa3d/src/nouveau/compiler/nak/from_nir.rs (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 // Copyright © 2022 Collabora, Ltd.
2 // SPDX-License-Identifier: MIT
3 
4 #![allow(non_upper_case_globals)]
5 
6 use crate::api::GetDebugFlags;
7 use crate::api::DEBUG;
8 use crate::builder::*;
9 use crate::ir::*;
10 use crate::nir_instr_printer::NirInstrPrinter;
11 use crate::sph::{OutputTopology, PixelImap};
12 
13 use nak_bindings::*;
14 
15 use compiler::bindings::*;
16 use compiler::cfg::CFGBuilder;
17 use compiler::nir::*;
18 use std::cmp::max;
19 use std::collections::{HashMap, HashSet};
20 use std::ops::Index;
21 
init_info_from_nir(nir: &nir_shader) -> ShaderInfo22 fn init_info_from_nir(nir: &nir_shader) -> ShaderInfo {
23     ShaderInfo {
24         num_gprs: 0,
25         num_instrs: 0,
26         num_control_barriers: 0,
27         slm_size: nir.scratch_size,
28         max_crs_depth: 0,
29         uses_global_mem: false,
30         writes_global_mem: false,
31         // TODO: handle this.
32         uses_fp64: false,
33         stage: match nir.info.stage() {
34             MESA_SHADER_COMPUTE => {
35                 ShaderStageInfo::Compute(ComputeShaderInfo {
36                     local_size: [
37                         nir.info.workgroup_size[0],
38                         nir.info.workgroup_size[1],
39                         nir.info.workgroup_size[2],
40                     ],
41                     smem_size: nir.info.shared_size.try_into().unwrap(),
42                 })
43             }
44             MESA_SHADER_VERTEX => ShaderStageInfo::Vertex,
45             MESA_SHADER_FRAGMENT => {
46                 let info_fs = unsafe { &nir.info.__bindgen_anon_1.fs };
47                 ShaderStageInfo::Fragment(FragmentShaderInfo {
48                     uses_kill: false,
49                     does_interlock: false,
50                     post_depth_coverage: info_fs.post_depth_coverage(),
51                     early_fragment_tests: info_fs.early_fragment_tests(),
52                     uses_sample_shading: info_fs.uses_sample_shading(),
53                 })
54             }
55             MESA_SHADER_GEOMETRY => {
56                 let info_gs = unsafe { &nir.info.__bindgen_anon_1.gs };
57                 let output_topology = match info_gs.output_primitive {
58                     MESA_PRIM_POINTS => OutputTopology::PointList,
59                     MESA_PRIM_LINE_STRIP => OutputTopology::LineStrip,
60                     MESA_PRIM_TRIANGLE_STRIP => OutputTopology::TriangleStrip,
61                     _ => panic!(
62                         "Invalid GS input primitive {}",
63                         info_gs.input_primitive
64                     ),
65                 };
66 
67                 ShaderStageInfo::Geometry(GeometryShaderInfo {
68                     // TODO: Should be set if VK_NV_geometry_shader_passthrough is in use.
69                     passthrough_enable: false,
70                     stream_out_mask: info_gs.active_stream_mask(),
71                     threads_per_input_primitive: info_gs.invocations,
72                     output_topology: output_topology,
73                     max_output_vertex_count: info_gs.vertices_out,
74                 })
75             }
76             MESA_SHADER_TESS_CTRL => {
77                 let info_tess = unsafe { &nir.info.__bindgen_anon_1.tess };
78                 ShaderStageInfo::TessellationInit(TessellationInitShaderInfo {
79                     per_patch_attribute_count: 6,
80                     threads_per_patch: info_tess.tcs_vertices_out,
81                 })
82             }
83             MESA_SHADER_TESS_EVAL => {
84                 let info_tess = unsafe { &nir.info.__bindgen_anon_1.tess };
85                 ShaderStageInfo::Tessellation(TessellationShaderInfo {
86                     domain: match info_tess._primitive_mode {
87                         TESS_PRIMITIVE_TRIANGLES => {
88                             TessellationDomain::Triangle
89                         }
90                         TESS_PRIMITIVE_QUADS => TessellationDomain::Quad,
91                         TESS_PRIMITIVE_ISOLINES => TessellationDomain::Isoline,
92                         _ => panic!("Invalid tess_primitive_mode"),
93                     },
94                     spacing: match info_tess.spacing() {
95                         TESS_SPACING_EQUAL => TessellationSpacing::Integer,
96                         TESS_SPACING_FRACTIONAL_ODD => {
97                             TessellationSpacing::FractionalOdd
98                         }
99                         TESS_SPACING_FRACTIONAL_EVEN => {
100                             TessellationSpacing::FractionalEven
101                         }
102                         _ => panic!("Invalid gl_tess_spacing"),
103                     },
104                     primitives: if info_tess.point_mode() {
105                         TessellationPrimitives::Points
106                     } else if info_tess._primitive_mode
107                         == TESS_PRIMITIVE_ISOLINES
108                     {
109                         TessellationPrimitives::Lines
110                     } else if info_tess.ccw() {
111                         TessellationPrimitives::TrianglesCCW
112                     } else {
113                         TessellationPrimitives::TrianglesCW
114                     },
115                 })
116             }
117             _ => panic!("Unknown shader stage"),
118         },
119         io: match nir.info.stage() {
120             MESA_SHADER_COMPUTE => ShaderIoInfo::None,
121             MESA_SHADER_FRAGMENT => ShaderIoInfo::Fragment(FragmentIoInfo {
122                 sysvals_in: SysValInfo {
123                     // Required on fragment shaders, otherwise it cause a trap.
124                     ab: 1 << 31,
125                     c: 0,
126                 },
127                 sysvals_in_d: [PixelImap::Unused; 8],
128                 attr_in: [PixelImap::Unused; 128],
129                 barycentric_attr_in: [0; 4],
130                 reads_sample_mask: false,
131                 writes_color: 0,
132                 writes_sample_mask: false,
133                 writes_depth: false,
134             }),
135             MESA_SHADER_VERTEX
136             | MESA_SHADER_GEOMETRY
137             | MESA_SHADER_TESS_CTRL
138             | MESA_SHADER_TESS_EVAL => {
139                 let num_clip = nir.info.clip_distance_array_size();
140                 let num_cull = nir.info.cull_distance_array_size();
141                 let clip_enable = (1_u32 << num_clip) - 1;
142                 let cull_enable = ((1_u32 << num_cull) - 1) << num_clip;
143 
144                 ShaderIoInfo::Vtg(VtgIoInfo {
145                     sysvals_in: SysValInfo::default(),
146                     sysvals_in_d: 0,
147                     sysvals_out: SysValInfo::default(),
148                     sysvals_out_d: 0,
149                     attr_in: [0; 4],
150                     attr_out: [0; 4],
151 
152                     // TODO: figure out how to fill this.
153                     store_req_start: u8::MAX,
154                     store_req_end: 0,
155 
156                     clip_enable: clip_enable.try_into().unwrap(),
157                     cull_enable: cull_enable.try_into().unwrap(),
158                     xfb: if nir.xfb_info.is_null() {
159                         None
160                     } else {
161                         Some(Box::new(unsafe {
162                             nak_xfb_from_nir(nir.xfb_info)
163                         }))
164                     },
165                 })
166             }
167             _ => panic!("Unknown shader stage"),
168         },
169     }
170 }
171 
alloc_ssa_for_nir(b: &mut impl SSABuilder, ssa: &nir_def) -> Vec<SSAValue>172 fn alloc_ssa_for_nir(b: &mut impl SSABuilder, ssa: &nir_def) -> Vec<SSAValue> {
173     let (file, comps) = if ssa.bit_size == 1 {
174         (RegFile::Pred, ssa.num_components)
175     } else {
176         let bits = ssa.bit_size * ssa.num_components;
177         (RegFile::GPR, bits.div_ceil(32))
178     };
179 
180     let mut vec = Vec::new();
181     for _ in 0..comps {
182         vec.push(b.alloc_ssa(file, 1)[0]);
183     }
184     vec
185 }
186 
/// Maps a (NIR phi def index, vector component) pair to a stable phi id,
/// drawing fresh ids from a shared allocator on first use.
struct PhiAllocMap<'a> {
    // Allocator shared with the rest of the shader translation.
    alloc: &'a mut PhiAllocator,
    // Key is (nir_phi_instr def index, component).
    map: HashMap<(u32, u8), u32>,
}
191 
192 impl<'a> PhiAllocMap<'a> {
new(alloc: &'a mut PhiAllocator) -> PhiAllocMap<'a>193     fn new(alloc: &'a mut PhiAllocator) -> PhiAllocMap<'a> {
194         PhiAllocMap {
195             alloc: alloc,
196             map: HashMap::new(),
197         }
198     }
199 
get_phi_id(&mut self, phi: &nir_phi_instr, comp: u8) -> u32200     fn get_phi_id(&mut self, phi: &nir_phi_instr, comp: u8) -> u32 {
201         *self
202             .map
203             .entry((phi.def.index, comp))
204             .or_insert_with(|| self.alloc.alloc())
205     }
206 }
207 
/// Float execution-mode controls for a single floating-point width.
struct PerSizeFloatControls {
    // Flush denorms to zero for this width.
    pub ftz: bool,
    // Default rounding mode for this width.
    pub rnd_mode: FRndMode,
}
212 
/// Per-width float controls (fp16/fp32/fp64) for the whole shader, derived
/// from NIR's `float_controls_execution_mode`.
struct ShaderFloatControls {
    pub fp16: PerSizeFloatControls,
    pub fp32: PerSizeFloatControls,
    pub fp64: PerSizeFloatControls,
}
218 
219 impl Default for ShaderFloatControls {
default() -> Self220     fn default() -> Self {
221         Self {
222             fp16: PerSizeFloatControls {
223                 ftz: false,
224                 rnd_mode: FRndMode::NearestEven,
225             },
226             fp32: PerSizeFloatControls {
227                 ftz: true, // Default FTZ on fp32
228                 rnd_mode: FRndMode::NearestEven,
229             },
230             fp64: PerSizeFloatControls {
231                 ftz: false,
232                 rnd_mode: FRndMode::NearestEven,
233             },
234         }
235     }
236 }
237 
238 impl ShaderFloatControls {
from_nir(nir: &nir_shader) -> ShaderFloatControls239     fn from_nir(nir: &nir_shader) -> ShaderFloatControls {
240         let nir_fc = nir.info.float_controls_execution_mode;
241         let mut fc: ShaderFloatControls = Default::default();
242 
243         if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) != 0 {
244             fc.fp16.ftz = false;
245         } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) != 0 {
246             fc.fp16.ftz = true;
247         }
248         if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16) != 0 {
249             fc.fp16.rnd_mode = FRndMode::NearestEven;
250         } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) != 0 {
251             fc.fp16.rnd_mode = FRndMode::Zero;
252         }
253 
254         if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) != 0 {
255             fc.fp32.ftz = false;
256         } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) != 0 {
257             fc.fp32.ftz = true;
258         }
259         if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) != 0 {
260             fc.fp32.rnd_mode = FRndMode::NearestEven;
261         } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) != 0 {
262             fc.fp32.rnd_mode = FRndMode::Zero;
263         }
264 
265         if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) != 0 {
266             fc.fp64.ftz = false;
267         } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64) != 0 {
268             fc.fp64.ftz = true;
269         }
270         if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) != 0 {
271             fc.fp64.rnd_mode = FRndMode::NearestEven;
272         } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) != 0 {
273             fc.fp64.rnd_mode = FRndMode::Zero;
274         }
275 
276         fc
277     }
278 }
279 
280 impl Index<FloatType> for ShaderFloatControls {
281     type Output = PerSizeFloatControls;
282 
index(&self, idx: FloatType) -> &PerSizeFloatControls283     fn index(&self, idx: FloatType) -> &PerSizeFloatControls {
284         match idx {
285             FloatType::F16 => &self.fp16,
286             FloatType::F32 => &self.fp32,
287             FloatType::F64 => &self.fp64,
288         }
289     }
290 }
291 
/// Kind of entry pushed on the control-flow re-convergence (CRS) stack.
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
enum SyncType {
    // Re-convergence point after divergent control flow; must be
    // top-of-stack when consumed (see peek_crs).
    Sync,
    // Loop break target.
    Brk,
    // Loop continue target.
    Cont,
}
298 
/// State for translating one NIR shader into NAK IR.
struct ShaderFromNir<'a> {
    nir: &'a nir_shader,
    sm: &'a dyn ShaderModel,
    info: ShaderInfo,
    float_ctl: ShaderFloatControls,
    cfg: CFGBuilder<u32, BasicBlock>,
    label_alloc: LabelAllocator,
    // Label for each NIR block, keyed by block index (lazily allocated).
    block_label: HashMap<u32, Label>,
    bar_label: HashMap<u32, Label>,
    // Indices of NIR blocks that have ever been a CRS-stack target.
    sync_blocks: HashSet<u32>,
    // CRS stack: (target block index, entry kind), innermost last.
    crs: Vec<(u32, SyncType)>,
    fs_out_regs: [SSAValue; 34],
    end_block_id: u32,
    // NAK SSA values for each NIR def, keyed by def index.
    ssa_map: HashMap<u32, Vec<SSAValue>>,
    // Defs whose every use is an fsat, marked to be saturated at their
    // definition instead (see try_saturate_alu_dst).
    saturated: HashSet<*const nir_def>,
    nir_instr_printer: NirInstrPrinter,
}
316 
317 impl<'a> ShaderFromNir<'a> {
new(nir: &'a nir_shader, sm: &'a dyn ShaderModel) -> Self318     fn new(nir: &'a nir_shader, sm: &'a dyn ShaderModel) -> Self {
319         Self {
320             nir: nir,
321             sm: sm,
322             info: init_info_from_nir(nir),
323             float_ctl: ShaderFloatControls::from_nir(nir),
324             cfg: CFGBuilder::new(),
325             label_alloc: LabelAllocator::new(),
326             block_label: HashMap::new(),
327             bar_label: HashMap::new(),
328             sync_blocks: HashSet::new(),
329             crs: Vec::new(),
330             fs_out_regs: [SSAValue::NONE; 34],
331             end_block_id: 0,
332             ssa_map: HashMap::new(),
333             saturated: HashSet::new(),
334             nir_instr_printer: NirInstrPrinter::new(),
335         }
336     }
337 
get_block_label(&mut self, block: &nir_block) -> Label338     fn get_block_label(&mut self, block: &nir_block) -> Label {
339         *self
340             .block_label
341             .entry(block.index)
342             .or_insert_with(|| self.label_alloc.alloc())
343     }
344 
push_crs(&mut self, target: &nir_block, sync_type: SyncType)345     fn push_crs(&mut self, target: &nir_block, sync_type: SyncType) {
346         self.sync_blocks.insert(target.index);
347         self.crs.push((target.index, sync_type));
348         let crs_depth = u32::try_from(self.crs.len()).unwrap();
349         self.info.max_crs_depth = max(self.info.max_crs_depth, crs_depth);
350     }
351 
pop_crs(&mut self, target: &nir_block, sync_type: SyncType)352     fn pop_crs(&mut self, target: &nir_block, sync_type: SyncType) {
353         if let Some((top_index, top_sync_type)) = self.crs.pop() {
354             assert!(top_index == target.index);
355             assert!(top_sync_type == sync_type);
356         } else {
357             panic!("Tried to pop an empty stack");
358         }
359     }
360 
peek_crs(&self, target: &nir_block) -> Option<SyncType>361     fn peek_crs(&self, target: &nir_block) -> Option<SyncType> {
362         for (i, (index, sync_type)) in self.crs.iter().enumerate().rev() {
363             if *index != target.index {
364                 continue;
365             }
366 
367             match sync_type {
368                 SyncType::Sync => {
369                     // Sync must always be top-of-stack
370                     assert!(i == self.crs.len() - 1);
371                 }
372                 SyncType::Brk => {
373                     // Brk cannot skip over another Brk
374                     for (_, inner_sync) in &self.crs[(i + 1)..] {
375                         assert!(*inner_sync != SyncType::Brk);
376                     }
377                 }
378                 SyncType::Cont => {
379                     // Cont can only skip over Sync
380                     for (_, inner_sync) in &self.crs[(i + 1)..] {
381                         assert!(*inner_sync == SyncType::Sync);
382                     }
383                 }
384             }
385 
386             return Some(*sync_type);
387         }
388 
389         assert!(!self.sync_blocks.contains(&target.index));
390         None
391     }
392 
    /// Looks up the NAK SSA values previously recorded for `ssa`.
    ///
    /// Panics if the def has not been registered via `set_ssa` yet.
    fn get_ssa(&mut self, ssa: &nir_def) -> &[SSAValue] {
        self.ssa_map.get(&ssa.index).unwrap()
    }
396 
set_ssa(&mut self, def: &nir_def, vec: Vec<SSAValue>)397     fn set_ssa(&mut self, def: &nir_def, vec: Vec<SSAValue>) {
398         if def.bit_size == 1 {
399             for s in &vec {
400                 assert!(s.is_predicate());
401             }
402         } else {
403             for s in &vec {
404                 assert!(!s.is_predicate());
405             }
406             let bits =
407                 usize::from(def.bit_size) * usize::from(def.num_components);
408             assert!(vec.len() == bits.div_ceil(32));
409         }
410         self.ssa_map
411             .entry(def.index)
412             .and_modify(|_| panic!("Cannot set an SSA def twice"))
413             .or_insert(vec);
414     }
415 
get_ssa_comp(&mut self, def: &nir_def, c: u8) -> (SSARef, u8)416     fn get_ssa_comp(&mut self, def: &nir_def, c: u8) -> (SSARef, u8) {
417         let vec = self.get_ssa(def);
418         match def.bit_size {
419             1 => (vec[usize::from(c)].into(), 0),
420             8 => (vec[usize::from(c / 4)].into(), c % 4),
421             16 => (vec[usize::from(c / 2)].into(), (c * 2) % 4),
422             32 => (vec[usize::from(c)].into(), 0),
423             64 => {
424                 let comps =
425                     [vec[usize::from(c) * 2 + 0], vec[usize::from(c) * 2 + 1]];
426                 (comps.into(), 0)
427             }
428             _ => panic!("Unsupported bit size: {}", def.bit_size),
429         }
430     }
431 
    /// Bundles all SSA values of `src`'s def into a single `SSARef`.
    ///
    /// Panics if the def's value count does not fit in an `SSARef`.
    fn get_ssa_ref(&mut self, src: &nir_src) -> SSARef {
        SSARef::try_from(self.get_ssa(src.as_def())).unwrap()
    }
435 
get_src(&mut self, src: &nir_src) -> Src436     fn get_src(&mut self, src: &nir_src) -> Src {
437         self.get_ssa_ref(src).into()
438     }
439 
    /// Splits an I/O address into a base `Src` plus an immediate offset
    /// that fits in `imm_bits` bits, using NIR's address analysis.
    ///
    /// When the whole address folds into the immediate, the base is zero.
    fn get_io_addr_offset(
        &mut self,
        addr: &nir_src,
        imm_bits: u8,
    ) -> (Src, i32) {
        let addr = addr.as_def();
        // SAFETY: `addr` points at a live nir_def borrowed from the shader.
        // The cast to *mut only satisfies the C signature —
        // NOTE(review): presumed not to mutate the def; confirm against the
        // nak_get_io_addr_offset implementation.
        let addr_offset = unsafe {
            nak_get_io_addr_offset(addr as *const _ as *mut _, imm_bits)
        };

        if let Some(base_def) = std::ptr::NonNull::new(addr_offset.base.def) {
            // SAFETY: a non-null def returned by nak_get_io_addr_offset is
            // assumed to point at a valid nir_def within this shader.
            let base_def = unsafe { base_def.as_ref() };
            let base_comp = u8::try_from(addr_offset.base.comp).unwrap();
            let (base, _) = self.get_ssa_comp(base_def, base_comp);
            (base.into(), addr_offset.offset)
        } else {
            // No base left: the address reduced to the immediate alone.
            (SrcRef::Zero.into(), addr_offset.offset)
        }
    }
459 
get_cbuf_addr_offset(&mut self, addr: &nir_src) -> (Src, u16)460     fn get_cbuf_addr_offset(&mut self, addr: &nir_src) -> (Src, u16) {
461         let (off, off_imm) = self.get_io_addr_offset(addr, 16);
462         if let Ok(off_imm_u16) = u16::try_from(off_imm) {
463             (off, off_imm_u16)
464         } else {
465             (self.get_src(addr), 0)
466         }
467     }
468 
    /// Records `ssa` as the value of `def` (see `set_ssa`).
    fn set_dst(&mut self, def: &nir_def, ssa: SSARef) {
        self.set_ssa(def, (*ssa).into());
    }
472 
try_saturate_alu_dst(&mut self, def: &nir_def) -> bool473     fn try_saturate_alu_dst(&mut self, def: &nir_def) -> bool {
474         if def.all_uses_are_fsat() {
475             self.saturated.insert(def as *const _);
476             true
477         } else {
478             false
479         }
480     }
481 
alu_src_is_saturated(&self, src: &nir_alu_src) -> bool482     fn alu_src_is_saturated(&self, src: &nir_alu_src) -> bool {
483         self.saturated
484             .get(&(src.src.as_def() as *const _))
485             .is_some()
486     }
487 
parse_alu(&mut self, b: &mut impl SSABuilder, alu: &nir_alu_instr)488     fn parse_alu(&mut self, b: &mut impl SSABuilder, alu: &nir_alu_instr) {
489         // Handle vectors and pack ops as a special case since they're the only
490         // ALU ops that can produce more than 16B. They are also the only ALU
491         // ops which we allow to consume small (8 and 16-bit) vector data
492         // scattered across multiple dwords
493         match alu.op {
494             nir_op_mov
495             | nir_op_pack_32_4x8_split
496             | nir_op_pack_32_2x16_split
497             | nir_op_pack_64_2x32_split
498             | nir_op_vec2
499             | nir_op_vec3
500             | nir_op_vec4
501             | nir_op_vec5
502             | nir_op_vec8
503             | nir_op_vec16 => {
504                 let src_bit_size = alu.get_src(0).src.bit_size();
505                 let bits = usize::from(alu.def.num_components)
506                     * usize::from(alu.def.bit_size);
507 
508                 // Collect the sources into a vec with src_bit_size per SSA
509                 // value in the vec.  This implicitly makes 64-bit sources look
510                 // like two 32-bit values
511                 let mut srcs = Vec::new();
512                 if alu.op == nir_op_mov {
513                     let src = alu.get_src(0);
514                     for c in 0..alu.def.num_components {
515                         let s = src.swizzle[usize::from(c)];
516                         let (src, byte) =
517                             self.get_ssa_comp(src.src.as_def(), s);
518                         for ssa in src.iter() {
519                             srcs.push((*ssa, byte));
520                         }
521                     }
522                 } else {
523                     for src in alu.srcs_as_slice().iter() {
524                         let s = src.swizzle[0];
525                         let (src, byte) =
526                             self.get_ssa_comp(src.src.as_def(), s);
527                         for ssa in src.iter() {
528                             srcs.push((*ssa, byte));
529                         }
530                     }
531                 }
532 
533                 let mut comps = Vec::new();
534                 match src_bit_size {
535                     1 | 32 | 64 => {
536                         for (ssa, _) in srcs {
537                             comps.push(ssa);
538                         }
539                     }
540                     8 => {
541                         for dc in 0..bits.div_ceil(32) {
542                             let mut psrc = [Src::new_zero(); 4];
543                             let mut psel = [0_u8; 4];
544 
545                             for b in 0..4 {
546                                 let sc = dc * 4 + b;
547                                 if sc < srcs.len() {
548                                     let (ssa, byte) = srcs[sc];
549                                     for i in 0..4_u8 {
550                                         let psrc_i = &mut psrc[usize::from(i)];
551                                         if *psrc_i == Src::new_zero() {
552                                             *psrc_i = ssa.into();
553                                         } else if *psrc_i != Src::from(ssa) {
554                                             continue;
555                                         }
556                                         psel[b] = i * 4 + byte;
557                                     }
558                                 }
559                             }
560                             comps.push(b.prmt4(psrc, psel)[0]);
561                         }
562                     }
563                     16 => {
564                         for dc in 0..bits.div_ceil(32) {
565                             let mut psrc = [Src::new_zero(); 2];
566                             let mut psel = [0_u8; 4];
567 
568                             for w in 0..2 {
569                                 let sc = dc * 2 + w;
570                                 if sc < srcs.len() {
571                                     let (ssa, byte) = srcs[sc];
572                                     let w_u8 = u8::try_from(w).unwrap();
573                                     psrc[w] = ssa.into();
574                                     psel[w * 2 + 0] = (w_u8 * 4) + byte;
575                                     psel[w * 2 + 1] = (w_u8 * 4) + byte + 1;
576                                 }
577                             }
578                             comps.push(b.prmt(psrc[0], psrc[1], psel)[0]);
579                         }
580                     }
581                     _ => panic!("Unknown bit size: {src_bit_size}"),
582                 }
583 
584                 self.set_ssa(&alu.def, comps);
585                 return;
586             }
587             _ => (),
588         }
589 
590         let nir_srcs = alu.srcs_as_slice();
591         let mut srcs: Vec<Src> = Vec::new();
592         for (i, alu_src) in nir_srcs.iter().enumerate() {
593             let bit_size = alu_src.src.bit_size();
594             let comps = alu.src_components(i.try_into().unwrap());
595             let ssa = self.get_ssa(alu_src.src.as_def());
596 
597             match bit_size {
598                 1 => {
599                     assert!(comps == 1);
600                     let s = usize::from(alu_src.swizzle[0]);
601                     srcs.push(ssa[s].into());
602                 }
603                 8 | 16 => {
604                     let num_bytes = usize::from(comps * (bit_size / 8));
605                     assert!(num_bytes <= 4);
606 
607                     let mut bytes = [0_u8; 4];
608                     for c in 0..usize::from(comps) {
609                         let cs = alu_src.swizzle[c];
610                         if bit_size == 8 {
611                             bytes[c] = cs;
612                         } else {
613                             bytes[c * 2 + 0] = cs * 2 + 0;
614                             bytes[c * 2 + 1] = cs * 2 + 1;
615                         }
616                     }
617 
618                     let mut prmt_srcs = [Src::new_zero(); 4];
619                     let mut prmt = [0_u8; 4];
620                     for b in 0..num_bytes {
621                         for (ds, s) in prmt_srcs.iter_mut().enumerate() {
622                             let dw = ssa[usize::from(bytes[b] / 4)];
623                             if s.is_zero() {
624                                 *s = dw.into();
625                             } else if *s != Src::from(dw) {
626                                 continue;
627                             }
628                             prmt[usize::from(b)] =
629                                 (ds as u8) * 4 + (bytes[b] % 4);
630                             break;
631                         }
632                     }
633 
634                     srcs.push(b.prmt4(prmt_srcs, prmt).into());
635                 }
636                 32 => {
637                     assert!(comps == 1);
638                     let s = usize::from(alu_src.swizzle[0]);
639                     srcs.push(ssa[s].into());
640                 }
641                 64 => {
642                     assert!(comps == 1);
643                     let s = usize::from(alu_src.swizzle[0]);
644                     srcs.push([ssa[s * 2], ssa[s * 2 + 1]].into());
645                 }
646                 _ => panic!("Invalid bit size: {bit_size}"),
647             }
648         }
649 
650         // Restricts an F16v2 source to just x if the ALU op is single-component. This
651         // must only be called for per-component sources (see nir_op_info::output_sizes
652         // for more details).
653         let restrict_f16v2_src = |mut src: Src| {
654             if alu.def.num_components == 1 {
655                 src.src_swizzle = SrcSwizzle::Xx;
656             }
657             src
658         };
659 
660         let dst: SSARef = match alu.op {
661             nir_op_b2b1 => {
662                 assert!(alu.get_src(0).bit_size() == 32);
663                 b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], 0.into())
664             }
665             nir_op_b2b32 | nir_op_b2i8 | nir_op_b2i16 | nir_op_b2i32 => {
666                 b.sel(srcs[0].bnot(), 0.into(), 1.into())
667             }
668             nir_op_b2i64 => {
669                 let lo = b.sel(srcs[0].bnot(), 0.into(), 1.into());
670                 let hi = b.copy(0.into());
671                 [lo[0], hi[0]].into()
672             }
673             nir_op_b2f16 => b.sel(srcs[0].bnot(), 0.into(), 0x3c00.into()),
674             nir_op_b2f32 => {
675                 b.sel(srcs[0].bnot(), 0.0_f32.into(), 1.0_f32.into())
676             }
677             nir_op_b2f64 => {
678                 let lo = b.copy(0.into());
679                 let hi = b.sel(srcs[0].bnot(), 0.into(), 0x3ff00000.into());
680                 [lo[0], hi[0]].into()
681             }
682             nir_op_bcsel => b.sel(srcs[0], srcs[1], srcs[2]),
683             nir_op_bfm => {
684                 let dst = b.alloc_ssa(RegFile::GPR, 1);
685                 b.push_op(OpBMsk {
686                     dst: dst.into(),
687                     pos: srcs[1],
688                     width: srcs[0],
689                     wrap: true,
690                 });
691                 dst
692             }
693             nir_op_bit_count => {
694                 let dst = b.alloc_ssa(RegFile::GPR, 1);
695                 b.push_op(OpPopC {
696                     dst: dst.into(),
697                     src: srcs[0],
698                 });
699                 dst
700             }
701             nir_op_bitfield_reverse => b.brev(srcs[0]),
702             nir_op_ibitfield_extract | nir_op_ubitfield_extract => {
703                 let range = b.alloc_ssa(RegFile::GPR, 1);
704                 b.push_op(OpPrmt {
705                     dst: range.into(),
706                     srcs: [srcs[1], srcs[2]],
707                     sel: 0x0040.into(),
708                     mode: PrmtMode::Index,
709                 });
710 
711                 let dst = b.alloc_ssa(RegFile::GPR, 1);
712                 b.push_op(OpBfe {
713                     dst: dst.into(),
714                     base: srcs[0],
715                     signed: !matches!(alu.op, nir_op_ubitfield_extract),
716                     range: range.into(),
717                     reverse: false,
718                 });
719                 dst
720             }
            nir_op_extract_u8 | nir_op_extract_i8 | nir_op_extract_u16
            | nir_op_extract_i16 => {
                // The element index must be an immediate; the unwrap()
                // asserts that comp_as_uint() could fold it to a constant.
                let src1 = alu.get_src(1);
                let elem = src1.src.comp_as_uint(src1.swizzle[0]).unwrap();
                let elem = u8::try_from(elem).unwrap();

                // All four variants lower to a single PRMT (byte permute).
                // Selector values 0..=3 pick bytes of the first source and
                // 4..=7 pick bytes of the second source (constant 0 here,
                // so 4 yields a zero byte).  Setting bit 3 (| 0x8) in a
                // selector replicates the sign bit of the selected byte
                // instead, which provides the sign extension for the
                // signed variants.
                match alu.op {
                    nir_op_extract_u8 => {
                        assert!(elem < 4);
                        let byte = elem;
                        let zero = 4;
                        b.prmt(srcs[0], 0.into(), [byte, zero, zero, zero])
                    }
                    nir_op_extract_i8 => {
                        assert!(elem < 4);
                        let byte = elem;
                        // Replicate the byte's sign bit into bytes 1..3.
                        let sign = byte | 0x8;
                        b.prmt(srcs[0], 0.into(), [byte, sign, sign, sign])
                    }
                    nir_op_extract_u16 => {
                        assert!(elem < 2);
                        let byte = elem * 2;
                        let zero = 4;
                        b.prmt(srcs[0], 0.into(), [byte, byte + 1, zero, zero])
                    }
                    nir_op_extract_i16 => {
                        assert!(elem < 2);
                        let byte = elem * 2;
                        // Sign comes from the high byte of the 16-bit elem.
                        let sign = (byte + 1) | 0x8;
                        b.prmt(srcs[0], 0.into(), [byte, byte + 1, sign, sign])
                    }
                    _ => panic!("Unknown extract op: {}", alu.op),
                }
            }
755             nir_op_f2f16 | nir_op_f2f16_rtne | nir_op_f2f16_rtz
756             | nir_op_f2f32 | nir_op_f2f64 => {
757                 let src_bits = alu.get_src(0).src.bit_size();
758                 let dst_bits = alu.def.bit_size();
759                 let src_type = FloatType::from_bits(src_bits.into());
760                 let dst_type = FloatType::from_bits(dst_bits.into());
761 
762                 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
763                 b.push_op(OpF2F {
764                     dst: dst.into(),
765                     src: srcs[0],
766                     src_type: FloatType::from_bits(src_bits.into()),
767                     dst_type: dst_type,
768                     rnd_mode: match alu.op {
769                         nir_op_f2f16_rtne => FRndMode::NearestEven,
770                         nir_op_f2f16_rtz => FRndMode::Zero,
771                         _ => self.float_ctl[dst_type].rnd_mode,
772                     },
773                     ftz: if src_bits < dst_bits {
774                         self.float_ctl[src_type].ftz
775                     } else {
776                         self.float_ctl[dst_type].ftz
777                     },
778                     high: false,
779                     integer_rnd: false,
780                 });
781                 dst
782             }
            nir_op_find_lsb => {
                // There is no find-trailing-one instruction, so bit-reverse
                // the source and then use FLO; with return_shift_amount set
                // the result corresponds to the index of the original
                // least-significant set bit.
                let rev = b.brev(srcs[0]);
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpFlo {
                    dst: dst.into(),
                    src: rev.into(),
                    signed: false,
                    return_shift_amount: true,
                });
                dst
            }
            nir_op_f2i8 | nir_op_f2i16 | nir_op_f2i32 | nir_op_f2i64
            | nir_op_f2u8 | nir_op_f2u16 | nir_op_f2u32 | nir_op_f2u64 => {
                let src_bits = usize::from(alu.get_src(0).bit_size());
                let dst_bits = alu.def.bit_size();
                let src_type = FloatType::from_bits(src_bits);
                let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
                // Bit 1 of the ALU output type distinguishes signed from
                // unsigned here — NOTE(review): relies on the nir_alu_type
                // encoding (nir_type_int vs nir_type_uint); confirm against
                // the NIR headers.
                let dst_is_signed = alu.info().output_type & 2 != 0;
                let dst_type =
                    IntType::from_bits(dst_bits.into(), dst_is_signed);
                if b.sm() < 70 && dst_bits == 8 {
                    // F2I doesn't support 8-bit destinations pre-Volta,
                    // so convert to a 32-bit integer first and narrow
                    // with a saturating I2I.
                    let tmp = b.alloc_ssa(RegFile::GPR, 1);
                    let tmp_type = IntType::from_bits(32, dst_is_signed);
                    b.push_op(OpF2I {
                        dst: tmp.into(),
                        src: srcs[0],
                        src_type,
                        dst_type: tmp_type,
                        rnd_mode: FRndMode::Zero,
                        ftz: self.float_ctl[src_type].ftz,
                    });
                    b.push_op(OpI2I {
                        dst: dst.into(),
                        src: tmp.into(),
                        src_type: tmp_type,
                        dst_type,
                        saturate: true,
                        abs: false,
                        neg: false,
                    });
                } else {
                    // Float-to-int always truncates (round toward zero).
                    b.push_op(OpF2I {
                        dst: dst.into(),
                        src: srcs[0],
                        src_type,
                        dst_type,
                        rnd_mode: FRndMode::Zero,
                        ftz: self.float_ctl[src_type].ftz,
                    });
                }
                dst
            }
            nir_op_fabs | nir_op_fadd | nir_op_fneg => {
                // fabs and fneg are lowered to an add with negative zero,
                // applying the fabs/fneg source modifiers to the operand.
                let (x, y) = match alu.op {
                    nir_op_fabs => (Src::new_zero().fneg(), srcs[0].fabs()),
                    nir_op_fadd => (srcs[0], srcs[1]),
                    nir_op_fneg => (Src::new_zero().fneg(), srcs[0].fneg()),
                    _ => panic!("Unhandled case"),
                };
                let ftype = FloatType::from_bits(alu.def.bit_size().into());
                let dst;
                if alu.def.bit_size() == 64 {
                    // 64-bit results occupy two GPRs.
                    dst = b.alloc_ssa(RegFile::GPR, 2);
                    b.push_op(OpDAdd {
                        dst: dst.into(),
                        srcs: [x, y],
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                    });
                } else if alu.def.bit_size() == 32 {
                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpFAdd {
                        dst: dst.into(),
                        srcs: [x, y],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                        ftz: self.float_ctl[ftype].ftz,
                    });
                } else if alu.def.bit_size() == 16 {
                    // OpHAdd2 has no rounding-mode control, so it is only
                    // usable when round-to-nearest-even is required.
                    assert!(
                        self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
                    );

                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpHAdd2 {
                        dst: dst.into(),
                        srcs: [restrict_f16v2_src(x), restrict_f16v2_src(y)],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        ftz: self.float_ctl[ftype].ftz,
                        f32: false,
                    });
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
                dst
            }
            nir_op_fceil | nir_op_ffloor | nir_op_fround_even
            | nir_op_ftrunc => {
                // Round-to-integral is just a rounding-mode choice on a
                // same-type float "conversion".
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                let ty = FloatType::from_bits(alu.def.bit_size().into());
                let rnd_mode = match alu.op {
                    nir_op_fceil => FRndMode::PosInf,
                    nir_op_ffloor => FRndMode::NegInf,
                    nir_op_ftrunc => FRndMode::Zero,
                    nir_op_fround_even => FRndMode::NearestEven,
                    _ => unreachable!(),
                };
                let ftz = self.float_ctl[ty].ftz;
                if b.sm() >= 70 {
                    // Volta+ has a dedicated FRND instruction.
                    assert!(
                        alu.def.bit_size() == 32 || alu.def.bit_size() == 16
                    );
                    b.push_op(OpFRnd {
                        dst: dst.into(),
                        src: srcs[0],
                        src_type: ty,
                        dst_type: ty,
                        rnd_mode,
                        ftz,
                    });
                } else {
                    // Pre-Volta uses F2F with the integer-rounding bit set.
                    assert!(alu.def.bit_size() == 32);
                    b.push_op(OpF2F {
                        dst: dst.into(),
                        src: srcs[0],
                        src_type: ty,
                        dst_type: ty,
                        rnd_mode,
                        ftz,
                        integer_rnd: true,
                        high: false,
                    });
                }
                dst
            }
            nir_op_fcos => b.fcos(srcs[0]),
919             nir_op_feq | nir_op_fge | nir_op_flt | nir_op_fneu => {
920                 let src_type =
921                     FloatType::from_bits(alu.get_src(0).bit_size().into());
922                 let cmp_op = match alu.op {
923                     nir_op_feq => FloatCmpOp::OrdEq,
924                     nir_op_fge => FloatCmpOp::OrdGe,
925                     nir_op_flt => FloatCmpOp::OrdLt,
926                     nir_op_fneu => FloatCmpOp::UnordNe,
927                     _ => panic!("Usupported float comparison"),
928                 };
929 
930                 let dst = b.alloc_ssa(RegFile::Pred, alu.def.num_components);
931                 if alu.get_src(0).bit_size() == 64 {
932                     assert!(alu.def.num_components == 1);
933                     b.push_op(OpDSetP {
934                         dst: dst.into(),
935                         set_op: PredSetOp::And,
936                         cmp_op: cmp_op,
937                         srcs: [srcs[0], srcs[1]],
938                         accum: SrcRef::True.into(),
939                     });
940                 } else if alu.get_src(0).bit_size() == 32 {
941                     assert!(alu.def.num_components == 1);
942                     b.push_op(OpFSetP {
943                         dst: dst.into(),
944                         set_op: PredSetOp::And,
945                         cmp_op: cmp_op,
946                         srcs: [srcs[0], srcs[1]],
947                         accum: SrcRef::True.into(),
948                         ftz: self.float_ctl[src_type].ftz,
949                     });
950                 } else if alu.get_src(0).bit_size() == 16 {
951                     assert!(
952                         alu.def.num_components == 1
953                             || alu.def.num_components == 2
954                     );
955 
956                     let dsts = if alu.def.num_components == 2 {
957                         [dst[0].into(), dst[1].into()]
958                     } else {
959                         [dst[0].into(), Dst::None]
960                     };
961 
962                     b.push_op(OpHSetP2 {
963                         dsts,
964                         set_op: PredSetOp::And,
965                         cmp_op: cmp_op,
966                         srcs: [
967                             restrict_f16v2_src(srcs[0]),
968                             restrict_f16v2_src(srcs[1]),
969                         ],
970                         accum: SrcRef::True.into(),
971                         ftz: self.float_ctl[src_type].ftz,
972                         horizontal: false,
973                     });
974                 } else {
975                     panic!(
976                         "Unsupported float type: f{}",
977                         alu.get_src(0).bit_size()
978                     );
979                 }
980                 dst
981             }
            nir_op_fexp2 => b.fexp2(srcs[0]),
            nir_op_ffma => {
                let ftype = FloatType::from_bits(alu.def.bit_size().into());
                let dst;
                if alu.def.bit_size() == 64 {
                    // OpDFma carries no FTZ bit, so FTZ must be off for
                    // fp64.
                    debug_assert!(!self.float_ctl[ftype].ftz);
                    dst = b.alloc_ssa(RegFile::GPR, 2);
                    b.push_op(OpDFma {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1], srcs[2]],
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                    });
                } else if alu.def.bit_size() == 32 {
                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpFFma {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1], srcs[2]],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                        // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
                        // anyway so only set one of the two bits.
                        ftz: self.float_ctl[ftype].ftz,
                        dnz: false,
                    });
                } else if alu.def.bit_size() == 16 {
                    // OpHFma2 has no rounding-mode control, so it is only
                    // usable when round-to-nearest-even is required.
                    assert!(
                        self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
                    );

                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpHFma2 {
                        dst: dst.into(),
                        srcs: [
                            restrict_f16v2_src(srcs[0]),
                            restrict_f16v2_src(srcs[1]),
                            restrict_f16v2_src(srcs[2]),
                        ],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        ftz: self.float_ctl[ftype].ftz,
                        dnz: false,
                        f32: false,
                    });
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
                dst
            }
            nir_op_ffmaz => {
                // Same as ffma but with the DNZ (denorm-flushing multiply)
                // bit instead of FTZ.
                assert!(alu.def.bit_size() == 32);
                // DNZ implies FTZ so we need FTZ set or this is invalid
                assert!(self.float_ctl.fp32.ftz);
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpFFma {
                    dst: dst.into(),
                    srcs: [srcs[0], srcs[1], srcs[2]],
                    saturate: self.try_saturate_alu_dst(&alu.def),
                    rnd_mode: self.float_ctl.fp32.rnd_mode,
                    // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
                    // anyway so only set one of the two bits.
                    ftz: false,
                    dnz: true,
                });
                dst
            }
            nir_op_flog2 => {
                // MUFU handles the special-function ops; fp32 only.
                assert!(alu.def.bit_size() == 32);
                b.mufu(MuFuOp::Log2, srcs[0])
            }
            nir_op_fmax | nir_op_fmin => {
                let dst;
                if alu.def.bit_size() == 64 {
                    dst = b.alloc_ssa(RegFile::GPR, 2);
                    b.push_op(OpDMnMx {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1]],
                        // MNMX takes min-vs-max as a (predicate) source.
                        min: (alu.op == nir_op_fmin).into(),
                    });
                } else if alu.def.bit_size() == 32 {
                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpFMnMx {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1]],
                        min: (alu.op == nir_op_fmin).into(),
                        ftz: self.float_ctl.fp32.ftz,
                    });
                } else if alu.def.bit_size() == 16 {
                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpHMnMx2 {
                        dst: dst.into(),
                        srcs: [
                            restrict_f16v2_src(srcs[0]),
                            restrict_f16v2_src(srcs[1]),
                        ],
                        min: (alu.op == nir_op_fmin).into(),
                        ftz: self.float_ctl.fp16.ftz,
                    });
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
                dst
            }
            nir_op_fmul => {
                let ftype = FloatType::from_bits(alu.def.bit_size().into());
                let dst;
                if alu.def.bit_size() == 64 {
                    // OpDMul carries no FTZ bit, so FTZ must be off for
                    // fp64.
                    debug_assert!(!self.float_ctl[ftype].ftz);
                    dst = b.alloc_ssa(RegFile::GPR, 2);
                    b.push_op(OpDMul {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1]],
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                    });
                } else if alu.def.bit_size() == 32 {
                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpFMul {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1]],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                        ftz: self.float_ctl[ftype].ftz,
                        dnz: false,
                    });
                } else if alu.def.bit_size() == 16 {
                    // OpHMul2 has no rounding-mode control, so it is only
                    // usable when round-to-nearest-even is required.
                    assert!(
                        self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
                    );

                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpHMul2 {
                        dst: dst.into(),
                        srcs: [
                            restrict_f16v2_src(srcs[0]),
                            restrict_f16v2_src(srcs[1]),
                        ],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        ftz: self.float_ctl[ftype].ftz,
                        dnz: false,
                    });
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
                dst
            }
            nir_op_fmulz => {
                // Same as 32-bit fmul but with the DNZ bit instead of FTZ.
                assert!(alu.def.bit_size() == 32);
                // DNZ implies FTZ so we need FTZ set or this is invalid
                assert!(self.float_ctl.fp32.ftz);
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpFMul {
                    dst: dst.into(),
                    srcs: [srcs[0], srcs[1]],
                    saturate: self.try_saturate_alu_dst(&alu.def),
                    rnd_mode: self.float_ctl.fp32.rnd_mode,
                    // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
                    // anyway so only set one of the two bits.
                    ftz: false,
                    dnz: true,
                });
                dst
            }
            nir_op_fquantize2f16 => {
                // Quantize by a round trip: f32 -> f16 -> f32, with FTZ
                // set in both directions so f16 denorms become zero.
                let tmp = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpF2F {
                    dst: tmp.into(),
                    src: srcs[0],
                    src_type: FloatType::F32,
                    dst_type: FloatType::F16,
                    rnd_mode: FRndMode::NearestEven,
                    ftz: true,
                    high: false,
                    integer_rnd: false,
                });
                assert!(alu.def.bit_size() == 32);
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpF2F {
                    dst: dst.into(),
                    src: tmp.into(),
                    src_type: FloatType::F16,
                    dst_type: FloatType::F32,
                    rnd_mode: FRndMode::NearestEven,
                    ftz: true,
                    high: false,
                    integer_rnd: false,
                });
                if b.sm() < 70 {
                    // Pre-Volta, F2F.ftz doesn't flush denorms so we need to do
                    // that manually.  0x38800000 is 2^-14 (the smallest
                    // normal f16) encoded as an f32.
                    let denorm = b.fsetp(
                        FloatCmpOp::OrdLt,
                        srcs[0].fabs(),
                        0x38800000.into(),
                    );
                    // Get the correctly signed zero by masking off
                    // everything but the f32 sign bit.
                    let zero =
                        b.lop2(LogicOp2::And, srcs[0], 0x80000000.into());
                    b.sel(denorm.into(), zero.into(), dst.into())
                } else {
                    dst
                }
            }
            nir_op_frcp => {
                // MUFU handles the special-function ops; fp32 only.
                assert!(alu.def.bit_size() == 32);
                b.mufu(MuFuOp::Rcp, srcs[0])
            }
            nir_op_frsq => {
                assert!(alu.def.bit_size() == 32);
                b.mufu(MuFuOp::Rsq, srcs[0])
            }
            nir_op_fsat => {
                let ftype = FloatType::from_bits(alu.def.bit_size().into());

                if self.alu_src_is_saturated(&alu.srcs_as_slice()[0]) {
                    // The producing op already saturated its result, so
                    // fsat degenerates to a copy.
                    b.copy(srcs[0])
                } else if alu.def.bit_size() == 32 {
                    // Saturate via an add with +0.
                    let dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpFAdd {
                        dst: dst.into(),
                        srcs: [srcs[0], 0.into()],
                        saturate: true,
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                        ftz: self.float_ctl[ftype].ftz,
                    });
                    dst
                } else if alu.def.bit_size() == 16 {
                    // OpHAdd2 has no rounding-mode control, so it is only
                    // usable when round-to-nearest-even is required.
                    assert!(
                        self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
                    );

                    let dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpHAdd2 {
                        dst: dst.into(),
                        srcs: [restrict_f16v2_src(srcs[0]), 0.into()],
                        saturate: true,
                        ftz: self.float_ctl[ftype].ftz,
                        f32: false,
                    });
                    dst
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
            }
            nir_op_fsign => {
                if alu.def.bit_size() == 64 {
                    // Select the high word of -1.0 (0xbff00000) or +1.0
                    // (0x3ff00000) based on the <0 / >0 predicates; the
                    // low word of all three possible results is zero.
                    let lz = b.dsetp(FloatCmpOp::OrdLt, srcs[0], 0.into());
                    let gz = b.dsetp(FloatCmpOp::OrdGt, srcs[0], 0.into());
                    let hi = b.sel(lz.into(), 0xbff00000.into(), 0.into());
                    let hi = b.sel(gz.into(), 0x3ff00000.into(), hi.into());
                    let lo = b.copy(0.into());
                    [lo[0], hi[0]].into()
                } else if alu.def.bit_size() == 32 {
                    // FSET produces 1.0 for true and 0.0 for false, so
                    // gz - lz yields +1.0, -1.0, or 0.0 as appropriate.
                    let lz = b.fset(FloatCmpOp::OrdLt, srcs[0], 0.into());
                    let gz = b.fset(FloatCmpOp::OrdGt, srcs[0], 0.into());
                    b.fadd(gz.into(), Src::from(lz).fneg())
                } else if alu.def.bit_size() == 16 {
                    // Same gz - lz trick on packed f16v2.
                    let x = restrict_f16v2_src(srcs[0]);

                    let lz = restrict_f16v2_src(
                        b.hset2(FloatCmpOp::OrdLt, x, 0.into()).into(),
                    );
                    let gz = restrict_f16v2_src(
                        b.hset2(FloatCmpOp::OrdGt, x, 0.into()).into(),
                    );

                    b.hadd2(gz, lz.fneg())
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
            }
            nir_op_fsin => b.fsin(srcs[0]),
            nir_op_fsqrt => b.mufu(MuFuOp::Sqrt, srcs[0]),
1252             nir_op_i2f16 | nir_op_i2f32 | nir_op_i2f64 => {
1253                 let src_bits = alu.get_src(0).src.bit_size();
1254                 let dst_bits = alu.def.bit_size();
1255                 let dst_type = FloatType::from_bits(dst_bits.into());
1256                 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
1257                 b.push_op(OpI2F {
1258                     dst: dst.into(),
1259                     src: srcs[0],
1260                     dst_type: dst_type,
1261                     src_type: IntType::from_bits(src_bits.into(), true),
1262                     rnd_mode: self.float_ctl[dst_type].rnd_mode,
1263                 });
1264                 dst
1265             }
            nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32 | nir_op_i2i64
            | nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32 | nir_op_u2u64 => {
                let src_bits = alu.get_src(0).src.bit_size();
                let dst_bits = alu.def.bit_size();

                // Integer resizes are byte permutes.  Build an 8-byte
                // selector: source bytes are passed through, and the
                // remaining bytes are filled either with a sign-replicated
                // top source byte (| 0x8 selects PRMT's sign mode) or with
                // selector 4, the low byte of the second PRMT source,
                // which is the constant zero below.
                let mut prmt = [0_u8; 8];
                match alu.op {
                    nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32
                    | nir_op_i2i64 => {
                        // Sign-extend from the top byte of the source.
                        let sign = ((src_bits / 8) - 1) | 0x8;
                        for i in 0..8 {
                            if i < (src_bits / 8) {
                                prmt[usize::from(i)] = i;
                            } else {
                                prmt[usize::from(i)] = sign;
                            }
                        }
                    }
                    nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32
                    | nir_op_u2u64 => {
                        // Zero-extend.
                        for i in 0..8 {
                            if i < (src_bits / 8) {
                                prmt[usize::from(i)] = i;
                            } else {
                                prmt[usize::from(i)] = 4;
                            }
                        }
                    }
                    _ => panic!("Invalid integer conversion: {}", alu.op),
                }
                let prmt_lo: [u8; 4] = prmt[0..4].try_into().unwrap();
                let prmt_hi: [u8; 4] = prmt[4..8].try_into().unwrap();

                let src = srcs[0].as_ssa().unwrap();
                if src_bits == 64 {
                    if dst_bits == 64 {
                        // 64 -> 64 is a no-op.
                        *src
                    } else {
                        b.prmt(src[0].into(), src[1].into(), prmt_lo)
                    }
                } else {
                    if dst_bits == 64 {
                        // Two permutes: one for each 32-bit half.
                        let lo = b.prmt(src[0].into(), 0.into(), prmt_lo);
                        let hi = b.prmt(src[0].into(), 0.into(), prmt_hi);
                        [lo[0], hi[0]].into()
                    } else {
                        b.prmt(src[0].into(), 0.into(), prmt_lo)
                    }
                }
            }
            nir_op_iabs => b.iabs(srcs[0]),
            nir_op_iadd => match alu.def.bit_size {
                32 => b.iadd(srcs[0], srcs[1], 0.into()),
                64 => b.iadd64(srcs[0], srcs[1], 0.into()),
                x => panic!("unsupported bit size for nir_op_iadd: {x}"),
            },
            // iadd3 reuses the same three-source helpers with a real third
            // source instead of zero.
            nir_op_iadd3 => match alu.def.bit_size {
                32 => b.iadd(srcs[0], srcs[1], srcs[2]),
                64 => b.iadd64(srcs[0], srcs[1], srcs[2]),
                x => panic!("unsupported bit size for nir_op_iadd3: {x}"),
            },
            nir_op_iand => b.lop2(LogicOp2::And, srcs[0], srcs[1]),
            nir_op_ieq => {
                if alu.get_src(0).bit_size() == 1 {
                    // 1-bit (boolean/predicate) equality is XNOR.
                    b.lop2(LogicOp2::Xor, srcs[0], srcs[1].bnot())
                } else if alu.get_src(0).bit_size() == 64 {
                    b.isetp64(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
                } else {
                    assert!(alu.get_src(0).bit_size() == 32);
                    b.isetp(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
                }
            }
1338             nir_op_ifind_msb | nir_op_ifind_msb_rev | nir_op_ufind_msb
1339             | nir_op_ufind_msb_rev => {
1340                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1341                 b.push_op(OpFlo {
1342                     dst: dst.into(),
1343                     src: srcs[0],
1344                     signed: match alu.op {
1345                         nir_op_ifind_msb | nir_op_ifind_msb_rev => true,
1346                         nir_op_ufind_msb | nir_op_ufind_msb_rev => false,
1347                         _ => panic!("Not a find_msb op"),
1348                     },
1349                     return_shift_amount: match alu.op {
1350                         nir_op_ifind_msb | nir_op_ufind_msb => false,
1351                         nir_op_ifind_msb_rev | nir_op_ufind_msb_rev => true,
1352                         _ => panic!("Not a find_msb op"),
1353                     },
1354                 });
1355                 dst
1356             }
            nir_op_ige | nir_op_ilt | nir_op_uge | nir_op_ult => {
                // as_ssa() returns None for sources carrying modifiers or
                // immediates; the unwrap asserts these are plain SSA values.
                let x = *srcs[0].as_ssa().unwrap();
                let y = *srcs[1].as_ssa().unwrap();
                let (cmp_type, cmp_op) = match alu.op {
                    nir_op_ige => (IntCmpType::I32, IntCmpOp::Ge),
                    nir_op_ilt => (IntCmpType::I32, IntCmpOp::Lt),
                    nir_op_uge => (IntCmpType::U32, IntCmpOp::Ge),
                    nir_op_ult => (IntCmpType::U32, IntCmpOp::Lt),
                    _ => panic!("Not an integer comparison"),
                };
                if alu.get_src(0).bit_size() == 64 {
                    b.isetp64(cmp_type, cmp_op, x.into(), y.into())
                } else {
                    assert!(alu.get_src(0).bit_size() == 32);
                    b.isetp(cmp_type, cmp_op, x.into(), y.into())
                }
            }
            nir_op_imad => {
                // 32-bit integer multiply-add: srcs[0] * srcs[1] + srcs[2].
                // Signedness doesn't affect the low 32 bits of the product.
                assert!(alu.def.bit_size() == 32);
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpIMad {
                    dst: dst.into(),
                    srcs: [srcs[0], srcs[1], srcs[2]],
                    signed: false,
                });
                dst
            }
            nir_op_imax | nir_op_imin | nir_op_umax | nir_op_umin => {
                // IMNMX takes min-vs-max as a predicate source.
                let (tp, min) = match alu.op {
                    nir_op_imax => (IntCmpType::I32, SrcRef::False),
                    nir_op_imin => (IntCmpType::I32, SrcRef::True),
                    nir_op_umax => (IntCmpType::U32, SrcRef::False),
                    nir_op_umin => (IntCmpType::U32, SrcRef::True),
                    _ => panic!("Not an integer min/max"),
                };
                assert!(alu.def.bit_size() == 32);
                b.imnmx(tp, srcs[0], srcs[1], min.into())
            }
            nir_op_imul => {
                assert!(alu.def.bit_size() == 32);
                b.imul(srcs[0], srcs[1])
            }
            nir_op_imul_2x32_64 | nir_op_umul_2x32_64 => {
                // Full 32x32 -> 64-bit multiply.
                let signed = alu.op == nir_op_imul_2x32_64;
                b.imul_2x32_64(srcs[0], srcs[1], signed)
            }
            nir_op_imul_high | nir_op_umul_high => {
                // High 32 bits of the full 64-bit product.
                let signed = alu.op == nir_op_imul_high;
                let dst64 = b.imul_2x32_64(srcs[0], srcs[1], signed);
                dst64[1].into()
            }
            nir_op_ine => {
                if alu.get_src(0).bit_size() == 1 {
                    // 1-bit (boolean/predicate) inequality is XOR.
                    b.lop2(LogicOp2::Xor, srcs[0], srcs[1])
                } else if alu.get_src(0).bit_size() == 64 {
                    b.isetp64(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
                } else {
                    assert!(alu.get_src(0).bit_size() == 32);
                    b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
                }
            }
            nir_op_ineg => {
                if alu.def.bit_size == 64 {
                    b.ineg64(srcs[0])
                } else {
                    assert!(alu.def.bit_size() == 32);
                    b.ineg(srcs[0])
                }
            }
            nir_op_inot => {
                // Lowered as LOP2 pass-through of the inverted B operand.
                if alu.def.bit_size() == 1 {
                    b.lop2(LogicOp2::PassB, true.into(), srcs[0].bnot())
                } else {
                    assert!(alu.def.bit_size() == 32);
                    b.lop2(LogicOp2::PassB, 0.into(), srcs[0].bnot())
                }
            }
1434             nir_op_ior => b.lop2(LogicOp2::Or, srcs[0], srcs[1]),
1435             nir_op_ishl => {
1436                 if alu.def.bit_size() == 64 {
1437                     let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1438                         (s as u32).into()
1439                     } else {
1440                         srcs[1]
1441                     };
1442                     b.shl64(srcs[0], shift)
1443                 } else {
1444                     assert!(alu.def.bit_size() == 32);
1445                     b.shl(srcs[0], srcs[1])
1446                 }
1447             }
1448             nir_op_ishr => {
1449                 if alu.def.bit_size() == 64 {
1450                     let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1451                         (s as u32).into()
1452                     } else {
1453                         srcs[1]
1454                     };
1455                     b.shr64(srcs[0], shift, true)
1456                 } else {
1457                     assert!(alu.def.bit_size() == 32);
1458                     b.shr(srcs[0], srcs[1], true)
1459                 }
1460             }
1461             nir_op_isub => match alu.def.bit_size {
1462                 32 => b.iadd(srcs[0], srcs[1].ineg(), 0.into()),
1463                 64 => b.iadd64(srcs[0], srcs[1].ineg(), 0.into()),
1464                 x => panic!("unsupported bit size for nir_op_iadd: {x}"),
1465             },
1466             nir_op_ixor => b.lop2(LogicOp2::Xor, srcs[0], srcs[1]),
1467             nir_op_pack_half_2x16_split | nir_op_pack_half_2x16_rtz_split => {
1468                 assert!(alu.get_src(0).bit_size() == 32);
1469 
1470                 let rnd_mode = match alu.op {
1471                     nir_op_pack_half_2x16_split => FRndMode::NearestEven,
1472                     nir_op_pack_half_2x16_rtz_split => FRndMode::Zero,
1473                     _ => panic!("Unhandled fp16 pack op"),
1474                 };
1475 
1476                 if self.sm.sm() >= 86 {
1477                     let result: SSARef = b.alloc_ssa(RegFile::GPR, 1);
1478                     b.push_op(OpF2FP {
1479                         dst: result.into(),
1480                         srcs: [srcs[1], srcs[0]],
1481                         rnd_mode: rnd_mode,
1482                     });
1483 
1484                     result
1485                 } else {
1486                     let low = b.alloc_ssa(RegFile::GPR, 1);
1487                     let high = b.alloc_ssa(RegFile::GPR, 1);
1488 
1489                     b.push_op(OpF2F {
1490                         dst: low.into(),
1491                         src: srcs[0],
1492                         src_type: FloatType::F32,
1493                         dst_type: FloatType::F16,
1494                         rnd_mode: rnd_mode,
1495                         ftz: false,
1496                         high: false,
1497                         integer_rnd: false,
1498                     });
1499 
1500                     let src_bits = usize::from(alu.get_src(1).bit_size());
1501                     let src_type = FloatType::from_bits(src_bits);
1502                     assert!(matches!(src_type, FloatType::F32));
1503                     b.push_op(OpF2F {
1504                         dst: high.into(),
1505                         src: srcs[1],
1506                         src_type: FloatType::F32,
1507                         dst_type: FloatType::F16,
1508                         rnd_mode: rnd_mode,
1509                         ftz: false,
1510                         high: false,
1511                         integer_rnd: false,
1512                     });
1513 
1514                     b.prmt(low.into(), high.into(), [0, 1, 4, 5])
1515                 }
1516             }
1517             nir_op_prmt_nv => {
1518                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1519                 b.push_op(OpPrmt {
1520                     dst: dst.into(),
1521                     srcs: [srcs[1], srcs[2]],
1522                     sel: srcs[0],
1523                     mode: PrmtMode::Index,
1524                 });
1525                 dst
1526             }
1527             nir_op_sdot_4x8_iadd => {
1528                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1529                 b.push_op(OpIDp4 {
1530                     dst: dst.into(),
1531                     src_types: [IntType::I8, IntType::I8],
1532                     srcs: [srcs[0], srcs[1], srcs[2]],
1533                 });
1534                 dst
1535             }
1536             nir_op_sudot_4x8_iadd => {
1537                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1538                 b.push_op(OpIDp4 {
1539                     dst: dst.into(),
1540                     src_types: [IntType::I8, IntType::U8],
1541                     srcs: [srcs[0], srcs[1], srcs[2]],
1542                 });
1543                 dst
1544             }
1545             nir_op_udot_4x8_uadd => {
1546                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1547                 b.push_op(OpIDp4 {
1548                     dst: dst.into(),
1549                     src_types: [IntType::U8, IntType::U8],
1550                     srcs: [srcs[0], srcs[1], srcs[2]],
1551                 });
1552                 dst
1553             }
1554             nir_op_u2f16 | nir_op_u2f32 | nir_op_u2f64 => {
1555                 let src_bits = alu.get_src(0).src.bit_size();
1556                 let dst_bits = alu.def.bit_size();
1557                 let dst_type = FloatType::from_bits(dst_bits.into());
1558                 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
1559                 b.push_op(OpI2F {
1560                     dst: dst.into(),
1561                     src: srcs[0],
1562                     dst_type: dst_type,
1563                     src_type: IntType::from_bits(src_bits.into(), false),
1564                     rnd_mode: self.float_ctl[dst_type].rnd_mode,
1565                 });
1566                 dst
1567             }
1568             nir_op_uadd_sat => {
1569                 let x = srcs[0].as_ssa().unwrap();
1570                 let y = srcs[1].as_ssa().unwrap();
1571                 let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
1572                 let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
1573                 b.push_op(OpIAdd3 {
1574                     dst: sum_lo.into(),
1575                     overflow: [ovf_lo.into(), Dst::None],
1576                     srcs: [0.into(), x[0].into(), y[0].into()],
1577                 });
1578                 if alu.def.bit_size() == 64 {
1579                     let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
1580                     let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
1581                     b.push_op(OpIAdd3X {
1582                         dst: sum_hi.into(),
1583                         overflow: [ovf_hi.into(), Dst::None],
1584                         srcs: [0.into(), x[1].into(), y[1].into()],
1585                         carry: [ovf_lo.into(), false.into()],
1586                     });
1587                     let lo =
1588                         b.sel(ovf_hi.into(), u32::MAX.into(), sum_lo.into());
1589                     let hi =
1590                         b.sel(ovf_hi.into(), u32::MAX.into(), sum_hi.into());
1591                     [lo[0], hi[0]].into()
1592                 } else {
1593                     assert!(alu.def.bit_size() == 32);
1594                     b.sel(ovf_lo.into(), u32::MAX.into(), sum_lo.into())
1595                 }
1596             }
1597             nir_op_usub_sat => {
1598                 let x = srcs[0].as_ssa().unwrap();
1599                 let y = srcs[1].as_ssa().unwrap();
1600                 let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
1601                 let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
1602                 // The result of OpIAdd3X is the 33-bit value
1603                 //
1604                 //  s|o = x + !y + 1
1605                 //
1606                 // The overflow bit of this result is true if and only if the
1607                 // subtract did NOT overflow.
1608                 b.push_op(OpIAdd3 {
1609                     dst: sum_lo.into(),
1610                     overflow: [ovf_lo.into(), Dst::None],
1611                     srcs: [0.into(), x[0].into(), Src::from(y[0]).ineg()],
1612                 });
1613                 if alu.def.bit_size() == 64 {
1614                     let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
1615                     let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
1616                     b.push_op(OpIAdd3X {
1617                         dst: sum_hi.into(),
1618                         overflow: [ovf_hi.into(), Dst::None],
1619                         srcs: [0.into(), x[1].into(), Src::from(y[1]).bnot()],
1620                         carry: [ovf_lo.into(), false.into()],
1621                     });
1622                     let lo = b.sel(ovf_hi.into(), sum_lo.into(), 0.into());
1623                     let hi = b.sel(ovf_hi.into(), sum_hi.into(), 0.into());
1624                     [lo[0], hi[0]].into()
1625                 } else {
1626                     assert!(alu.def.bit_size() == 32);
1627                     b.sel(ovf_lo.into(), sum_lo.into(), 0.into())
1628                 }
1629             }
1630             nir_op_unpack_32_2x16_split_x => {
1631                 b.prmt(srcs[0], 0.into(), [0, 1, 4, 4])
1632             }
1633             nir_op_unpack_32_2x16_split_y => {
1634                 b.prmt(srcs[0], 0.into(), [2, 3, 4, 4])
1635             }
1636             nir_op_unpack_64_2x32_split_x => {
1637                 let src0_x = srcs[0].as_ssa().unwrap()[0];
1638                 b.copy(src0_x.into())
1639             }
1640             nir_op_unpack_64_2x32_split_y => {
1641                 let src0_y = srcs[0].as_ssa().unwrap()[1];
1642                 b.copy(src0_y.into())
1643             }
1644             nir_op_unpack_half_2x16_split_x
1645             | nir_op_unpack_half_2x16_split_y => {
1646                 assert!(alu.def.bit_size() == 32);
1647                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1648 
1649                 b.push_op(OpF2F {
1650                     dst: dst[0].into(),
1651                     src: srcs[0],
1652                     src_type: FloatType::F16,
1653                     dst_type: FloatType::F32,
1654                     rnd_mode: FRndMode::NearestEven,
1655                     ftz: false,
1656                     high: alu.op == nir_op_unpack_half_2x16_split_y,
1657                     integer_rnd: false,
1658                 });
1659 
1660                 dst
1661             }
1662             nir_op_ushr => {
1663                 if alu.def.bit_size() == 64 {
1664                     let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1665                         (s as u32).into()
1666                     } else {
1667                         srcs[1]
1668                     };
1669                     b.shr64(srcs[0], shift, false)
1670                 } else {
1671                     assert!(alu.def.bit_size() == 32);
1672                     b.shr(srcs[0], srcs[1], false)
1673                 }
1674             }
1675             _ => panic!("Unsupported ALU instruction: {}", alu.info().name()),
1676         };
1677         self.set_dst(&alu.def, dst);
1678     }
1679 
    // Translates a single NIR texture instruction into NAK texture ops
    // (OpTex/OpTld/OpTld4/OpTxd/OpTmml/OpTxq) and wires the hardware
    // results back to the NIR destination components.
    fn parse_tex(&mut self, b: &mut impl SSABuilder, tex: &nir_tex_instr) {
        // Map the GLSL sampler dimension (plus arrayness) onto a hardware
        // texture dimension.
        let dim = match tex.sampler_dim {
            GLSL_SAMPLER_DIM_1D => {
                if tex.is_array {
                    TexDim::Array1D
                } else {
                    TexDim::_1D
                }
            }
            GLSL_SAMPLER_DIM_2D => {
                if tex.is_array {
                    TexDim::Array2D
                } else {
                    TexDim::_2D
                }
            }
            GLSL_SAMPLER_DIM_3D => {
                assert!(!tex.is_array);
                TexDim::_3D
            }
            GLSL_SAMPLER_DIM_CUBE => {
                if tex.is_array {
                    TexDim::ArrayCube
                } else {
                    TexDim::Cube
                }
            }
            // Buffer textures are fetched as 1D.
            GLSL_SAMPLER_DIM_BUF => TexDim::_1D,
            // Multisampled textures use the 2D dimension; the sample index
            // is carried in the source payload.
            GLSL_SAMPLER_DIM_MS => {
                if tex.is_array {
                    TexDim::Array2D
                } else {
                    TexDim::_2D
                }
            }
            _ => panic!("Unsupported texture dimension: {}", tex.sampler_dim),
        };

        // By this point a NAK NIR pass has packed every texture source into
        // at most two backend source vectors (backend1, backend2).
        let srcs = tex.srcs_as_slice();
        assert!(srcs[0].src_type == nir_tex_src_backend1);
        if srcs.len() > 1 {
            assert!(srcs.len() == 2);
            assert!(srcs[1].src_type == nir_tex_src_backend2);
        }

        // NOTE(review): assumes tex.backend_flags was written with the bit
        // layout of nak_nir_tex_flags by NAK's NIR lowering — confirm
        // against the lowering pass.
        let flags: nak_nir_tex_flags =
            unsafe { std::mem::transmute_copy(&tex.backend_flags) };

        let mask = tex.def.components_read();
        let mut mask = u8::try_from(mask).unwrap();
        if flags.is_sparse() {
            // For sparse ops the last NIR component is the residency value,
            // which comes from the fault predicate rather than the texture
            // data, so drop it from the data mask.
            mask &= !(1 << (tex.def.num_components - 1));
        }

        // Only the components actually read get hardware destination regs.
        let dst_comps = u8::try_from(mask.count_ones()).unwrap();
        let dst = b.alloc_ssa(RegFile::GPR, dst_comps);

        // On Volta and later, the destination is split in two
        let mut dsts = [Dst::None; 2];
        if dst_comps > 2 && b.sm() >= 70 {
            dsts[0] = SSARef::try_from(&dst[0..2]).unwrap().into();
            dsts[1] = SSARef::try_from(&dst[2..]).unwrap().into();
        } else {
            dsts[0] = dst.into();
        }

        // Sparse ops additionally produce a residency predicate.
        let fault = if flags.is_sparse() {
            b.alloc_ssa(RegFile::Pred, 1).into()
        } else {
            Dst::None
        };

        if tex.op == nir_texop_hdr_dim_nv {
            // Texture header query: image dimensions.
            let src = self.get_src(&srcs[0].src);
            assert!(fault.is_none());
            b.push_op(OpTxq {
                dsts: dsts,
                src: src,
                query: TexQuery::Dimension,
                mask: mask,
            });
        } else if tex.op == nir_texop_tex_type_nv {
            // Texture header query: texture type.
            let src = self.get_src(&srcs[0].src);
            assert!(fault.is_none());
            b.push_op(OpTxq {
                dsts: dsts,
                src: src,
                query: TexQuery::TextureType,
                mask: mask,
            });
        } else {
            // Decode LOD and offset modes from the backend flags; invalid
            // encodings are compiler bugs.
            let lod_mode = match flags.lod_mode() {
                NAK_NIR_LOD_MODE_AUTO => TexLodMode::Auto,
                NAK_NIR_LOD_MODE_ZERO => TexLodMode::Zero,
                NAK_NIR_LOD_MODE_BIAS => TexLodMode::Bias,
                NAK_NIR_LOD_MODE_LOD => TexLodMode::Lod,
                NAK_NIR_LOD_MODE_CLAMP => TexLodMode::Clamp,
                NAK_NIR_LOD_MODE_BIAS_CLAMP => TexLodMode::BiasClamp,
                _ => panic!("Invalid LOD mode"),
            };

            let offset_mode = match flags.offset_mode() {
                NAK_NIR_OFFSET_MODE_NONE => Tld4OffsetMode::None,
                NAK_NIR_OFFSET_MODE_AOFFI => Tld4OffsetMode::AddOffI,
                NAK_NIR_OFFSET_MODE_PER_PX => Tld4OffsetMode::PerPx,
                _ => panic!("Invalid offset mode"),
            };

            // NOTE(review): this indexes srcs[1] unconditionally, so these
            // ops presumably always carry both backend sources — confirm in
            // the lowering pass.
            let srcs = [self.get_src(&srcs[0].src), self.get_src(&srcs[1].src)];

            if tex.op == nir_texop_txd {
                // Explicit-derivative sample.
                assert!(lod_mode == TexLodMode::Auto);
                assert!(offset_mode != Tld4OffsetMode::PerPx);
                assert!(!flags.has_z_cmpr());
                b.push_op(OpTxd {
                    dsts: dsts,
                    fault,
                    srcs: srcs,
                    dim: dim,
                    offset: offset_mode == Tld4OffsetMode::AddOffI,
                    mask: mask,
                });
            } else if tex.op == nir_texop_lod {
                // LOD query.
                assert!(offset_mode == Tld4OffsetMode::None);
                b.push_op(OpTmml {
                    dsts: dsts,
                    srcs: srcs,
                    dim: dim,
                    mask: mask,
                });
            } else if tex.op == nir_texop_txf || tex.op == nir_texop_txf_ms {
                // Texel fetch (optionally multisampled).
                assert!(offset_mode != Tld4OffsetMode::PerPx);
                b.push_op(OpTld {
                    dsts: dsts,
                    fault,
                    srcs: srcs,
                    dim: dim,
                    lod_mode: lod_mode,
                    is_ms: tex.op == nir_texop_txf_ms,
                    offset: offset_mode == Tld4OffsetMode::AddOffI,
                    mask: mask,
                });
            } else if tex.op == nir_texop_tg4 {
                // Gather4: the only op that supports per-pixel offsets.
                b.push_op(OpTld4 {
                    dsts: dsts,
                    fault,
                    srcs: srcs,
                    dim: dim,
                    comp: tex.component().try_into().unwrap(),
                    offset_mode: offset_mode,
                    z_cmpr: flags.has_z_cmpr(),
                    mask: mask,
                });
            } else {
                // Everything else is a plain sample.
                assert!(offset_mode != Tld4OffsetMode::PerPx);
                b.push_op(OpTex {
                    dsts: dsts,
                    fault,
                    srcs: srcs,
                    dim: dim,
                    lod_mode: lod_mode,
                    z_cmpr: flags.has_z_cmpr(),
                    offset: offset_mode == Tld4OffsetMode::AddOffI,
                    mask: mask,
                });
            }
        }

        // Scatter the packed hardware results back onto the full NIR
        // destination vector: the sparse residency component is synthesized
        // from the fault predicate, unread components are filled with zero,
        // and read components consume hardware results in order.
        let mut di = 0_usize;
        let mut nir_dst = Vec::new();
        for i in 0..tex.def.num_components() {
            if flags.is_sparse() && i == tex.def.num_components - 1 {
                let Dst::SSA(fault) = fault else {
                    panic!("No fault value for sparse op");
                };
                // Convert the fault predicate into a 32-bit value.
                nir_dst.push(b.sel(fault.into(), 0.into(), 1.into())[0]);
            } else if mask & (1 << i) == 0 {
                nir_dst.push(b.copy(0.into())[0]);
            } else {
                nir_dst.push(dst[di]);
                di += 1;
            }
        }
        self.set_ssa(tex.def.as_def(), nir_dst);
    }
1865 
get_atomic_type(&self, intrin: &nir_intrinsic_instr) -> AtomType1866     fn get_atomic_type(&self, intrin: &nir_intrinsic_instr) -> AtomType {
1867         let bit_size = intrin.def.bit_size();
1868         match intrin.atomic_op() {
1869             nir_atomic_op_iadd => AtomType::U(bit_size),
1870             nir_atomic_op_imin => AtomType::I(bit_size),
1871             nir_atomic_op_umin => AtomType::U(bit_size),
1872             nir_atomic_op_imax => AtomType::I(bit_size),
1873             nir_atomic_op_umax => AtomType::U(bit_size),
1874             nir_atomic_op_iand => AtomType::U(bit_size),
1875             nir_atomic_op_ior => AtomType::U(bit_size),
1876             nir_atomic_op_ixor => AtomType::U(bit_size),
1877             nir_atomic_op_xchg => AtomType::U(bit_size),
1878             nir_atomic_op_fadd => AtomType::F(bit_size),
1879             nir_atomic_op_fmin => AtomType::F(bit_size),
1880             nir_atomic_op_fmax => AtomType::F(bit_size),
1881             nir_atomic_op_cmpxchg => AtomType::U(bit_size),
1882             _ => panic!("Unsupported NIR atomic op"),
1883         }
1884     }
1885 
get_atomic_op( &self, intrin: &nir_intrinsic_instr, cmp_src: AtomCmpSrc, ) -> AtomOp1886     fn get_atomic_op(
1887         &self,
1888         intrin: &nir_intrinsic_instr,
1889         cmp_src: AtomCmpSrc,
1890     ) -> AtomOp {
1891         match intrin.atomic_op() {
1892             nir_atomic_op_iadd => AtomOp::Add,
1893             nir_atomic_op_imin => AtomOp::Min,
1894             nir_atomic_op_umin => AtomOp::Min,
1895             nir_atomic_op_imax => AtomOp::Max,
1896             nir_atomic_op_umax => AtomOp::Max,
1897             nir_atomic_op_iand => AtomOp::And,
1898             nir_atomic_op_ior => AtomOp::Or,
1899             nir_atomic_op_ixor => AtomOp::Xor,
1900             nir_atomic_op_xchg => AtomOp::Exch,
1901             nir_atomic_op_fadd => AtomOp::Add,
1902             nir_atomic_op_fmin => AtomOp::Min,
1903             nir_atomic_op_fmax => AtomOp::Max,
1904             nir_atomic_op_cmpxchg => AtomOp::CmpExch(cmp_src),
1905             _ => panic!("Unsupported NIR atomic op"),
1906         }
1907     }
1908 
get_eviction_priority( &mut self, access: gl_access_qualifier, ) -> MemEvictionPriority1909     fn get_eviction_priority(
1910         &mut self,
1911         access: gl_access_qualifier,
1912     ) -> MemEvictionPriority {
1913         if self.sm.sm() >= 70 && access & ACCESS_NON_TEMPORAL != 0 {
1914             MemEvictionPriority::First
1915         } else {
1916             MemEvictionPriority::Normal
1917         }
1918     }
1919 
get_image_dim(&mut self, intrin: &nir_intrinsic_instr) -> ImageDim1920     fn get_image_dim(&mut self, intrin: &nir_intrinsic_instr) -> ImageDim {
1921         let is_array = intrin.image_array();
1922         let image_dim = intrin.image_dim();
1923         match intrin.image_dim() {
1924             GLSL_SAMPLER_DIM_1D => {
1925                 if is_array {
1926                     ImageDim::_1DArray
1927                 } else {
1928                     ImageDim::_1D
1929                 }
1930             }
1931             GLSL_SAMPLER_DIM_2D => {
1932                 if is_array {
1933                     ImageDim::_2DArray
1934                 } else {
1935                     ImageDim::_2D
1936                 }
1937             }
1938             GLSL_SAMPLER_DIM_3D => {
1939                 assert!(!is_array);
1940                 ImageDim::_3D
1941             }
1942             GLSL_SAMPLER_DIM_CUBE => ImageDim::_2DArray,
1943             GLSL_SAMPLER_DIM_BUF => {
1944                 assert!(!is_array);
1945                 ImageDim::_1DBuffer
1946             }
1947             _ => panic!("Unsupported image dimension: {}", image_dim),
1948         }
1949     }
1950 
get_image_coord( &mut self, intrin: &nir_intrinsic_instr, dim: ImageDim, ) -> Src1951     fn get_image_coord(
1952         &mut self,
1953         intrin: &nir_intrinsic_instr,
1954         dim: ImageDim,
1955     ) -> Src {
1956         let vec = self.get_ssa(intrin.get_src(1).as_def());
1957         // let sample = self.get_src(&srcs[2]);
1958         let comps = usize::from(dim.coord_comps());
1959         SSARef::try_from(&vec[0..comps]).unwrap().into()
1960     }
1961 
parse_intrinsic( &mut self, b: &mut impl SSABuilder, intrin: &nir_intrinsic_instr, )1962     fn parse_intrinsic(
1963         &mut self,
1964         b: &mut impl SSABuilder,
1965         intrin: &nir_intrinsic_instr,
1966     ) {
1967         let srcs = intrin.srcs_as_slice();
1968         match intrin.intrinsic {
1969             nir_intrinsic_al2p_nv => {
1970                 let offset = self.get_src(&srcs[0]);
1971                 let addr = u16::try_from(intrin.base()).unwrap();
1972 
1973                 let flags = intrin.flags();
1974                 let flags: nak_nir_attr_io_flags =
1975                     unsafe { std::mem::transmute_copy(&flags) };
1976 
1977                 let access = AttrAccess {
1978                     addr: addr,
1979                     comps: 1,
1980                     patch: flags.patch(),
1981                     output: flags.output(),
1982                     phys: false,
1983                 };
1984 
1985                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1986                 b.push_op(OpAL2P {
1987                     dst: dst.into(),
1988                     offset: offset,
1989                     access: access,
1990                 });
1991                 self.set_dst(&intrin.def, dst);
1992             }
1993             nir_intrinsic_ald_nv | nir_intrinsic_ast_nv => {
1994                 let addr = u16::try_from(intrin.base()).unwrap();
1995                 let base = u16::try_from(intrin.range_base()).unwrap();
1996                 let range = u16::try_from(intrin.range()).unwrap();
1997                 let range = base..(base + range);
1998 
1999                 let flags = intrin.flags();
2000                 let flags: nak_nir_attr_io_flags =
2001                     unsafe { std::mem::transmute_copy(&flags) };
2002                 assert!(!flags.patch() || !flags.phys());
2003 
2004                 if let ShaderIoInfo::Vtg(io) = &mut self.info.io {
2005                     if flags.patch() {
2006                         match &mut self.info.stage {
2007                             ShaderStageInfo::TessellationInit(stage) => {
2008                                 assert!(flags.output());
2009                                 stage.per_patch_attribute_count = max(
2010                                     stage.per_patch_attribute_count,
2011                                     (range.end / 4).try_into().unwrap(),
2012                                 );
2013                             }
2014                             ShaderStageInfo::Tessellation(_) => (),
2015                             _ => panic!("Patch I/O not supported"),
2016                         }
2017                     } else {
2018                         if flags.output() {
2019                             if intrin.intrinsic == nir_intrinsic_ast_nv {
2020                                 io.mark_store_req(range.clone());
2021                             }
2022                             io.mark_attrs_written(range);
2023                         } else {
2024                             io.mark_attrs_read(range);
2025                         }
2026                     }
2027                 } else {
2028                     panic!("Must be a VTG stage");
2029                 }
2030 
2031                 let access = AttrAccess {
2032                     addr: addr,
2033                     comps: intrin.num_components,
2034                     patch: flags.patch(),
2035                     output: flags.output(),
2036                     phys: flags.phys(),
2037                 };
2038 
2039                 if intrin.intrinsic == nir_intrinsic_ald_nv {
2040                     let vtx = self.get_src(&srcs[0]);
2041                     let offset = self.get_src(&srcs[1]);
2042 
2043                     assert!(intrin.def.bit_size() == 32);
2044                     let dst = b.alloc_ssa(RegFile::GPR, access.comps);
2045                     b.push_op(OpALd {
2046                         dst: dst.into(),
2047                         vtx: vtx,
2048                         offset: offset,
2049                         access: access,
2050                     });
2051                     self.set_dst(&intrin.def, dst);
2052                 } else if intrin.intrinsic == nir_intrinsic_ast_nv {
2053                     assert!(srcs[0].bit_size() == 32);
2054                     let data = self.get_src(&srcs[0]);
2055                     let vtx = self.get_src(&srcs[1]);
2056                     let offset = self.get_src(&srcs[2]);
2057 
2058                     b.push_op(OpASt {
2059                         data: data,
2060                         vtx: vtx,
2061                         offset: offset,
2062                         access: access,
2063                     });
2064                 } else {
2065                     panic!("Invalid VTG I/O intrinsic");
2066                 }
2067             }
2068             nir_intrinsic_as_uniform => {
2069                 let src = self.get_ssa(srcs[0].as_def());
2070                 let mut dst = Vec::new();
2071                 for comp in src {
2072                     let u = b.alloc_ssa(RegFile::UGPR, 1);
2073                     b.push_op(OpR2UR {
2074                         src: [*comp].into(),
2075                         dst: u.into(),
2076                     });
2077                     dst.push(u[0]);
2078                 }
2079                 self.set_ssa(&intrin.def, dst);
2080             }
            // Horizontal derivative: exchange the value with the neighboring
            // lane of the pixel quad via a butterfly shuffle, then take the
            // per-lane signed difference with FSWZADD.
            nir_intrinsic_ddx
            | nir_intrinsic_ddx_coarse
            | nir_intrinsic_ddx_fine => {
                // TODO: Real coarse derivatives

                assert!(intrin.def.bit_size() == 32);
                let ftype = FloatType::F32;
                let scratch = b.alloc_ssa(RegFile::GPR, 1);

                // Butterfly shuffle with XOR mask 1: each lane reads the value
                // of its horizontally adjacent quad neighbor.
                // NOTE(review): c is the packed shuffle control word (clamp
                // 0x3, segment mask 0x1c << 8) — confirm against the SHFL
                // encoding if touched.
                b.push_op(OpShfl {
                    dst: scratch[0].into(),
                    in_bounds: Dst::None,
                    src: self.get_src(&srcs[0]),
                    lane: 1_u32.into(),
                    c: (0x3_u32 | 0x1c_u32 << 8).into(),
                    op: ShflOp::Bfly,
                });

                let dst = b.alloc_ssa(RegFile::GPR, 1);

                // Alternating SubLeft/SubRight per lane so every lane computes
                // the difference in the same direction — presumably yielding
                // (right - left) in both lanes of a pair; confirm against
                // FSwzAddOp semantics.
                b.push_op(OpFSwzAdd {
                    dst: dst[0].into(),
                    srcs: [scratch[0].into(), self.get_src(&srcs[0])],
                    ops: [
                        FSwzAddOp::SubLeft,
                        FSwzAddOp::SubRight,
                        FSwzAddOp::SubLeft,
                        FSwzAddOp::SubRight,
                    ],
                    rnd_mode: self.float_ctl[ftype].rnd_mode,
                    ftz: self.float_ctl[ftype].ftz,
                });

                self.set_dst(&intrin.def, dst);
            }
            // Vertical derivative: identical scheme, but XOR mask 2 picks the
            // vertically adjacent quad neighbor and the swizzle-add pattern is
            // grouped in pairs of two.
            nir_intrinsic_ddy
            | nir_intrinsic_ddy_coarse
            | nir_intrinsic_ddy_fine => {
                // TODO: Real coarse derivatives

                assert!(intrin.def.bit_size() == 32);
                let ftype = FloatType::F32;
                let scratch = b.alloc_ssa(RegFile::GPR, 1);

                b.push_op(OpShfl {
                    dst: scratch[0].into(),
                    in_bounds: Dst::None,
                    src: self.get_src(&srcs[0]),
                    lane: 2_u32.into(),
                    c: (0x3_u32 | 0x1c_u32 << 8).into(),
                    op: ShflOp::Bfly,
                });

                let dst = b.alloc_ssa(RegFile::GPR, 1);

                b.push_op(OpFSwzAdd {
                    dst: dst[0].into(),
                    srcs: [scratch[0].into(), self.get_src(&srcs[0])],
                    ops: [
                        FSwzAddOp::SubLeft,
                        FSwzAddOp::SubLeft,
                        FSwzAddOp::SubRight,
                        FSwzAddOp::SubRight,
                    ],
                    rnd_mode: self.float_ctl[ftype].rnd_mode,
                    ftz: self.float_ctl[ftype].ftz,
                });

                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_ballot => {
                // Ballot: collect the 1-bit predicate from every active lane
                // into a 32-bit mask using VOTE with a ballot destination.
                // The any/all vote result itself is discarded (vote: Dst::None).
                assert!(srcs[0].bit_size() == 1);
                let src = self.get_src(&srcs[0]);

                assert!(intrin.def.bit_size() == 32);
                let dst = b.alloc_ssa(RegFile::GPR, 1);

                b.push_op(OpVote {
                    op: VoteOp::Any,
                    ballot: dst.into(),
                    vote: Dst::None,
                    pred: src,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_bar_break_nv => {
                // Break out of a convergence barrier: the barrier value lives
                // in a GPR at the NIR level, so move it into a barrier
                // register, apply BREAK with the given condition, and move the
                // resulting barrier back to a GPR.
                let src = self.get_src(&srcs[0]);
                let bar_in = b.bmov_to_bar(src);
                let cond = self.get_src(&srcs[1]);

                let bar_out = b.alloc_ssa(RegFile::Bar, 1);
                b.push_op(OpBreak {
                    bar_out: bar_out.into(),
                    bar_in: bar_in.into(),
                    cond: cond.into(),
                });

                self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
            }
            nir_intrinsic_bar_set_nv => {
                // Create a convergence barrier: allocate a label for the
                // matching sync point and remember it keyed by this def's
                // index (each bar_set must be recorded exactly once).
                let label = self.label_alloc.alloc();
                let old = self.bar_label.insert(intrin.def.index, label);
                assert!(old.is_none());

                let bar_clear = b.alloc_ssa(RegFile::Bar, 1);
                b.push_op(OpBClear {
                    dst: bar_clear.into(),
                });

                // BSSY on a cleared barrier, targeting the label that the
                // corresponding bar_sync will attach to an OpNop.
                let bar_out = b.alloc_ssa(RegFile::Bar, 1);
                b.push_op(OpBSSy {
                    bar_out: bar_out.into(),
                    bar_in: bar_clear.into(),
                    cond: SrcRef::True.into(),
                    target: label,
                });

                self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
            }
            nir_intrinsic_bar_sync_nv => {
                // Synchronize on a previously-set barrier.
                let src = self.get_src(&srcs[0]);

                let bar = b.bmov_to_bar(src);
                b.push_op(OpBSync {
                    bar: bar.into(),
                    cond: SrcRef::True.into(),
                });

                // If the matching bar_set was seen, emit a labeled NOP as the
                // landing point for its BSSY target.
                let bar_set_idx = &srcs[1].as_def().index;
                if let Some(label) = self.bar_label.get(bar_set_idx) {
                    b.push_op(OpNop {
                        label: Some(*label),
                    });
                }
            }
            nir_intrinsic_bindless_image_atomic
            | nir_intrinsic_bindless_image_atomic_swap => {
                let handle = self.get_src(&srcs[0]);
                let dim = self.get_image_dim(intrin);
                let coord = self.get_image_coord(intrin, dim);
                // let sample = self.get_src(&srcs[2]);
                let atom_type = self.get_atomic_type(intrin);
                let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Packed);

                assert!(
                    intrin.def.bit_size() == 32 || intrin.def.bit_size() == 64
                );
                assert!(intrin.def.num_components() == 1);
                let dst = b.alloc_ssa(RegFile::GPR, intrin.def.bit_size() / 32);

                // For compare-and-swap, SUATOM takes the compare value and
                // the new value packed together in one contiguous vector
                // (AtomCmpSrc::Packed above): cmpr words first, then data.
                let data = if intrin.intrinsic
                    == nir_intrinsic_bindless_image_atomic_swap
                {
                    if intrin.def.bit_size() == 64 {
                        SSARef::from([
                            self.get_ssa(srcs[3].as_def())[0],
                            self.get_ssa(srcs[3].as_def())[1],
                            self.get_ssa(srcs[4].as_def())[0],
                            self.get_ssa(srcs[4].as_def())[1],
                        ])
                        .into()
                    } else {
                        SSARef::from([
                            self.get_ssa(srcs[3].as_def())[0],
                            self.get_ssa(srcs[4].as_def())[0],
                        ])
                        .into()
                    }
                } else {
                    self.get_src(&srcs[3])
                };

                // If the op is a pure reduction and nobody reads the result,
                // we can drop the destination (reduction form) on SM70+.
                let is_reduction =
                    atom_op.is_reduction() && intrin.def.components_read() == 0;

                b.push_op(OpSuAtom {
                    dst: if self.sm.sm() >= 70 && is_reduction {
                        Dst::None
                    } else {
                        dst.into()
                    },
                    fault: Dst::None,
                    handle: handle,
                    coord: coord,
                    data: data,
                    atom_op: atom_op,
                    atom_type: atom_type,
                    image_dim: dim,
                    mem_order: MemOrder::Strong(MemScope::System),
                    mem_eviction_priority: self
                        .get_eviction_priority(intrin.access()),
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_bindless_image_load => {
                let handle = self.get_src(&srcs[0]);
                let dim = self.get_image_dim(intrin);
                let coord = self.get_image_coord(intrin, dim);
                // let sample = self.get_src(&srcs[2]);

                let comps = intrin.num_components;
                assert!(intrin.def.bit_size() == 32);
                assert!(comps == 1 || comps == 2 || comps == 4);

                let dst = b.alloc_ssa(RegFile::GPR, comps);

                b.push_op(OpSuLd {
                    dst: dst.into(),
                    fault: Dst::None,
                    image_dim: dim,
                    mem_order: MemOrder::Strong(MemScope::System),
                    mem_eviction_priority: self
                        .get_eviction_priority(intrin.access()),
                    mask: (1 << comps) - 1,
                    handle: handle,
                    coord: coord,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_bindless_image_sparse_load => {
                let handle = self.get_src(&srcs[0]);
                let dim = self.get_image_dim(intrin);
                let coord = self.get_image_coord(intrin, dim);
                // let sample = self.get_src(&srcs[2]);

                // Sparse loads carry the data components plus one trailing
                // residency component (hence comps == 5 for a vec4 load).
                let comps = intrin.num_components;
                assert!(intrin.def.bit_size() == 32);
                assert!(comps == 5);

                let dst = b.alloc_ssa(RegFile::GPR, comps - 1);
                let fault = b.alloc_ssa(RegFile::Pred, 1);

                b.push_op(OpSuLd {
                    dst: dst.into(),
                    fault: fault.into(),
                    image_dim: dim,
                    mem_order: MemOrder::Strong(MemScope::System),
                    mem_eviction_priority: self
                        .get_eviction_priority(intrin.access()),
                    mask: (1 << (comps - 1)) - 1,
                    handle: handle,
                    coord: coord,
                });

                // Append the residency code: select 0 when the access
                // faulted, 1 when it hit resident memory.
                let mut final_dst = Vec::new();
                for i in 0..usize::from(comps) - 1 {
                    final_dst.push(dst[i]);
                }
                final_dst.push(b.sel(fault.into(), 0.into(), 1.into())[0]);

                self.set_ssa(&intrin.def, final_dst);
            }
            nir_intrinsic_bindless_image_store => {
                let handle = self.get_src(&srcs[0]);
                let dim = self.get_image_dim(intrin);
                let coord = self.get_image_coord(intrin, dim);
                // let sample = self.get_src(&srcs[2]);
                let data = self.get_src(&srcs[3]);

                let comps = intrin.num_components;
                assert!(srcs[3].bit_size() == 32);
                assert!(comps == 1 || comps == 2 || comps == 4);

                b.push_op(OpSuSt {
                    image_dim: dim,
                    mem_order: MemOrder::Strong(MemScope::System),
                    mem_eviction_priority: self
                        .get_eviction_priority(intrin.access()),
                    mask: (1 << comps) - 1,
                    handle: handle,
                    coord: coord,
                    data: data,
                });
            }
            nir_intrinsic_copy_fs_outputs_nv => {
                let ShaderIoInfo::Fragment(info) = &mut self.info.io else {
                    panic!(
                        "copy_fs_outputs_nv is only allowed in fragment shaders"
                    );
                };

                // Scan the per-component output registers accumulated so far
                // and record which color components are actually written.
                for i in 0..32 {
                    if !self.fs_out_regs[i].is_none() {
                        info.writes_color |= 1 << i;
                    }
                }
                let mask_idx = (NAK_FS_OUT_SAMPLE_MASK / 4) as usize;
                info.writes_sample_mask = !self.fs_out_regs[mask_idx].is_none();
                let depth_idx = (NAK_FS_OUT_DEPTH / 4) as usize;
                info.writes_depth = !self.fs_out_regs[depth_idx].is_none();

                let mut srcs = Vec::new();
                for i in 0..8 {
                    // Even though the mask is per-component, the actual output
                    // space is per-output vec4s.
                    if info.writes_color & (0xf << (i * 4)) != 0 {
                        for c in 0..4 {
                            let reg = self.fs_out_regs[i * 4 + c];
                            if reg.is_none() {
                                srcs.push(b.undef().into());
                            } else {
                                srcs.push(reg.into());
                            }
                        }
                    }
                }

                // Sample mask and depth share the tail of the output vector:
                // if either is written we always emit the mask slot (undef
                // when only depth is written), and depth follows when present.
                if info.writes_sample_mask || info.writes_depth {
                    if info.writes_sample_mask {
                        srcs.push(self.fs_out_regs[mask_idx].into());
                    } else {
                        srcs.push(b.undef().into());
                    }
                    if info.writes_depth {
                        srcs.push(self.fs_out_regs[depth_idx].into());
                    }
                }

                b.push_op(OpRegOut { srcs: srcs });
            }
            nir_intrinsic_demote => {
                // Flag kill usage in the stage info (needed for shader header
                // setup), then emit an unconditional KILL.
                if let ShaderStageInfo::Fragment(info) = &mut self.info.stage {
                    info.uses_kill = true;
                } else {
                    panic!("OpKill is only available in fragment shaders");
                }
                b.push_op(OpKill {});
            }
            nir_intrinsic_demote_if => {
                if let ShaderStageInfo::Fragment(info) = &mut self.info.stage {
                    info.uses_kill = true;
                } else {
                    panic!("OpKill is only available in fragment shaders");
                }
                // Same as demote, but predicated on the 1-bit condition.
                let cond = self.get_ssa(srcs[0].as_def())[0];
                b.predicate(cond.into()).push_op(OpKill {});
            }
            nir_intrinsic_global_atomic => {
                let bit_size = intrin.def.bit_size();
                // Split the 64-bit address into a base and a 24-bit signed
                // immediate offset when possible.
                let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
                let data = self.get_src(&srcs[1]);
                let atom_type = self.get_atomic_type(intrin);
                let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);

                assert!(intrin.def.num_components() == 1);
                let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));

                // Use the destination-less reduction form when the op allows
                // it and the result is never read.
                let is_reduction =
                    atom_op.is_reduction() && intrin.def.components_read() == 0;

                b.push_op(OpAtom {
                    dst: if is_reduction { Dst::None } else { dst.into() },
                    addr: addr,
                    cmpr: 0.into(),
                    data: data,
                    atom_op: atom_op,
                    atom_type: atom_type,
                    addr_offset: offset,
                    mem_space: MemSpace::Global(MemAddrType::A64),
                    mem_order: MemOrder::Strong(MemScope::System),
                    mem_eviction_priority: MemEvictionPriority::Normal, // Note: no intrinsic access
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_global_atomic_swap => {
                // Compare-and-swap is the only swap op; cmpr and data are
                // passed as separate sources here (AtomCmpSrc::Separate).
                assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
                let bit_size = intrin.def.bit_size();
                let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
                let cmpr = self.get_src(&srcs[1]);
                let data = self.get_src(&srcs[2]);
                let atom_type = AtomType::U(bit_size);

                assert!(intrin.def.num_components() == 1);
                let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));

                b.push_op(OpAtom {
                    dst: dst.into(),
                    addr: addr,
                    cmpr: cmpr,
                    data: data,
                    atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate),
                    atom_type: atom_type,
                    addr_offset: offset,
                    mem_space: MemSpace::Global(MemAddrType::A64),
                    mem_order: MemOrder::Strong(MemScope::System),
                    mem_eviction_priority: MemEvictionPriority::Normal, // Note: no intrinsic access
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_ipa_nv => {
                // Fragment attribute interpolation.  The attribute address
                // comes from the intrinsic base and the IPA configuration is
                // packed into the flags word by the NIR lowering pass.
                let addr = u16::try_from(intrin.base()).unwrap();

                let flags = intrin.flags();
                let flags: nak_nir_ipa_flags =
                    unsafe { std::mem::transmute_copy(&flags) };

                let mode = match flags.interp_mode() {
                    NAK_INTERP_MODE_PERSPECTIVE => PixelImap::Perspective,
                    NAK_INTERP_MODE_SCREEN_LINEAR => PixelImap::ScreenLinear,
                    NAK_INTERP_MODE_CONSTANT => PixelImap::Constant,
                    _ => panic!("Unsupported interp mode"),
                };

                let freq = match flags.interp_freq() {
                    NAK_INTERP_FREQ_PASS => InterpFreq::Pass,
                    NAK_INTERP_FREQ_PASS_MUL_W => InterpFreq::PassMulW,
                    NAK_INTERP_FREQ_CONSTANT => InterpFreq::Constant,
                    NAK_INTERP_FREQ_STATE => InterpFreq::State,
                    _ => panic!("Invalid interp freq"),
                };

                let loc = match flags.interp_loc() {
                    NAK_INTERP_LOC_DEFAULT => InterpLoc::Default,
                    NAK_INTERP_LOC_CENTROID => InterpLoc::Centroid,
                    NAK_INTERP_LOC_OFFSET => InterpLoc::Offset,
                    _ => panic!("Invalid interp loc"),
                };

                // 1/w is only consumed by the PassMulW form; likewise the
                // sample offset only applies to InterpLoc::Offset.
                let inv_w = if freq == InterpFreq::PassMulW {
                    self.get_src(&srcs[0])
                } else {
                    0.into()
                };

                let offset = if loc == InterpLoc::Offset {
                    self.get_src(&srcs[1])
                } else {
                    0.into()
                };

                let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
                    panic!("OpIpa is only used for fragment shaders");
                };

                // Record the attribute read so the shader header advertises
                // the right input map.
                io.mark_attr_read(addr, mode);

                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpIpa {
                    dst: dst.into(),
                    addr: addr,
                    freq: freq,
                    loc: loc,
                    inv_w: inv_w,
                    offset: offset,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_isberd_nv => {
                // ISBERD: internal stage buffer entry read, indexed by src 0.
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpIsberd {
                    dst: dst.into(),
                    idx: self.get_src(&srcs[0]),
                });
                self.set_dst(&intrin.def, dst);
            }
            // The barycentric intrinsics produce nothing themselves; the
            // consuming ipa_nv carries all of the interpolation information.
            nir_intrinsic_load_barycentric_at_offset_nv => (),
            nir_intrinsic_load_barycentric_centroid => (),
            nir_intrinsic_load_barycentric_pixel => (),
            nir_intrinsic_load_barycentric_sample => (),
            nir_intrinsic_load_global | nir_intrinsic_load_global_constant => {
                // Total load width in bytes; NIR guarantees the access is at
                // least naturally aligned for that width.
                let size_B =
                    (intrin.def.bit_size() / 8) * intrin.def.num_components();
                assert!(u32::from(size_B) <= intrin.align());
                // Constant loads may use the read-only path; regular loads
                // get a strong system-scope ordering.
                let order =
                    if intrin.intrinsic == nir_intrinsic_load_global_constant {
                        MemOrder::Constant
                    } else {
                        MemOrder::Strong(MemScope::System)
                    };
                let access = MemAccess {
                    mem_type: MemType::from_size(size_B, false),
                    space: MemSpace::Global(MemAddrType::A64),
                    order: order,
                    eviction_priority: self
                        .get_eviction_priority(intrin.access()),
                };
                let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
                let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));

                b.push_op(OpLd {
                    dst: dst.into(),
                    addr: addr,
                    offset: offset,
                    access: access,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_ldtram_nv => {
                // Read the raw triangle RAM (plane equation coefficients) for
                // a fragment input attribute.
                let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
                    panic!("ldtram_nv is only used for fragment shaders");
                };

                assert!(
                    intrin.def.bit_size() == 32
                        && intrin.def.num_components == 2
                );

                // A non-zero flags word selects the "c" variant of LDTRAM.
                let flags = intrin.flags();
                let use_c = flags != 0;

                let addr = u16::try_from(intrin.base()).unwrap();

                io.mark_barycentric_attr_in(addr);

                let dst = b.alloc_ssa(RegFile::GPR, 2);
                b.push_op(OpLdTram {
                    dst: dst.into(),
                    addr,
                    use_c,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_sample_id => {
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpPixLd {
                    dst: dst.into(),
                    val: PixVal::MyIndex,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_sample_mask_in => {
                // Record the read so the shader info reflects sample-mask
                // usage, then read the coverage mask with PIXLD.
                if let ShaderIoInfo::Fragment(info) = &mut self.info.io {
                    info.reads_sample_mask = true;
                } else {
                    panic!(
                        "sample_mask_in is only available in fragment shaders"
                    );
                }

                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpPixLd {
                    dst: dst.into(),
                    val: PixVal::CovMask,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_tess_coord_xy => {
                // Loading gl_TessCoord in tessellation evaluation shaders is
                // weird.  It's treated as a per-vertex output which is indexed
                // by LANEID.
                match &self.info.stage {
                    ShaderStageInfo::Tessellation(_) => (),
                    _ => panic!(
                        "load_tess_coord is only available in tessellation \
                         shaders"
                    ),
                };

                assert!(intrin.def.bit_size() == 32);
                assert!(intrin.def.num_components() == 2);

                // S2R idx 0 reads LANEID, which serves as the vertex index
                // for the attribute load below.
                let vtx = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpS2R {
                    dst: vtx.into(),
                    idx: 0,
                });

                let access = AttrAccess {
                    addr: NAK_ATTR_TESS_COORD,
                    comps: 2,
                    patch: false,
                    output: true,
                    phys: false,
                };

                // This is recorded as a patch output in parse_shader() because
                // the hardware requires it be in the SPH, whether we use it or
                // not.

                let dst = b.alloc_ssa(RegFile::GPR, access.comps);
                b.push_op(OpALd {
                    dst: dst.into(),
                    vtx: vtx.into(),
                    offset: 0.into(),
                    access: access,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_scratch => {
                // Thread-local (scratch/SLM) load; CTA scope is sufficient
                // since scratch is private to the thread.
                let size_B =
                    (intrin.def.bit_size() / 8) * intrin.def.num_components();
                assert!(u32::from(size_B) <= intrin.align());
                let access = MemAccess {
                    mem_type: MemType::from_size(size_B, false),
                    space: MemSpace::Local,
                    order: MemOrder::Strong(MemScope::CTA),
                    eviction_priority: MemEvictionPriority::Normal,
                };
                let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
                let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));

                b.push_op(OpLd {
                    dst: dst.into(),
                    addr: addr,
                    offset: offset,
                    access: access,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_shared => {
                let size_B =
                    (intrin.def.bit_size() / 8) * intrin.def.num_components();
                assert!(u32::from(size_B) <= intrin.align());
                let access = MemAccess {
                    mem_type: MemType::from_size(size_B, false),
                    space: MemSpace::Shared,
                    order: MemOrder::Strong(MemScope::CTA),
                    eviction_priority: MemEvictionPriority::Normal,
                };
                let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
                // Shared loads additionally fold the intrinsic base into the
                // immediate offset.
                let offset = offset + intrin.base();
                let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));

                b.push_op(OpLd {
                    dst: dst.into(),
                    addr: addr,
                    offset: offset,
                    access: access,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_sysval_nv => {
                // System-value read.  The base selects the register index;
                // only the clock counter may be read as a 64-bit value.
                let idx = u8::try_from(intrin.base()).unwrap();
                debug_assert!(intrin.def.num_components == 1);
                debug_assert!(
                    intrin.def.bit_size == 32 || intrin.def.bit_size == 64
                );
                let comps = intrin.def.bit_size / 32;
                let dst = b.alloc_ssa(RegFile::GPR, comps);
                if idx == NAK_SV_CLOCK || idx == NAK_SV_CLOCK + 1 {
                    debug_assert!(idx + comps <= NAK_SV_CLOCK + 2);
                    // CS2R can read the (possibly 64-bit) clock atomically.
                    b.push_op(OpCS2R {
                        dst: dst.into(),
                        idx: idx,
                    });
                } else {
                    debug_assert!(intrin.def.bit_size == 32);
                    b.push_op(OpS2R {
                        dst: dst.into(),
                        idx: idx,
                    });
                }
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_ldc_nv => {
                // Constant-buffer load.  srcs[0] is the buffer index,
                // srcs[1] the offset (split into a register part and an
                // immediate part).
                let size_B =
                    (intrin.def.bit_size() / 8) * intrin.def.num_components();
                let idx = &srcs[0];

                let (off, off_imm) = self.get_cbuf_addr_offset(&srcs[1]);

                let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));

                if let Some(idx_imm) = idx.as_uint() {
                    let idx_imm: u8 = idx_imm.try_into().unwrap();
                    let cb = CBufRef {
                        buf: CBuf::Binding(idx_imm),
                        offset: off_imm,
                    };
                    if off.is_zero() {
                        // Fully immediate access: emit plain cbuf copies, one
                        // 32-bit component at a time.
                        for (i, comp) in dst.iter().enumerate() {
                            let i = u16::try_from(i).unwrap();
                            b.copy_to((*comp).into(), cb.offset(i * 4).into());
                        }
                    } else {
                        // Immediate buffer index, dynamic offset.
                        b.push_op(OpLdc {
                            dst: dst.into(),
                            cb: cb.into(),
                            offset: off,
                            mode: LdcMode::Indexed,
                            mem_type: MemType::from_size(size_B, false),
                        });
                    }
                } else {
                    // In the IndexedSegmented mode, the hardware computes the
                    // actual index and offset as follows:
                    //
                    //    idx = imm_idx + reg[31:16]
                    //    offset = imm_offset + reg[15:0]
                    //    ldc c[idx][offset]
                    //
                    // So pack the index and offset accordingly
                    let idx = self.get_src(idx);
                    // PRMT with [0, 1, 4, 5] takes the low two bytes of off
                    // and the low two bytes of idx.
                    let off_idx = b.prmt(off, idx, [0, 1, 4, 5]);
                    let cb = CBufRef {
                        buf: CBuf::Binding(0),
                        offset: off_imm,
                    };
                    b.push_op(OpLdc {
                        dst: dst.into(),
                        cb: cb.into(),
                        offset: off_idx.into(),
                        mode: LdcMode::IndexedSegmented,
                        mem_type: MemType::from_size(size_B, false),
                    });
                }
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_ldcx_nv => {
                // Bindless constant-buffer load: like ldc_nv but the buffer
                // is referenced through an SSA handle instead of a binding.
                let size_B =
                    (intrin.def.bit_size() / 8) * intrin.def.num_components();

                let handle = self.get_ssa_ref(&srcs[0]);
                let (off, off_imm) = self.get_cbuf_addr_offset(&srcs[1]);

                let cb = CBufRef {
                    buf: CBuf::BindlessSSA(handle),
                    offset: off_imm,
                };

                let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
                if off.is_zero() {
                    // Immediate-only offset: plain per-component cbuf copies.
                    for (i, comp) in dst.iter().enumerate() {
                        let i = u16::try_from(i).unwrap();
                        b.copy_to((*comp).into(), cb.offset(i * 4).into());
                    }
                } else {
                    b.push_op(OpLdc {
                        dst: dst.into(),
                        cb: cb.into(),
                        offset: off,
                        mode: LdcMode::Indexed,
                        mem_type: MemType::from_size(size_B, false),
                    });
                }
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_pin_cx_handle_nv => {
                // Pin a context handle; src and dst are the same SSA ref so
                // the pin acts as a pass-through marker for the handle.
                let handle = self.get_ssa_ref(&srcs[0]);
                b.push_op(OpPin {
                    src: handle.into(),
                    dst: handle.into(),
                });
            }
            nir_intrinsic_unpin_cx_handle_nv => {
                // Matching unpin for pin_cx_handle_nv.
                let handle = self.get_ssa_ref(&srcs[0]);
                b.push_op(OpUnpin {
                    src: handle.into(),
                    dst: handle.into(),
                });
            }
2823             nir_intrinsic_barrier => {
2824                 let modes = intrin.memory_modes();
2825                 let semantics = intrin.memory_semantics();
2826                 if (modes & nir_var_mem_global) != 0
2827                     && (semantics & NIR_MEMORY_RELEASE) != 0
2828                 {
2829                     // Pre-Volta doesn't have WBAll but it also seems that we
2830                     // don't need it.
2831                     if self.sm.sm() >= 70 {
2832                         b.push_op(OpCCtl {
2833                             op: CCtlOp::WBAll,
2834                             mem_space: MemSpace::Global(MemAddrType::A64),
2835                             addr: 0.into(),
2836                             addr_offset: 0,
2837                         });
2838                     }
2839                 }
2840                 match intrin.execution_scope() {
2841                     SCOPE_NONE => (),
2842                     SCOPE_WORKGROUP => {
2843                         assert!(
2844                             self.nir.info.stage() == MESA_SHADER_COMPUTE
2845                                 || self.nir.info.stage() == MESA_SHADER_KERNEL
2846                         );
2847                         self.info.num_control_barriers = 1;
2848                         b.push_op(OpBar {});
2849                     }
2850                     _ => panic!("Unhandled execution scope"),
2851                 }
2852                 if intrin.memory_scope() != SCOPE_NONE {
2853                     let mem_scope = match intrin.memory_scope() {
2854                         SCOPE_INVOCATION | SCOPE_SUBGROUP => MemScope::CTA,
2855                         SCOPE_WORKGROUP | SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => {
2856                             MemScope::GPU
2857                         }
2858                         _ => panic!("Unhandled memory scope"),
2859                     };
2860                     b.push_op(OpMemBar { scope: mem_scope });
2861                 }
2862                 if (modes & nir_var_mem_global) != 0
2863                     && (semantics & NIR_MEMORY_ACQUIRE) != 0
2864                 {
2865                     b.push_op(OpCCtl {
2866                         op: CCtlOp::IVAll,
2867                         mem_space: MemSpace::Global(MemAddrType::A64),
2868                         addr: 0.into(),
2869                         addr_offset: 0,
2870                     });
2871                 }
2872             }
2873             nir_intrinsic_quad_broadcast
2874             | nir_intrinsic_read_invocation
2875             | nir_intrinsic_shuffle
2876             | nir_intrinsic_shuffle_down
2877             | nir_intrinsic_shuffle_up
2878             | nir_intrinsic_shuffle_xor => {
2879                 assert!(srcs[0].bit_size() == 32);
2880                 assert!(srcs[0].num_components() == 1);
2881                 let data = self.get_src(&srcs[0]);
2882 
2883                 assert!(srcs[1].bit_size() == 32);
2884                 let idx = self.get_src(&srcs[1]);
2885 
2886                 assert!(intrin.def.bit_size() == 32);
2887                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2888 
2889                 b.push_op(OpShfl {
2890                     dst: dst.into(),
2891                     in_bounds: Dst::None,
2892                     src: data,
2893                     lane: idx,
2894                     c: match intrin.intrinsic {
2895                         nir_intrinsic_quad_broadcast => 0x1c_03.into(),
2896                         nir_intrinsic_shuffle_up => 0.into(),
2897                         _ => 0x1f.into(),
2898                     },
2899                     op: match intrin.intrinsic {
2900                         nir_intrinsic_shuffle_down => ShflOp::Down,
2901                         nir_intrinsic_shuffle_up => ShflOp::Up,
2902                         nir_intrinsic_shuffle_xor => ShflOp::Bfly,
2903                         _ => ShflOp::Idx,
2904                     },
2905                 });
2906                 self.set_dst(&intrin.def, dst);
2907             }
2908             nir_intrinsic_quad_swap_horizontal
2909             | nir_intrinsic_quad_swap_vertical
2910             | nir_intrinsic_quad_swap_diagonal => {
2911                 assert!(srcs[0].bit_size() == 32);
2912                 assert!(srcs[0].num_components() == 1);
2913                 let data = self.get_src(&srcs[0]);
2914 
2915                 assert!(intrin.def.bit_size() == 32);
2916                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2917                 b.push_op(OpShfl {
2918                     dst: dst.into(),
2919                     in_bounds: Dst::None,
2920                     src: data,
2921                     lane: match intrin.intrinsic {
2922                         nir_intrinsic_quad_swap_horizontal => 1_u32.into(),
2923                         nir_intrinsic_quad_swap_vertical => 2_u32.into(),
2924                         nir_intrinsic_quad_swap_diagonal => 3_u32.into(),
2925                         op => panic!("Unknown quad intrinsic {}", op),
2926                     },
2927                     c: 0x1c_03.into(),
2928                     op: ShflOp::Bfly,
2929                 });
2930                 self.set_dst(&intrin.def, dst);
2931             }
2932             nir_intrinsic_shared_atomic => {
2933                 let bit_size = intrin.def.bit_size();
2934                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2935                 let data = self.get_src(&srcs[1]);
2936                 let atom_type = self.get_atomic_type(intrin);
2937                 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);
2938 
2939                 assert!(intrin.def.num_components() == 1);
2940                 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2941 
2942                 b.push_op(OpAtom {
2943                     dst: dst.into(),
2944                     addr: addr,
2945                     cmpr: 0.into(),
2946                     data: data,
2947                     atom_op: atom_op,
2948                     atom_type: atom_type,
2949                     addr_offset: offset,
2950                     mem_space: MemSpace::Shared,
2951                     mem_order: MemOrder::Strong(MemScope::CTA),
2952                     mem_eviction_priority: MemEvictionPriority::Normal,
2953                 });
2954                 self.set_dst(&intrin.def, dst);
2955             }
2956             nir_intrinsic_shared_atomic_swap => {
2957                 assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
2958                 let bit_size = intrin.def.bit_size();
2959                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2960                 let cmpr = self.get_src(&srcs[1]);
2961                 let data = self.get_src(&srcs[2]);
2962                 let atom_type = AtomType::U(bit_size);
2963 
2964                 assert!(intrin.def.num_components() == 1);
2965                 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2966 
2967                 b.push_op(OpAtom {
2968                     dst: dst.into(),
2969                     addr: addr,
2970                     cmpr: cmpr,
2971                     data: data,
2972                     atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate),
2973                     atom_type: atom_type,
2974                     addr_offset: offset,
2975                     mem_space: MemSpace::Shared,
2976                     mem_order: MemOrder::Strong(MemScope::CTA),
2977                     mem_eviction_priority: MemEvictionPriority::Normal,
2978                 });
2979                 self.set_dst(&intrin.def, dst);
2980             }
2981             nir_intrinsic_ssa_bar_nv => {
2982                 let src = self.get_src(&srcs[0]);
2983                 b.push_op(OpSrcBar { src });
2984             }
2985             nir_intrinsic_store_global => {
2986                 let data = self.get_src(&srcs[0]);
2987                 let size_B =
2988                     (srcs[0].bit_size() / 8) * srcs[0].num_components();
2989                 assert!(u32::from(size_B) <= intrin.align());
2990                 let access = MemAccess {
2991                     mem_type: MemType::from_size(size_B, false),
2992                     space: MemSpace::Global(MemAddrType::A64),
2993                     order: MemOrder::Strong(MemScope::System),
2994                     eviction_priority: self
2995                         .get_eviction_priority(intrin.access()),
2996                 };
2997                 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
2998 
2999                 b.push_op(OpSt {
3000                     addr: addr,
3001                     data: data,
3002                     offset: offset,
3003                     access: access,
3004                 });
3005             }
3006             nir_intrinsic_fs_out_nv => {
3007                 let data = self.get_ssa(srcs[0].as_def());
3008                 assert!(data.len() == 1);
3009                 let data = data[0];
3010 
3011                 let addr = u16::try_from(intrin.base()).unwrap();
3012                 assert!(addr % 4 == 0);
3013 
3014                 self.fs_out_regs[usize::from(addr / 4)] = data;
3015             }
3016             nir_intrinsic_store_scratch => {
3017                 let data = self.get_src(&srcs[0]);
3018                 let size_B =
3019                     (srcs[0].bit_size() / 8) * srcs[0].num_components();
3020                 assert!(u32::from(size_B) <= intrin.align());
3021                 let access = MemAccess {
3022                     mem_type: MemType::from_size(size_B, false),
3023                     space: MemSpace::Local,
3024                     order: MemOrder::Strong(MemScope::CTA),
3025                     eviction_priority: MemEvictionPriority::Normal,
3026                 };
3027                 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
3028 
3029                 b.push_op(OpSt {
3030                     addr: addr,
3031                     data: data,
3032                     offset: offset,
3033                     access: access,
3034                 });
3035             }
3036             nir_intrinsic_store_shared => {
3037                 let data = self.get_src(&srcs[0]);
3038                 let size_B =
3039                     (srcs[0].bit_size() / 8) * srcs[0].num_components();
3040                 assert!(u32::from(size_B) <= intrin.align());
3041                 let access = MemAccess {
3042                     mem_type: MemType::from_size(size_B, false),
3043                     space: MemSpace::Shared,
3044                     order: MemOrder::Strong(MemScope::CTA),
3045                     eviction_priority: MemEvictionPriority::Normal,
3046                 };
3047                 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
3048                 let offset = offset + intrin.base();
3049 
3050                 b.push_op(OpSt {
3051                     addr: addr,
3052                     data: data,
3053                     offset: offset,
3054                     access: access,
3055                 });
3056             }
3057             nir_intrinsic_emit_vertex_nv | nir_intrinsic_end_primitive_nv => {
3058                 assert!(intrin.def.bit_size() == 32);
3059                 assert!(intrin.def.num_components() == 1);
3060 
3061                 let dst = b.alloc_ssa(RegFile::GPR, 1);
3062                 let handle = self.get_src(&srcs[0]);
3063                 let stream_id = intrin.stream_id();
3064 
3065                 b.push_op(OpOut {
3066                     dst: dst.into(),
3067                     handle: handle,
3068                     stream: stream_id.into(),
3069                     out_type: if intrin.intrinsic
3070                         == nir_intrinsic_emit_vertex_nv
3071                     {
3072                         OutType::Emit
3073                     } else {
3074                         OutType::Cut
3075                     },
3076                 });
3077                 self.set_dst(&intrin.def, dst);
3078             }
3079 
3080             nir_intrinsic_final_primitive_nv => {
3081                 let handle = self.get_src(&srcs[0]);
3082 
3083                 if self.sm.sm() >= 70 {
3084                     b.push_op(OpOutFinal { handle: handle });
3085                 } else {
3086                     b.push_op(OpRegOut { srcs: vec![handle] });
3087                 }
3088             }
3089             nir_intrinsic_vote_all
3090             | nir_intrinsic_vote_any
3091             | nir_intrinsic_vote_ieq => {
3092                 assert!(srcs[0].bit_size() == 1);
3093                 let src = self.get_src(&srcs[0]);
3094 
3095                 assert!(intrin.def.bit_size() == 1);
3096                 let dst = b.alloc_ssa(RegFile::Pred, 1);
3097 
3098                 b.push_op(OpVote {
3099                     op: match intrin.intrinsic {
3100                         nir_intrinsic_vote_all => VoteOp::All,
3101                         nir_intrinsic_vote_any => VoteOp::Any,
3102                         nir_intrinsic_vote_ieq => VoteOp::Eq,
3103                         _ => panic!("Unknown vote intrinsic"),
3104                     },
3105                     ballot: Dst::None,
3106                     vote: dst.into(),
3107                     pred: src,
3108                 });
3109                 self.set_dst(&intrin.def, dst);
3110             }
3111             nir_intrinsic_is_sparse_texels_resident => {
3112                 let src = self.get_src(&srcs[0]);
3113                 let dst = b.isetp(IntCmpType::I32, IntCmpOp::Ne, src, 0.into());
3114                 self.set_dst(&intrin.def, dst);
3115             }
3116             _ => panic!(
3117                 "Unsupported intrinsic instruction: {}",
3118                 intrin.info().name()
3119             ),
3120         }
3121     }
3122 
parse_load_const( &mut self, b: &mut impl SSABuilder, load_const: &nir_load_const_instr, )3123     fn parse_load_const(
3124         &mut self,
3125         b: &mut impl SSABuilder,
3126         load_const: &nir_load_const_instr,
3127     ) {
3128         let values = &load_const.values();
3129 
3130         let mut dst = Vec::new();
3131         match load_const.def.bit_size {
3132             1 => {
3133                 for c in 0..load_const.def.num_components {
3134                     let imm_b1 = unsafe { values[usize::from(c)].b };
3135                     dst.push(b.copy(imm_b1.into())[0]);
3136                 }
3137             }
3138             8 => {
3139                 for dw in 0..load_const.def.num_components.div_ceil(4) {
3140                     let mut imm_u32 = 0;
3141                     for b in 0..4 {
3142                         let c = dw * 4 + b;
3143                         if c < load_const.def.num_components {
3144                             let imm_u8 = unsafe { values[usize::from(c)].u8_ };
3145                             imm_u32 |= u32::from(imm_u8) << b * 8;
3146                         }
3147                     }
3148                     dst.push(b.copy(imm_u32.into())[0]);
3149                 }
3150             }
3151             16 => {
3152                 for dw in 0..load_const.def.num_components.div_ceil(2) {
3153                     let mut imm_u32 = 0;
3154                     for w in 0..2 {
3155                         let c = dw * 2 + w;
3156                         if c < load_const.def.num_components {
3157                             let imm_u16 =
3158                                 unsafe { values[usize::from(c)].u16_ };
3159                             imm_u32 |= u32::from(imm_u16) << w * 16;
3160                         }
3161                     }
3162                     dst.push(b.copy(imm_u32.into())[0]);
3163                 }
3164             }
3165             32 => {
3166                 for c in 0..load_const.def.num_components {
3167                     let imm_u32 = unsafe { values[usize::from(c)].u32_ };
3168                     dst.push(b.copy(imm_u32.into())[0]);
3169                 }
3170             }
3171             64 => {
3172                 for c in 0..load_const.def.num_components {
3173                     let imm_u64 = unsafe { values[c as usize].u64_ };
3174                     dst.push(b.copy((imm_u64 as u32).into())[0]);
3175                     dst.push(b.copy(((imm_u64 >> 32) as u32).into())[0]);
3176                 }
3177             }
3178             _ => panic!("Unknown bit size: {}", load_const.def.bit_size),
3179         }
3180 
3181         self.set_ssa(&load_const.def, dst);
3182     }
3183 
parse_undef( &mut self, b: &mut impl SSABuilder, undef: &nir_undef_instr, )3184     fn parse_undef(
3185         &mut self,
3186         b: &mut impl SSABuilder,
3187         undef: &nir_undef_instr,
3188     ) {
3189         let dst = alloc_ssa_for_nir(b, &undef.def);
3190         for c in &dst {
3191             b.push_op(OpUndef { dst: (*c).into() });
3192         }
3193         self.set_ssa(&undef.def, dst);
3194     }
3195 
emit_jump( &mut self, b: &mut impl SSABuilder, nb: &nir_block, target: &nir_block, )3196     fn emit_jump(
3197         &mut self,
3198         b: &mut impl SSABuilder,
3199         nb: &nir_block,
3200         target: &nir_block,
3201     ) {
3202         if target.index == self.end_block_id {
3203             b.push_op(OpExit {});
3204         } else {
3205             self.cfg.add_edge(nb.index, target.index);
3206             let target_label = self.get_block_label(target);
3207 
3208             match self.peek_crs(target) {
3209                 Some(SyncType::Sync) => {
3210                     b.push_op(OpSync {
3211                         target: target_label,
3212                     });
3213                 }
3214                 Some(SyncType::Brk) => {
3215                     b.push_op(OpBrk {
3216                         target: target_label,
3217                     });
3218                 }
3219                 Some(SyncType::Cont) => {
3220                     b.push_op(OpCont {
3221                         target: target_label,
3222                     });
3223                 }
3224                 None => {
3225                     b.push_op(OpBra {
3226                         target: target_label,
3227                     });
3228                 }
3229             }
3230         }
3231     }
3232 
emit_pred_jump( &mut self, b: &mut impl SSABuilder, nb: &nir_block, pred: Pred, target: &nir_block, fallthrough: &nir_block, )3233     fn emit_pred_jump(
3234         &mut self,
3235         b: &mut impl SSABuilder,
3236         nb: &nir_block,
3237         pred: Pred,
3238         target: &nir_block,
3239         fallthrough: &nir_block,
3240     ) {
3241         // The fall-through edge has to come first
3242         self.cfg.add_edge(nb.index, fallthrough.index);
3243         let op = if target.index == self.end_block_id {
3244             Op::Exit(OpExit {})
3245         } else {
3246             self.cfg.add_edge(nb.index, target.index);
3247             Op::Bra(OpBra {
3248                 target: self.get_block_label(target),
3249             })
3250         };
3251         b.predicate(pred).push_op(op);
3252     }
3253 
    /// Translates one NIR basic block into a NAK `BasicBlock` and adds it
    /// (plus its outgoing CFG edges) to `self.cfg`.
    ///
    /// The emission order within the block is significant:
    ///   1. optional shared-memory BSYNC prelude (first block only),
    ///   2. one `OpPhiDsts` covering all phi destinations,
    ///   3. pre-Turing loop-header PCNT + CRS push,
    ///   4. the block's non-phi instructions,
    ///   5. pre-Volta SSY/PBK pushes for a following if/loop,
    ///   6. one `OpPhiSrcs` per successor with phis,
    ///   7. the block terminator (jump / predicated jump / exit).
    fn parse_block(
        &mut self,
        ssa_alloc: &mut SSAValueAllocator,
        phi_map: &mut PhiAllocMap,
        nb: &nir_block,
    ) {
        let sm = self.sm;
        let mut b = SSAInstrBuilder::new(sm, ssa_alloc);

        if self.sm.sm() >= 70 && nb.index == 0 && self.nir.info.shared_size > 0
        {
            // The blob seems to always do a BSYNC before accessing shared
            // memory.  Perhaps this is to ensure that our allocation is
            // actually available and not in use by another thread?
            let label = self.label_alloc.alloc();
            let bar_clear = b.alloc_ssa(RegFile::Bar, 1);

            b.push_op(OpBClear {
                dst: bar_clear.into(),
            });

            let bar = b.alloc_ssa(RegFile::Bar, 1);
            b.push_op(OpBSSy {
                bar_out: bar.into(),
                bar_in: bar_clear.into(),
                cond: SrcRef::True.into(),
                target: label,
            });

            b.push_op(OpBSync {
                bar: bar.into(),
                cond: SrcRef::True.into(),
            });

            b.push_op(OpNop { label: Some(label) });
        }

        // Phis are always at the head of a NIR block.  Allocate destination
        // SSA values for each one and gather them into a single OpPhiDsts;
        // the matching sources are pushed from the predecessor blocks below.
        let mut phi = OpPhiDsts::new();
        for ni in nb.iter_instr_list() {
            let Some(np) = ni.as_phi() else {
                break;
            };

            if DEBUG.annotate() {
                let annotation = self
                    .nir_instr_printer
                    .instr_to_string(ni)
                    .split_whitespace()
                    .collect::<Vec<_>>()
                    .join(" ");
                b.push_op(OpAnnotate {
                    annotation: format!("generated by \"{}\"", annotation,),
                });
            }

            // Use uniform registers on Turing+ when neither the block nor
            // the phi def is divergent (and UGPRs aren't disabled for debug).
            let uniform = !nb.divergent
                && self.sm.sm() >= 75
                && !DEBUG.no_ugpr()
                && !np.def.divergent;

            // This should be ensured by nak_nir_lower_cf()
            if uniform {
                for ps in np.iter_srcs() {
                    assert!(!ps.pred().divergent);
                }
            }

            let mut b = UniformBuilder::new(&mut b, uniform);
            let dst = alloc_ssa_for_nir(&mut b, np.def.as_def());
            for i in 0..dst.len() {
                let phi_id = phi_map.get_phi_id(np, i.try_into().unwrap());
                phi.dsts.push(phi_id, dst[i].into());
            }
            self.set_ssa(np.def.as_def(), dst);
        }

        if !phi.dsts.is_empty() {
            b.push_op(phi);
        }

        // Pre-Turing: if this is a loop header (first block inside a loop),
        // emit PCNT and push the continue target onto the CRS stack.
        if self.sm.sm() < 75 && nb.cf_node.prev().is_none() {
            if let Some(_) = nb.parent().as_loop() {
                b.push_op(OpPCnt {
                    target: self.get_block_label(nb),
                });
                self.push_crs(nb, SyncType::Cont);
            }
        }

        // Lower the block's instructions.  NIR gotos are not emitted here;
        // they are remembered and handled by the terminator logic below.
        let mut goto = None;
        for ni in nb.iter_instr_list() {
            if DEBUG.annotate() && ni.type_ != nir_instr_type_phi {
                let annotation = self
                    .nir_instr_printer
                    .instr_to_string(ni)
                    .split_whitespace()
                    .collect::<Vec<_>>()
                    .join(" ");
                b.push_op(OpAnnotate {
                    annotation: format!("generated by \"{}\"", annotation,),
                });
            }

            let uniform = !nb.divergent
                && self.sm.sm() >= 75
                && !DEBUG.no_ugpr()
                && ni.def().is_some_and(|d| !d.divergent);
            let mut b = UniformBuilder::new(&mut b, uniform);

            match ni.type_ {
                nir_instr_type_alu => {
                    self.parse_alu(&mut b, ni.as_alu().unwrap())
                }
                nir_instr_type_jump => {
                    let jump = ni.as_jump().unwrap();
                    if jump.type_ == nir_jump_goto
                        || jump.type_ == nir_jump_goto_if
                    {
                        goto = Some(jump);
                    }
                }
                nir_instr_type_tex => {
                    self.parse_tex(&mut b, ni.as_tex().unwrap())
                }
                nir_instr_type_intrinsic => {
                    self.parse_intrinsic(&mut b, ni.as_intrinsic().unwrap())
                }
                nir_instr_type_load_const => {
                    self.parse_load_const(&mut b, ni.as_load_const().unwrap())
                }
                nir_instr_type_undef => {
                    self.parse_undef(&mut b, ni.as_undef().unwrap())
                }
                // Phis were already handled at the top of this function.
                nir_instr_type_phi => (),
                _ => panic!("Unsupported instruction type"),
            }
        }

        // Pre-Volta: push reconvergence points for a following if (SSY) or
        // loop (PBK); the matching pops happen in parse_if()/parse_loop().
        if self.sm.sm() < 70 {
            if let Some(ni) = nb.following_if() {
                let fb = ni.following_block();
                b.push_op(OpSSy {
                    target: self.get_block_label(fb),
                });
                self.push_crs(fb, SyncType::Sync);
            } else if let Some(nl) = nb.following_loop() {
                let fb = nl.following_block();
                b.push_op(OpPBk {
                    target: self.get_block_label(fb),
                });
                self.push_crs(fb, SyncType::Brk);
            }
        }

        // For each successor with phis, emit an OpPhiSrcs carrying the
        // values this block contributes (matched by predecessor index).
        let succ = nb.successors();
        for sb in succ {
            let sb = match sb {
                Some(b) => b,
                None => continue,
            };

            let mut phi = OpPhiSrcs::new();

            for ni in sb.iter_instr_list() {
                let Some(np) = ni.as_phi() else {
                    break;
                };

                if DEBUG.annotate() {
                    let annotation = self
                        .nir_instr_printer
                        .instr_to_string(ni)
                        .split_whitespace()
                        .collect::<Vec<_>>()
                        .join(" ");
                    b.push_op(OpAnnotate {
                        annotation: format!("generated by \"{}\"", annotation,),
                    });
                }

                for ps in np.iter_srcs() {
                    if ps.pred().index == nb.index {
                        let src = *self.get_src(&ps.src).as_ssa().unwrap();
                        for (i, src) in src.iter().enumerate() {
                            let phi_id =
                                phi_map.get_phi_id(np, i.try_into().unwrap());
                            phi.srcs.push(phi_id, (*src).into());
                        }
                        break;
                    }
                }
            }

            if !phi.srcs.is_empty() {
                b.push_op(phi);
            }
        }

        // Terminator: either a remembered NIR goto/goto_if, or the branch
        // implied by the structured CF (following if / single successor).
        if let Some(goto) = goto {
            let target = goto.target().unwrap();
            if goto.type_ == nir_jump_goto {
                self.emit_jump(&mut b, nb, target);
            } else {
                let cond = self.get_ssa(goto.condition.as_def())[0];
                let else_target = goto.else_target().unwrap();

                /* Next block in the NIR CF list */
                let next_block = nb.cf_node.next().unwrap().as_block().unwrap();

                // One of the two targets must be the fall-through block so
                // we only ever need a single predicated branch.
                if else_target as *const _ == next_block as *const _ {
                    self.emit_pred_jump(
                        &mut b,
                        nb,
                        // This is the branch to jump to the else
                        cond.into(),
                        target,
                        else_target,
                    );
                } else if target as *const _ == next_block as *const _ {
                    self.emit_pred_jump(
                        &mut b,
                        nb,
                        Pred::from(cond).bnot(),
                        else_target,
                        target,
                    );
                } else {
                    panic!(
                        "One of the two goto targets must be the next block in \
                            the NIR CF list"
                    );
                }
            }
        } else {
            if let Some(ni) = nb.following_if() {
                let cond = self.get_ssa(ni.condition.as_def())[0];
                self.emit_pred_jump(
                    &mut b,
                    nb,
                    // This is the branch to jump to the else
                    Pred::from(cond).bnot(),
                    ni.first_else_block(),
                    ni.first_then_block(),
                );
            } else {
                assert!(succ[1].is_none());
                let s0 = succ[0].unwrap();
                self.emit_jump(&mut b, nb, s0);
            }
        }

        let bb = BasicBlock {
            label: self.get_block_label(nb),
            uniform: !nb.divergent,
            instrs: b.as_vec(),
        };
        self.cfg.add_node(nb.index, bb);
    }
3512 
parse_if( &mut self, ssa_alloc: &mut SSAValueAllocator, phi_map: &mut PhiAllocMap, ni: &nir_if, )3513     fn parse_if(
3514         &mut self,
3515         ssa_alloc: &mut SSAValueAllocator,
3516         phi_map: &mut PhiAllocMap,
3517         ni: &nir_if,
3518     ) {
3519         self.parse_cf_list(ssa_alloc, phi_map, ni.iter_then_list());
3520         self.parse_cf_list(ssa_alloc, phi_map, ni.iter_else_list());
3521 
3522         if self.sm.sm() < 70 {
3523             let next_block = ni.cf_node.next().unwrap().as_block().unwrap();
3524             self.pop_crs(next_block, SyncType::Sync);
3525         }
3526     }
3527 
parse_loop( &mut self, ssa_alloc: &mut SSAValueAllocator, phi_map: &mut PhiAllocMap, nl: &nir_loop, )3528     fn parse_loop(
3529         &mut self,
3530         ssa_alloc: &mut SSAValueAllocator,
3531         phi_map: &mut PhiAllocMap,
3532         nl: &nir_loop,
3533     ) {
3534         self.parse_cf_list(ssa_alloc, phi_map, nl.iter_body());
3535 
3536         if self.sm.sm() < 70 {
3537             let header = nl.iter_body().next().unwrap().as_block().unwrap();
3538             self.pop_crs(header, SyncType::Cont);
3539             let next_block = nl.cf_node.next().unwrap().as_block().unwrap();
3540             self.pop_crs(next_block, SyncType::Brk);
3541         }
3542     }
3543 
parse_cf_list( &mut self, ssa_alloc: &mut SSAValueAllocator, phi_map: &mut PhiAllocMap, list: ExecListIter<nir_cf_node>, )3544     fn parse_cf_list(
3545         &mut self,
3546         ssa_alloc: &mut SSAValueAllocator,
3547         phi_map: &mut PhiAllocMap,
3548         list: ExecListIter<nir_cf_node>,
3549     ) {
3550         for node in list {
3551             match node.type_ {
3552                 nir_cf_node_block => {
3553                     let nb = node.as_block().unwrap();
3554                     self.parse_block(ssa_alloc, phi_map, nb);
3555                 }
3556                 nir_cf_node_if => {
3557                     let ni = node.as_if().unwrap();
3558                     self.parse_if(ssa_alloc, phi_map, ni);
3559                 }
3560                 nir_cf_node_loop => {
3561                     let nl = node.as_loop().unwrap();
3562                     self.parse_loop(ssa_alloc, phi_map, nl);
3563                 }
3564                 _ => panic!("Invalid inner CF node type"),
3565             }
3566         }
3567     }
3568 
parse_function_impl(&mut self, nfi: &nir_function_impl) -> Function3569     pub fn parse_function_impl(&mut self, nfi: &nir_function_impl) -> Function {
3570         let mut ssa_alloc = SSAValueAllocator::new();
3571         let end_nb = nfi.end_block();
3572         self.end_block_id = end_nb.index;
3573 
3574         let mut phi_alloc = PhiAllocator::new();
3575         let mut phi_map = PhiAllocMap::new(&mut phi_alloc);
3576 
3577         self.parse_cf_list(&mut ssa_alloc, &mut phi_map, nfi.iter_body());
3578 
3579         let cfg = std::mem::take(&mut self.cfg).as_cfg();
3580         assert!(cfg.len() > 0);
3581         for i in 0..cfg.len() {
3582             if cfg[i].falls_through() {
3583                 assert!(cfg.succ_indices(i)[0] == i + 1);
3584             }
3585         }
3586 
3587         let mut f = Function {
3588             ssa_alloc: ssa_alloc,
3589             phi_alloc: phi_alloc,
3590             blocks: cfg,
3591         };
3592         f.repair_ssa();
3593         f
3594     }
3595 
parse_shader(mut self) -> Shader<'a>3596     pub fn parse_shader(mut self) -> Shader<'a> {
3597         let mut functions = Vec::new();
3598         for nf in self.nir.iter_functions() {
3599             if let Some(nfi) = nf.get_impl() {
3600                 let f = self.parse_function_impl(nfi);
3601                 functions.push(f);
3602             }
3603         }
3604 
3605         // Tessellation evaluation shaders MUST claim to read gl_TessCoord or
3606         // the hardware will throw an SPH error.
3607         if matches!(self.info.stage, ShaderStageInfo::Tessellation(_)) {
3608             match &mut self.info.io {
3609                 ShaderIoInfo::Vtg(io) => {
3610                     let tc = NAK_ATTR_TESS_COORD;
3611                     io.mark_attrs_written(tc..(tc + 8));
3612                 }
3613                 _ => panic!("Tessellation must have ShaderIoInfo::Vtg"),
3614             }
3615         }
3616 
3617         Shader {
3618             sm: self.sm,
3619             info: self.info,
3620             functions: functions,
3621         }
3622     }
3623 }
3624 
nak_shader_from_nir<'a>( ns: &'a nir_shader, sm: &'a dyn ShaderModel, ) -> Shader<'a>3625 pub fn nak_shader_from_nir<'a>(
3626     ns: &'a nir_shader,
3627     sm: &'a dyn ShaderModel,
3628 ) -> Shader<'a> {
3629     ShaderFromNir::new(ns, sm).parse_shader()
3630 }
3631