xref: /aosp_15_r20/external/mesa3d/src/panfrost/compiler/valhall/ISA.xml (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1<!--
2  Copyright (C) 2021 Collabora Ltd.
3
4  Permission is hereby granted, free of charge, to any person obtaining a
5  copy of this software and associated documentation files (the "Software"),
6  to deal in the Software without restriction, including without limitation
7  the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  and/or sell copies of the Software, and to permit persons to whom the
9  Software is furnished to do so, subject to the following conditions:
10
11  The above copyright notice and this permission notice (including the next
12  paragraph) shall be included in all copies or substantial portions of the
13  Software.
14
15  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  SOFTWARE.
22-->
23
24<valhall>
25  <lut name="Immediates"> <!-- 32 constant entries -->
26    <desc>
27      These immediates are accessible in (almost) any instruction, provided the
28      immediate mode is kept to the default. They optimize for the most common
29      immediate values; any immediate listed here may be used without taking up
30      a uniform slot or a register. Most integer instructions can access
31      separate half-words and individual bytes via swizzles on the source.
32    </desc>
33    <constant desc="Zero">0x00000000</constant>
34    <constant desc="All ones; integer $-1$">0xFFFFFFFF</constant>
35    <constant desc="Maximum integer; floating-point NaN">0x7FFFFFFF</constant>
36    <constant desc="Integers $(-2, -3, -4, -6)$">0xFAFCFDFE</constant>
37    <constant desc="16-bit integer $2^8$">0x01000000</constant>
38    <constant desc="Multiples of 16 $(0, 32, 0, 128)$">0x80002000</constant>
39    <constant desc="Multiples of 16 $(48, 80, 96, 112)$">0x70605030</constant>
40    <constant desc="Multiples of 16 $(144, 160, 176, 192)$">0xC0B0A090</constant>
41    <constant desc="Integers $(0, 1, 2, 3)$">0x03020100</constant>
42    <constant desc="Integers $(4, 5, 6, 7)$">0x07060504</constant>
43    <constant desc="Integers $(8, 9, 10, 11)$">0x0B0A0908</constant>
44    <constant desc="Integers $(12, 13, 14, 15)$">0x0F0E0D0C</constant>
45    <constant desc="Integers $(16, 17, 18, 19)$">0x13121110</constant>
46    <constant desc="Integers $(20, 21, 22, 23)$">0x17161514</constant>
47    <constant desc="Integers $(24, 25, 26, 27)$">0x1B1A1918</constant>
48    <constant desc="Integers $(28, 29, 30, 31)$">0x1F1E1D1C</constant>
49    <constant desc="Float $1.0$">0x3F800000</constant>
50    <constant desc="Float $0.1$">0x3DCCCCCD</constant>
51    <constant desc="Float $1 / \pi$">0x3EA2F983</constant>
52    <constant desc="Float $\log(2)$">0x3F317218</constant>
53    <constant desc="Float $\pi$">0x40490FDB</constant>
54    <constant desc="Float $0.0$">0x00000000</constant>
55    <constant desc="Float $65535.0 = 2^{16} - 1$">0x477FFF00</constant>
56    <constant desc="Half-float $(255.0, 256.0) = (2^8 - 1, 2^8)$">0x5C005BF8</constant>
57    <constant desc="Half-float $0.1 = 1 / 10$">0x2E660000</constant>
58    <constant desc="Half-float $0.25 = 2^{-2}$">0x34000000</constant>
59    <constant desc="Half-float $0.5 = 2^{-1}$">0x38000000</constant>
60    <constant desc="Half-float $1.0 = 2^0$">0x3C000000</constant>
61    <constant desc="Half-float $2.0 = 2^1$">0x40000000</constant>
62    <constant desc="Half-float $4.0 = 2^2$">0x44000000</constant>
63    <constant desc="Half-float $8.0 = 2^3$">0x48000000</constant>
64    <constant desc="Half-float $\pi$">0x42480000</constant>
65  </lut>
66
67  <enum name="Flow"> <!-- 16 entries; list positions 11, 12 and 14 (0-based) are reserved -->
68    <desc>
69      Every Valhall instruction can wait on dependency
70      slots. A few special flows are available, specified in the instruction
71      metadata from this enum. The `wait0126` flow is required to wait on
72      dependency slot #6 and should be set on the instruction immediately
73      preceding `ATEST`. The `wait` flow should be set for barriers.
74      The `discard` flow only applies to fragment shaders and is used to
75      terminate helper invocations; it should be set as early as possible after
76      helper invocations are no longer needed as determined by data flow
77      analysis. The `end` flow is used to terminate the shader, although it
78      may be overloaded by the `BLEND` instruction.
79
80      The `reconverge` flow is required on any instruction immediately
81      preceding a possible change to the mask of active threads in a subgroup.
82      This includes all divergent branches, but it also includes the final
83      instruction at the end of any basic block where the immediate successor
84      (fallthrough) is the target of a divergent branch.
85    </desc>
86    <value name="None" default="true">none</value>
87    <value name="Wait on slot 0">wait0</value>
88    <value name="Wait on slot 1">wait1</value>
89    <value name="Wait on slots 0, 1">wait01</value>
90    <value name="Wait on slot 2">wait2</value>
91    <value name="Wait on slots 0, 2">wait02</value>
92    <value name="Wait on slots 1, 2">wait12</value>
93    <value name="Wait on slots 0, 1, 2">wait012</value>
94    <value name="Wait on slots 0, 1, 2, 6">wait0126</value>
95    <value name="Wait on slots 0, 1, 2, 6, 7">wait</value>
96    <value name="Perform branch reconverge">reconverge</value>
97    <reserved/>
98    <reserved/>
99    <value name="Terminate discarded threads">discard</value>
100    <reserved/>
101    <value name="Return from shader">end</value>
102  </enum>
103
104  <enum name="FAU special page 0"> <!-- 16 entries; positions 0, 1, 3 and 7 (0-based) are reserved -->
105    <desc>
106      Situated between the immediates hard-coded in the hardware and the
107      uniforms defined purely in software, Valhall has some special
108      "constants" passing through data structures. These are encoded like the
109      table of immediates, as if special constant $i$ were lookup table entry
110      $32 + i$.
111    </desc>
112    <reserved/>
113    <reserved/>
114    <value desc="Warp ID and warps/core - 1">warp_id</value>
115    <reserved/>
116    <value desc="Bounding box maximum X/Y">framebuffer_size</value>
117    <value desc="ATEST datum">atest_datum</value>
118    <value desc="Sample positions">sample</value>
119    <reserved/>
120    <value desc="Blend descriptor 0">blend_descriptor_0</value>
121    <value desc="Blend descriptor 1">blend_descriptor_1</value>
122    <value desc="Blend descriptor 2">blend_descriptor_2</value>
123    <value desc="Blend descriptor 3">blend_descriptor_3</value>
124    <value desc="Blend descriptor 4">blend_descriptor_4</value>
125    <value desc="Blend descriptor 5">blend_descriptor_5</value>
126    <value desc="Blend descriptor 6">blend_descriptor_6</value>
127    <value desc="Blend descriptor 7">blend_descriptor_7</value>
128  </enum>
129
130  <enum name="FAU special page 1"> <!-- 16 entries; only positions 1, 3 and 7 (0-based) are named -->
131    <desc>
132      Situated between the immediates hard-coded in the hardware and the
133      uniforms defined purely in software, Valhall has some special
134      "constants" passing through data structures. These are encoded like the
135      table of immediates, as if special constant $i$ were lookup table entry
136      $32 + i$.
137    </desc>
138    <reserved/>
139    <value desc="Thread local storage base pointer">thread_local_pointer</value>
140    <reserved/>
141    <value desc="Workgroup local storage base pointer">workgroup_local_pointer</value>
142    <reserved/>
143    <reserved/>
144    <reserved/>
145    <value desc="Shader resource table base pointer">resource_table_pointer</value>
146    <reserved/>
147    <reserved/>
148    <reserved/>
149    <reserved/>
150    <reserved/>
151    <reserved/>
152    <reserved/>
153    <reserved/>
154  </enum>
155
156  <enum name="FAU special page 3"> <!-- 16 entries; only positions 1, 3 and 15 (0-based) are named -->
157    <desc>
158      Situated between the immediates hard-coded in the hardware and the
159      uniforms defined purely in software, Valhall has some special
160      "constants" passing through data structures. These are encoded like the
161      table of immediates, as if special constant $i$ were lookup table entry
162      $32 + i$.
163    </desc>
164    <reserved/>
165    <value desc="Lane ID">lane_id</value>
166    <reserved/>
167    <value desc="Core ID">core_id</value>
168    <reserved/>
169    <reserved/>
170    <reserved/>
171    <reserved/>
172    <reserved/>
173    <reserved/>
174    <reserved/>
175    <reserved/>
176    <reserved/>
177    <reserved/>
178    <reserved/>
179    <value desc="Program counter">program_counter</value>
180  </enum>
181
182  <enum name="Swizzles (8-bit)"> <!-- 12 named swizzles (b0123 is the default) followed by 4 reserved entries -->
183    <value default="true">b0123</value>
184    <value>b3210</value>
185    <value>b0101</value>
186    <value>b2323</value>
187    <value>b0000</value>
188    <value>b1111</value>
189    <value>b2222</value>
190    <value>b3333</value>
191    <value>b2301</value>
192    <value>b1032</value>
193    <value>b0011</value>
194    <value>b2233</value>
195    <reserved/>
196    <reserved/>
197    <reserved/>
198    <reserved/>
199  </enum>
200
201  <enum name="Lanes (8-bit)"> <!-- 16 entries; named at positions 0, 4-7, 10 and 11 (0-based), the rest reserved -->
202    <desc>Used to select the 2 bytes for shifts of 16-bit vectors</desc>
203    <value>b02</value>
204    <reserved/>
205    <reserved/>
206    <reserved/>
207    <value>b00</value>
208    <value>b11</value>
209    <value>b22</value>
210    <value>b33</value>
211    <reserved/>
212    <reserved/>
213    <value>b01</value>
214    <value>b23</value>
215    <reserved/>
216    <reserved/>
217    <reserved/>
218    <reserved/>
219  </enum>
220
221  <enum name="Half-swizzles (8-bit)"> <!-- all 16 entries are named; none reserved -->
222    <desc>
223      Used to select the 2 bytes to convert for conversions from 8-bit vectors
224      to 16-bit vectors
225    </desc>
226    <value>b00</value>
227    <value>b10</value>
228    <value>b20</value>
229    <value>b30</value>
230    <value>b01</value>
231    <value>b11</value>
232    <value>b21</value>
233    <value>b31</value>
234    <value>b02</value>
235    <value>b12</value>
236    <value>b22</value>
237    <value>b32</value>
238    <value>b03</value>
239    <value>b13</value>
240    <value>b23</value>
241    <value>b33</value>
242  </enum>
243
244  <enum name="Swizzles (16-bit)"> <!-- 14 named swizzles (h01 is the default) followed by 2 reserved entries -->
245    <value>h00</value> <!-- 0,2 -->
246    <value>h10</value>
247    <value default="true">h01</value>
248    <value>h11</value>
249    <value>b00</value> <!-- 0,0 -->
250    <value>b20</value> <!-- 1,1 -->
251    <value>b02</value> <!-- 2,2 -->
252    <value>b22</value> <!-- 3,3 -->
253    <value>b11</value>
254    <value>b31</value>
255    <value>b13</value> <!-- 0,1 -->
256    <value>b33</value> <!-- 2,3 -->
257    <value>b01</value>
258    <value>b23</value>
259    <reserved/>
260    <reserved/>
261  </enum>
262
263  <enum name="Swizzles (32-bit)"> <!-- 8 entries; position 1 (0-based) is reserved; none is the default -->
264    <value default="true">none</value>
265    <reserved/>
266    <value>h0</value>
267    <value>h1</value>
268    <value>b0</value>
269    <value>b1</value>
270    <value>b2</value>
271    <value>b3</value>
272  </enum>
273
274  <enum name="Swizzles (64-bit)"> <!-- 16 entries; position 1 and positions 9-15 (0-based) are reserved -->
275    <value default="true">none</value>
276    <reserved/>
277    <value>h0</value>
278    <value>h1</value>
279    <value>b0</value>
280    <value>b1</value>
281    <value>b2</value>
282    <value>b3</value>
283    <value>w0</value>
284    <reserved/>
285    <reserved/>
286    <reserved/>
287    <reserved/>
288    <reserved/>
289    <reserved/>
290    <reserved/>
291  </enum>
292
293  <enum name="Lane (8-bit)" implied="true"> <!-- 4 byte-lane values; no default is marked -->
294    <value>b0</value>
295    <value>b1</value>
296    <value>b2</value>
297    <value>b3</value>
298  </enum>
299
300  <enum name="Combine"> <!-- 5 values; none is the default -->
301    <desc>
302      Used for the lane select of `BRANCHZ`. To use an 8-bit condition, a
303      separate `ICMP` is required to cast to 16-bit.
304    </desc>
305    <value default="true">none</value>
306    <value>h0</value>
307    <value>h1</value>
308    <value>and</value>
309    <value>lowbits</value>
310  </enum>
311
312  <enum name="Lane (16-bit)" implied="true"> <!-- 2 half-word lane values; no default is marked -->
313    <value>h0</value>
314    <value>h1</value>
315  </enum>
316
317  <enum name="Load lane (8-bit)"> <!-- 8 values, none reserved; b0 is the default -->
318    <value default="true">b0</value>
319    <value>b1</value>
320    <value>b2</value>
321    <value>b3</value>
322    <value desc="Zero-extend to 16-bit, low-half">h0</value>
323    <value desc="Zero-extend to 16-bit, high-half">h1</value>
324    <value desc="Zero-extend to 32-bit">w0</value>
325    <value desc="Zero-extend to 64-bit">d0</value>
326  </enum>
327
328  <enum name="Load lane (16-bit)"> <!-- 4 values (h0 is the default) followed by 4 reserved entries -->
329    <value desc="Low half" default="true">h0</value>
330    <value desc="High half">h1</value>
331    <value desc="Zero-extend to 32-bit">w0</value>
332    <value desc="Zero-extend to 64-bit">d0</value>
333    <reserved/>
334    <reserved/>
335    <reserved/>
336    <reserved/>
337  </enum>
338
339  <enum name="Load lane (24-bit)" implied="true"> <!-- NOTE(review): 7 entries here, while the other Load lane enums list 8; confirm this is intentional -->
340    <value default="true">identity</value>
341    <reserved/>
342    <reserved/>
343    <reserved/>
344    <reserved/>
345    <reserved/>
346    <reserved/>
347  </enum>
348
349  <enum name="Load lane (32-bit)"> <!-- 2 values (w0 is the default) followed by 6 reserved entries -->
350    <value default="true">w0</value>
351    <value desc="Zero-extend to 64-bit">d0</value>
352    <reserved/>
353    <reserved/>
354    <reserved/>
355    <reserved/>
356    <reserved/>
357    <reserved/>
358  </enum>
359
360  <enum name="Load lane (48-bit)"> <!-- 8 entries; identity is at position 4 (0-based), the rest reserved -->
361    <reserved/>
362    <reserved/>
363    <reserved/>
364    <reserved/>
365    <value default="true">identity</value>
366    <reserved/>
367    <reserved/>
368    <reserved/>
369  </enum>
370
371  <enum name="Load lane (64-bit)"> <!-- 8 entries; identity is at position 7 (0-based), the rest reserved -->
372    <reserved/>
373    <reserved/>
374    <reserved/>
375    <reserved/>
376    <reserved/>
377    <reserved/>
378    <reserved/>
379    <value default="true">identity</value>
380  </enum>
381
382  <enum name="Load lane (96-bit)"> <!-- 8 entries; identity is at position 6 (0-based), the rest reserved -->
383    <reserved/>
384    <reserved/>
385    <reserved/>
386    <reserved/>
387    <reserved/>
388    <reserved/>
389    <value default="true">identity</value>
390    <reserved/>
391  </enum>
392
393  <enum name="Load lane (128-bit)"> <!-- 8 entries; identity is at position 7 (0-based), the rest reserved -->
394    <reserved/>
395    <reserved/>
396    <reserved/>
397    <reserved/>
398    <reserved/>
399    <reserved/>
400    <reserved/>
401    <value default="true">identity</value>
402  </enum>
403
404  <enum name="Round mode"> <!-- 4 values; rte is the default -->
405    <desc>Corresponds to IEEE 754 rounding modes</desc>
406    <value desc="Round to nearest even" default="true">rte</value>
407    <value desc="Round to positive infinity">rtp</value>
408    <value desc="Round to negative infinity">rtn</value>
409    <value desc="Round to zero">rtz</value>
410  </enum>
411
412  <enum name="Result type"> <!-- 4 values; no default is marked -->
413    <desc>
414      Comparison instructions like `FCMP` return a boolean but may encode this
415      boolean in a variety of ways. `i1` gives an OpenGL style `0/1` boolean.
416      `m1` gives a Direct3D style `0/~0` boolean. `f1` gives a floating-point
417      `0.0f / 1.0f` boolean. Switching between these modes is useful to fold a
418      boolean type convert into a comparison. `u1` is used internally to
419      implement 64-bit comparisons.
420    </desc>
421    <value desc="Integer 1">i1</value>
422    <value desc="Float 1">f1</value>
423    <value desc="Minus 1">m1</value>
424    <value desc="Low half of 64-bit compare">u1</value>
425  </enum>
426
427  <enum name="Widen"> <!-- 3 values (none is the default) followed by 5 reserved entries -->
428    <value default="true">none</value>
429    <value>h0</value>
430    <value>h1</value>
431    <reserved/>
432    <reserved/>
433    <reserved/>
434    <reserved/>
435    <reserved/>
436  </enum>
437
438  <enum name="Clamp"> <!-- 4 values; none is the default -->
439    <desc>
440      Clamp applied to the destination of a floating-point instruction. Note the
441      clamps may be decomposed as two independent bits for `clamp_0_inf` and
442      `clamp_m1_1`, with `clamp_0_1` arising as the composition of `clamp_0_inf`
443      and `clamp_m1_1` in either order.
444
445      Clamps are implemented per the SPIR-V specification:
446
447      $$\text{clamp} \; (x, \ell, h) = \min( \max( x, \ell ), h)$$
448
449      The min/max functions return the other operand if one operand is NaN, and
450      compare $-0 &lt; +0$. That means the following identities hold for Valhall
451      clamps:
452
453      \begin{align*}
454        \text{clamp}(-0.0, 0.0, 1.0) &amp; = +0.0 \\
455        \text{clamp}(-\text{NaN}, 0.0, 1.0) &amp; = +0.0 \\
456        \text{clamp}(\text{NaN}, 0.0, 1.0) &amp; = +0.0 \\
457        &amp; \\
458        \text{clamp}(-0.0, -1.0, 1.0) &amp; = -0.0 \\
459        \text{clamp}(\text{NaN}, -1.0, 1.0) &amp; = -1.0 \\
460        \text{clamp}(-\text{NaN}, -1.0, 1.0) &amp; = -1.0 \\
461        &amp; \\
462        \max(\text{NaN}, 0.0) &amp; = +0.0 \\
463        \max(-\text{NaN}, 0.0) &amp; = +0.0 \\
464        \max(-0.0, 0.0) &amp; = +0.0 \\
465      \end{align*}
466
467      This behaviour is consistent with the FMin/FMax/FClamp and
468      NMin/NMax/NClamp rules prescribed by SPIR-V and governed by IEEE-754. As
469      a consequence, substituting these clamps for equivalent minimum/maximum
470      expressions is legal even with strict floating point rules.
471    </desc>
472    <value default="true" desc="Identity">none</value>
473    <value desc="Clamp positive">clamp_0_inf</value>
474    <value desc="Clamp to $[-1, 1]$">clamp_m1_1</value>
475    <value desc="Clamp to $[0, 1]$">clamp_0_1</value>
476  </enum>
477
478  <enum name="Condition"> <!-- 8 values, none reserved; no default is marked -->
479    <desc>
480      Condition code. Type must be inferred from the instruction. IEEE 754 total
481      ordering only applies to floating point compares. "Not equal" and "greater
482      than or less than" are distinguished by NaN behaviour conforming to
483      the IEEE 754 specification.
484    </desc>
485    <value desc="Equal">eq</value>
486    <value desc="Greater than">gt</value>
487    <value desc="Greater than or equal">ge</value>
488    <value desc="Not equal">ne</value>
489    <value desc="Less than">lt</value>
490    <value desc="Less than or equal">le</value>
491    <value desc="Greater than or less than">gtlt</value>
492    <value desc="Totally ordered">total</value>
493  </enum>
494
495  <enum name="Dimension"> <!-- 4 values; no default is marked -->
496    <desc>Texture dimension.</desc>
497    <value desc="1D or buffer">1d</value>
498    <value desc="2D or 2D array">2d</value>
499    <value desc="3D or 3D array">3d</value>
500    <value desc="Cube map or cube map array">cube</value>
501  </enum>
502
503  <enum name="LOD mode"> <!-- 8 entries; positions 2, 3 and 7 (0-based) are reserved -->
504    <desc>Level-of-detail selection mode in a texture instruction.</desc>
505    <value desc="Set to zero">zero</value>
506    <value desc="Computed based on neighboring fragments">computed</value>
507    <reserved/>
508    <reserved/>
509    <value desc="Explicitly specified in a register">explicit</value>
510    <value desc="Computed based on neighboring fragments added with bias in a register">computed_bias</value>
511    <value desc="Derived from a gradient descriptor in registers">grdesc</value>
512    <reserved/>
513  </enum>
514
515  <enum name="Register format"> <!-- 8 entries; position 1 (0-based) is reserved -->
516    <desc>Format of data loaded to / stored from registers for general memory access.</desc>
517    <value desc="32-bit type based on descriptor format">auto</value>
518    <reserved/>
519    <value desc="32-bit floats">f32</value>
520    <value desc="16-bit floats">f16</value>
521    <value desc="32-bit signed integers">s32</value>
522    <value desc="16-bit signed integers">s16</value>
523    <value desc="32-bit unsigned integers">u32</value>
524    <value desc="16-bit unsigned integers">u16</value>
525  </enum>
526
527  <enum name="Staging register count" implied="true"> <!-- 8 values, sr0 through sr7 -->
528    <value>sr0</value>
529    <value>sr1</value>
530    <value>sr2</value>
531    <value>sr3</value>
532    <value>sr4</value>
533    <value>sr5</value>
534    <value>sr6</value>
535    <value>sr7</value>
536  </enum>
537
538  <enum name="Staging register write count" implied="true"> <!-- 8 values, write1 through write8 (the list starts at 1, not 0) -->
539    <value>write1</value>
540    <value>write2</value>
541    <value>write3</value>
542    <value>write4</value>
543    <value>write5</value>
544    <value>write6</value>
545    <value>write7</value>
546    <value>write8</value>
547  </enum>
548
549  <enum name="Write mask"> <!-- 16 entries; position 0 is reserved, rgba (last) is the default -->
550    <reserved/>
551    <value>r</value>
552    <value>g</value>
553    <value>rg</value>
554    <value>b</value>
555    <value>rb</value>
556    <value>gb</value>
557    <value>rgb</value>
558    <value>a</value>
559    <value>ra</value>
560    <value>ga</value>
561    <value>rga</value>
562    <value>ba</value>
563    <value>rba</value>
564    <value>gba</value>
565    <value default="true">rgba</value>
566  </enum>
567
568  <enum name="Fetch component"> <!-- 4 gather4 values, one each for red, green, blue and alpha -->
569    <value desc="Red">gather4_r</value>
570    <value desc="Green">gather4_g</value>
571    <value desc="Blue">gather4_b</value>
572    <value desc="Alpha">gather4_a</value>
573  </enum>
574
575  <enum name="Register type"> <!-- 4 entries; position 0 is reserved -->
576    <desc>Unsized type, part of a register format.</desc>
577    <reserved/>
578    <value name="Float">f</value>
579    <value name="Unsigned">u</value>
580    <value name="Signed">s</value>
581  </enum>
582
583  <enum name="Register width"> <!-- 2 values: 16 and 32 -->
584    <desc>Untyped size, part of a register format.</desc>
585    <value>16</value>
586    <value>32</value>
587  </enum>
588
589  <enum name="Varying texture register width"> <!-- 4 values; no default is marked -->
590    <desc>
591      Size of results for varying texture instructions. For dual 16-bit results
592      use "16-bit".
593    </desc>
594    <value desc="16-bit">16</value>
595    <value desc="32-bit">32</value>
596    <value desc="16-bit, 32-bit">16.32</value>
597    <value desc="32-bit, 32-bit">32.32</value>
598  </enum>
599
600  <enum name="Vector size"> <!-- 4 values; none (scalar) is the default -->
601    <desc>Number of channels loaded/stored for general memory access.</desc>
602    <value default="true" desc="Scalar">none</value>
603    <value desc="2 channels">v2</value>
604    <value desc="3 channels">v3</value>
605    <value desc="4 channels">v4</value>
606  </enum>
607
608  <enum name="Slot"> <!-- 8 entries; positions 3-6 (0-based) are reserved -->
609    <desc>
610      Dependency slot set on a message-passing instruction that writes to
611      registers. Before reading the destination, a future instruction must wait
612      on the specified slot. Slot #7 is for `BARRIER` instructions only.
613    </desc>
614    <value desc="Slot #0">slot0</value>
615    <value desc="Slot #1">slot1</value>
616    <value desc="Slot #2">slot2</value>
617    <reserved/>
618    <reserved/>
619    <reserved/>
620    <reserved/>
621    <value desc="Slot #7">slot7</value>
622  </enum>
623
624  <enum name="Memory access"> <!-- 4 values; none is the default -->
625    <desc>Memory access hint for a `LOAD` or `STORE` instruction.</desc>
626    <value desc="No hint (global)" default="true">none</value>
627    <value desc="Internally streaming (position output)">istream</value>
628    <value desc="Externally streaming (varying output)">estream</value>
629    <value desc="Force access in discarded threads (thread local storage)">force</value>
630  </enum>
631
632  <enum name="Subgroup size"> <!-- 4 values; subgroup16 (last) is the default -->
633    <desc>
634      Selects the effective subgroup size from subgroup operations. The hardware
635      warps are sixteen threads on Valhall, but subdividing a warp may be useful
636      for API requirements. In particular, derivatives may be calculated with
637      quads (four threads).
638    </desc>
639    <value desc="Two threads">subgroup2</value>
640    <value desc="Four threads">subgroup4</value>
641    <value desc="Eight threads">subgroup8</value>
642    <value desc="Sixteen threads" default="true">subgroup16</value>
643  </enum>
644
645  <enum name="Lane operation"> <!-- 4 values; none is the default -->
646    <desc>
647      Acts as a modifier on the lane specifier for a `CLPER` instruction. The
648      `accumulate` mode is required for efficient subgroup reductions.
649    </desc>
650    <value name="No operation" default="true">none</value>
651    <value name="Exclusive-or">xor</value>
652    <value name="Accumulate">accumulate</value>
653    <value name="Shift">shift</value>
654  </enum>
655
656  <enum name="Inactive result"> <!-- 16 values, none reserved; each name attribute is a 32-bit hex constant; zero is the default -->
657    <desc>
658      Accesses to inactive lanes (due to divergence) in a subgroup are generally
659      undefined in APIs. However, the results of permuting with an inactive lane
660      with `CLPER.i32` are well-defined in Valhall: they return one of the
661      following values, as specified in the `CLPER.i32` instructions. Sometimes
662      certain values enable small optimizations.
663    </desc>
664    <value name="0x00000000" default="true">zero</value>
665    <value name="0xFFFFFFFF">umax</value>
666    <value name="0x00000001">i1</value>
667    <value name="0x00010001">v2i1</value>
668    <value name="0x80000000">smin</value>
669    <value name="0x7FFFFFFF">smax</value>
670    <value name="0x80008000">v2smin</value>
671    <value name="0x7FFF7FFF">v2smax</value>
672    <value name="0x80808080">v4smin</value>
673    <value name="0x7F7F7F7F">v4smax</value>
674    <value name="0x3F800000">f1</value>
675    <value name="0x3C003C00">v2f1</value>
676    <value name="0xFF800000">infn</value>
677    <value name="0x7F800000">inf</value>
678    <value name="0xFC00FC00">v2infn</value>
679    <value name="0x7C007C00">v2inf</value>
680  </enum>
681
682  <enum name="Mux"> <!-- 4 values; int_zero (second entry) is the default -->
683    <desc>
684      Condition to use for a `MUX` instruction. `neg` checks the sign bit,
685      `int_zero` compares to `0x00000000`, `fp_zero` compares to $\pm 0.0$ as
686      an IEEE 754 float, and `bit` checks each bit separately. The `bit` mode
687      acts like an imaginary `CSEL.v32u1` instruction, and implements
688      `bitselect()` in OpenCL.
689    </desc>
690    <value desc="Negative">neg</value>
691    <value desc="Integer zero" default="true">int_zero</value>
692    <value desc="Floating point zero">fp_zero</value>
693    <value desc="Bitwise">bit</value>
694  </enum>
695
696  <enum name="Sample mode"> <!-- 4 values; no default is marked -->
697    <desc>
698      Varying interpolation mode, for choosing the correct sample to
699      interpolate at, allowing the `sample` and `centroid` qualifiers to be
700      implemented, as well as the `interpolateAt*` functions.
701    </desc>
702    <value desc="Center">center</value>
703    <value desc="Centroid">centroid</value>
704    <value desc="Sample">sample</value>
705    <value desc="Explicit">explicit</value>
706  </enum>
707
708  <enum name="Update mode"> <!-- 4 entries; position 2 (0-based) is reserved -->
709    <desc>
710      The Valhall GPU maintains hidden state when interpolating varyings, to
711      allow reusing sample location calculations. The update mode of a varying
712      load controls this hidden state.
713    </desc>
714    <value desc="Store interpolation position">store</value>
715    <value desc="Retrieve interpolation position">retrieve</value>
716    <reserved/>
717    <value desc="Clobber saved position">clobber</value>
718  </enum>
719
720  <enum name="Sample and update mode"> <!-- 8 entries; position 5 (0-based) is reserved -->
721    <desc>
722      For fused varying/texture instructions, only the following specific
723      combinations of sample and update modes are permitted.
724    </desc>
725    <value desc="Center, store">center_store</value>
726    <value desc="Centroid, store">centroid_store</value>
727    <value desc="Sample, store">sample_store</value>
728    <value desc="Explicit, store">explicit_store</value>
729    <value desc="Center, clobber">center_clobber</value>
730    <reserved/>
731    <value desc="Sample, clobber">sample_clobber</value>
732    <value desc="Retrieve previous state">retrieve</value>
733  </enum>
734
735  <enum name="Source format"> <!-- 4 values: two flat (uninterpreted) and two interpolated float formats -->
736    <desc>
737      In-memory format of varyings.
738
739      Note: src_flat32 is only valid with 32-bit varying instructions and
740      src_flat16 is only valid with 16-bit varying instructions.
741    </desc>
742    <value desc="Uninterpreted 32-bit values">src_flat32</value>
743    <value desc="Uninterpreted 16-bit values">src_flat16</value>
744    <value desc="Interpolated 32-bit floats">src_f32</value>
745    <value desc="Interpolated 16-bit floats">src_f16</value>
746  </enum>
747
748  <enum name="Atomic operation"> <!-- 16 entries; named at positions 2 and 8-15 (0-based), the rest reserved -->
749    <desc>
750      Operation performed in a general computational atomic instruction.
751    </desc>
752    <reserved/>
753    <reserved/>
754    <value desc="Add">aadd</value>
755    <reserved/>
756    <reserved/>
757    <reserved/>
758    <reserved/>
759    <reserved/>
760    <value desc="Signed minimum">asmin</value>
761    <value desc="Signed maximum">asmax</value>
762    <value desc="Unsigned minimum">aumin</value>
763    <value desc="Unsigned maximum">aumax</value>
764    <value desc="Bitwise and">aand</value>
765    <value desc="Bitwise or">aor</value>
766    <value desc="Bitwise exclusive-or">axor</value>
767    <value desc="Exchange (must return the value)">axchg</value>
768  </enum>
769
770  <enum name="Atomic operation with 1"> <!-- 5 values, none reserved; no default is marked -->
771    <desc>
772      Operation performed in a computational atomic-with-1 instruction.
773    </desc>
774    <value desc="Increment">ainc</value>
775    <value desc="Decrement">adec</value>
776    <value desc="Unsigned maximum with 1">aumax1</value>
777    <value desc="Signed maximum with 1">asmax1</value>
778    <value desc="Set bottom bit">aor1</value>
779  </enum>
780
781  <!-- note that the `unused="true"` annotation here just means that this
782       particular entry is unused by the compiler. This may be because the
783       instruction isn't generated yet, but it may also be because there
784       is a duplicate instruction in the Bifrost or pseudo XML files
785  -->
786  <ins name="NOP" title="No operation" dests="0" opcode="0x00" unused="true" unit="CVT"> <!-- declares no sources, immediates or modifiers -->
787    <desc>
788      Do nothing. Useful at the start of a block for waiting on slots required
789      by the first actual instruction of the block, to reconcile dependencies
790      after a branch. Also useful as the sole instruction of an empty shader.
791    </desc>
792  </ins>
793
794  <ins name="BRANCHZ" title="Compare to zero and branch" dests="0" opcode="0x1F" unused="true" unit="CVT"> <!-- one source, a signed 27-bit offset immediate, and a 1-bit eq modifier -->
795    <desc>
796      Branches to a specified relative offset if its source is nonzero (default)
797      or if its source is zero (if `.eq` is set). The offset is 27-bits and
798      sign-extended, giving an effective range of ±26-bits. The offset is
799      specified in units of instructions, relative to the *next* instruction.
800      Positive offsets may be interpreted as "number of instructions to skip".
801      Since Valhall instructions are 8 bytes, this operates as:
802
803      $$PC := \begin{cases} PC + 8 \cdot (\text{offset} \; + 1) &amp; \text{if} \;
804      \text{src} \stackrel{?}{=} 0 \\ PC + 8 &amp; \text{otherwise} \end{cases}$$
805
806      Used with comparison instructions to implement control flow. Tie the
807      source to a nonzero constant to implement a jump. May introduce
808      divergence, so generally requires `.reconverge` flow control.
809    </desc>
810    <src combine="true">Value to compare against zero</src>
811    <imm name="offset" start="8" size="27" signed="true"/>
812    <conservative/>
813    <va_mod name="eq" start="36" size="1"/>
814  </ins>
815
816  <ins name="DISCARD.f32" title="Discard fragment" dests="0" opcode="0x20" unused="true" unit="CVT"> <!-- a cmp marker plus two sources, both allowing absneg and swizzle -->
817    <desc>
818      Evaluates the given condition, and if it passes, discards the current
819      fragment and terminates the thread. Only valid in a **fragment** shader.
820    </desc>
821    <cmp/>
822    <src absneg="true" swizzle="true">Left value to compare</src>
823    <src absneg="true" swizzle="true">Right value to compare</src>
824  </ins>
825
826  <ins name="BRANCHZI" title="Compare to zero and branch indirect" opcode="0x2F" dests="0" last="true" unit="CVT"> <!-- two sources (condition, branch target) and 1-bit eq / absolute modifiers -->
827    <desc>
828      Jump to an indirectly specified (absolute or relative) address. Used to
829      jump to blend shaders at the end of a fragment shader.
830    </desc>
831    <src combine="true">Value to compare against zero</src>
832    <src>Branch target</src>
833    <conservative/>
834    <va_mod name="eq" start="36" size="1"/>
835    <va_mod name="absolute" start="40" size="1"/>
836  </ins>
837
838  <ins name="BARRIER" title="Execution and memory barrier" opcode="0x45" unused="true" unit="NONE">
839    <desc>
840      General-purpose barrier. Must use slot #7. Must be paired with a
841      `.wait` flow on the instruction.
842    </desc>
843    <slot/>
844  </ins>
845
846  <group name="CSEL" title="Floating-point conditional select" dests="1" unused="true" unit="CVT">
847    <ins name="CSEL.f32" opcode="0x154"/>
848    <ins name="CSEL.v2f16" opcode="0x155"/>
849    <desc>
850      Evaluates the given condition and outputs either the true source or the
851      false source.
852    </desc>
853    <cmp/>
854    <src float="true">Left value to compare</src>
855    <src float="true">Right value to compare</src>
856    <src float="true">Return value if true</src>
857    <src float="true">Return value if false</src>
858  </group>
859
860  <group name="CSEL" title="Integer conditional select" dests="1" unused="true" unit="CVT">
861    <ins name="CSEL.u32" opcode="0x150"/>
862    <ins name="CSEL.v2u16" opcode="0x151"/>
863    <ins name="CSEL.s32" opcode="0x158"/>
864    <ins name="CSEL.v2s16" opcode="0x159"/>
865    <desc>
866      Evaluates the given condition and outputs either the true source or the
867      false source.
868
869      Valhall lacks integer minimum/maximum instructions. `CSEL` instructions
870      with tied operands form the canonical implementations of these
871      instructions. Similarly, the integer $\text{sign}$ function is canonically
872      implemented with a pair of `CSEL` instructions.
873    </desc>
874    <cmp/>
875    <src>Left value to compare</src>
876    <src>Right value to compare</src>
877    <src>Return value if true</src>
878    <src>Return value if false</src>
879  </group>
880
881  <ins name="LD_VAR_SPECIAL" title="Load special varying" opcode="0x56" unused="true" unit="V">
882    <sr write="true"/>
883    <sr_count/>
884    <vecsize/>
885    <regfmt/>
886    <sample/>
887    <update/>
888    <slot/>
889    <src/>
890    <imm name="index" start="12" size="4"/> <!-- 0 for pointx, 1 for pointy, 2 for fragw, 3 for fragz -->
891  </ins>
892
893  <group name="LD_VAR_BUF_IMM" title="Load immediate varying" message="varying" unit="V">
894    <desc>Interpolates a given varying from hardware buffer</desc>
895    <ins name="LD_VAR_BUF_IMM.f32" opcode="0x5C"/>
896    <ins name="LD_VAR_BUF_IMM.f16" opcode="0x5D"/>
897    <slot/>
898    <vecsize/>
899    <source_format/>
900    <regfmt pseudo="true"/>
901    <sample/>
902    <update/>
903    <sr write="true"/>
904    <sr_count count="format"/>
905    <src/>
906    <imm name="index" start="16" size="8"/>
907  </group>
908
909  <group name="LD_VAR_BUF" title="Load indirect varying" message="varying" unit="V">
910    <desc>Interpolates a given varying from hardware buffer</desc>
911    <ins name="LD_VAR_BUF.f32" opcode="0x6C"/>
912    <ins name="LD_VAR_BUF.f16" opcode="0x6D"/>
913    <slot/>
914    <vecsize/>
915    <source_format/>
916    <regfmt pseudo="true"/>
917    <sample/>
918    <update/>
919    <sr write="true"/>
920    <sr_count count="format"/>
921    <src/>
922    <src/>
923  </group>
924
925  <ins name="LD_VAR" title="Load indirect varying" unused="true" unit="V" opcode="0x64">
926    <desc>Interpolates a given varying from a software buffer</desc>
927    <slot/>
928    <vecsize/>
929    <regfmt/>
930    <sample/>
931    <update/>
932    <sr write="true"/>
933    <sr_count/>
934    <src/>
935    <src>Varying index and table</src>
936  </ins>
937
938  <ins name="LD_VAR_IMM" title="Load immediate varying" unused="true" unit="V" opcode="0x54">
939    <desc>Interpolates a given varying from a software buffer</desc>
940    <slot/>
941    <vecsize/>
942    <regfmt/>
943    <sample/>
944    <update/>
945    <sr write="true"/>
946    <sr_count/>
947    <src/>
948    <imm name="table" start="8" size="4"/>
949    <imm name="index" start="12" size="8"/>
950  </ins>
951
952  <ins name="LD_VAR_FLAT" title="Load indirect varying" unused="true" unit="V" opcode="0x55">
953    <desc>Fetches a given varying from a software buffer</desc>
954    <slot/>
955    <vecsize/>
956    <regfmt/>
957    <sr write="true"/>
958    <sr_count/>
959    <src>Varying index and table</src>
960  </ins>
961
962  <ins name="LD_VAR_FLAT_IMM" title="Load immediate varying" unused="true" unit="V" opcode="0x41">
963    <desc>Fetches a given varying from a software buffer</desc>
964    <slot/>
965    <vecsize/>
966    <regfmt/>
967    <sr write="true"/>
968    <sr_count/>
969    <imm name="table" start="8" size="4"/>
970    <imm name="index" start="12" size="8"/>
971  </ins>
972
973  <ins name="LD_ATTR_IMM" title="Load immediate attribute" opcode="0x66" opcode2="0" unused="true" unit="LS">
974    <desc>
975      Load `vecsize` components from the attribute descriptor at entry `index`
976      of resource table `table` at index (vertex ID, instance ID), converting
977      to the specified register format.
978    </desc>
979    <sr_count/>
980    <vecsize/>
981    <regfmt/>
982    <slot/>
983    <va_mod name="descriptor_type" start="128" size="1" implied="true"/>
984    <sr write="true"/>
985    <src>Vertex ID</src>
986    <src>Instance ID</src>
987    <imm name="index" start="20" size="4"/>
988    <imm name="table" start="16" size="4"/>
989  </ins>
990
991  <ins name="LD_ATTR" title="Load indirect attribute" opcode="0x76" opcode2="0" unused="true" unit="LS">
992    <desc>
993      Load `vecsize` components from the attribute descriptor at the specified
994      location at index (vertex ID, instance ID), converting
995      to the specified register format.
996
997      The index must not diverge within a warp.
998    </desc>
999    <sr_count/>
1000    <vecsize/>
1001    <regfmt/>
1002    <slot/>
1003    <va_mod name="descriptor_type" start="128" size="1" implied="true"/>
1004    <sr write="true"/>
1005    <src>Vertex ID</src>
1006    <src>Instance ID</src>
1007    <src>Index and table</src>
1008  </ins>
1009
1010  <ins name="LD_TEX_IMM" title="Load immediate texture" opcode="0x66" opcode2="1" message="attribute" unit="LS">
1011    <desc>
1012      Load `vecsize` components from the texture descriptor at entry `index`
1013      of resource table `table`, converting
1014      to the specified register format.
1015    </desc>
1016    <sr_count count="format"/>
1017    <vecsize/>
1018    <regfmt/>
1019    <slot/>
1020    <va_mod name="descriptor_type" start="128" size="1" implied="true"/>
1021    <sr write="true"/>
1022    <src>X/Y coordinates (16:16)</src>
1023    <src>Z/W coordinates (16:16)</src>
1024    <imm name="index" ir_name="texture_index" start="20" size="4"/>
1025    <imm name="table" ir_name="" start="16" size="4"/>
1026  </ins>
1027
1028  <ins name="LD_TEX" title="Load indirect texture" message="attribute" opcode="0x76" opcode2="1" unit="LS">
1029    <desc>
1030      Load `vecsize` components from the texture descriptor at the specified
1031      location at the given coordinates, converting
1032      to the specified register format.
1033    </desc>
1034    <sr_count count="format"/>
1035    <vecsize/>
1036    <regfmt/>
1037    <slot/>
1038    <va_mod name="descriptor_type" start="128" size="1" implied="true"/>
1039    <sr write="true"/>
1040    <src>X/Y coordinates (16:16)</src>
1041    <src>Z/W coordinates (16:16)</src>
1042    <src>Index and table</src>
1043  </ins>
1044
1045  <ins name="LEA_ATTR_IMM" title="Load effective address of attribute" opcode="0x67" opcode2="0" unused="true" unit="LS">
1046    <desc>
1047      Load the effective address of an attribute specified with the
1048      given immediate index. Returns three staging registers: the low/high
1049      32-bits of the address and the internal conversion descriptor.
1050    </desc>
1051    <slot/>
1052    <sr_count/>
1053    <va_mod name="descriptor_type" start="128" size="1" implied="true"/>
1054    <sr write="true"/>
1055    <src>Vertex index</src>
1056    <src>Instance index</src>
1057    <imm name="table" start="16" size="4"/>
1058    <imm name="index" start="20" size="4"/>
1059  </ins>
1060
1061  <ins name="LEA_ATTR" title="Load effective address of attribute" opcode="0x77" opcode2="0" unused="true" unit="LS">
1062    <desc>
1063      Load the effective address of an attribute specified with the
1064      given index. Returns three staging registers: the low/high
1065      32-bits of the address and the internal conversion descriptor.
1066    </desc>
1067    <vecsize/>
1068    <slot/>
1069    <sr_count/>
1070    <va_mod name="descriptor_type" start="128" size="1" implied="true"/>
1071    <sr write="true"/>
1072    <src>Vertex index</src>
1073    <src>Instance index</src>
1074    <src>Attribute index and table</src>
1075  </ins>
1076
1077  <ins name="LEA_TEX_IMM" title="Load effective address of image texel" opcode="0x67" opcode2="1" unused="true" unit="LS">
1078    <desc>
1079      Load the effective address of a texel from the image specified with the
1080      given immediate index. Returns three staging registers: the low/high
1081      32-bits of the address and the internal conversion descriptor. The format
1082      of the internal conversion descriptor is compatible with Bifrost but
1083      omits the register format, as this is specified with the ST_CVT
1084      instruction on Valhall.
1085
1086      Coordinates are specified as 16-bit integers, packed into 32-bit sources.
1087    </desc>
1088    <slot/>
1089    <sr_count/>
1090    <va_mod name="descriptor_type" start="128" size="1" implied="true"/>
1091    <sr write="true"/>
1092    <src>X/Y coordinates (16:16)</src>
1093    <src>Z/W coordinates (16:16)</src>
1094    <imm name="table" start="16" size="4"/>
1095    <imm name="index" start="20" size="4"/>
1096  </ins>
1097
1098  <ins name="LEA_TEX" title="Load effective address of image texel" opcode="0x77" opcode2="1" unused="true" unit="LS">
1099    <desc>
1100      Load the effective address of a texel from the image specified with the
1101      given index. Returns three staging registers: the low/high
1102      32-bits of the address and the internal conversion descriptor. The format
1103      of the internal conversion descriptor is compatible with Bifrost but
1104      omits the register format, as this is specified with the ST_CVT
1105      instruction on Valhall.
1106
1107      Coordinates are specified as 16-bit integers, packed into 32-bit sources.
1108    </desc>
1109    <vecsize/>
1110    <slot/>
1111    <sr_count/>
1112    <va_mod name="descriptor_type" start="128" size="1" implied="true"/>
1113    <sr write="true"/>
1114    <src size="16">X/Y coordinates (16:16)</src>
1115    <src>Z/W coordinates (16:16)</src>
1116    <src>Index and table</src>
1117  </ins>
1118
1119  <ins name="LD_BUFFER.i8" title="Global memory load" message="load" opcode="0x6a" opcode2="0" unit="LS">
1120    <desc>
1121      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1122      all-ones, load from the buffer descriptors in the table indexed by the
1123      bottom byte of the mode descriptor. If they are all zeroes, load the
1124      contents of the buffer in the first table indexed by the bottom byte of
1125      the mode descriptor.
1126    </desc>
1127    <sr write="true"/>
1128    <sr_count count="1"/>
1129    <va_mod name="load_lane_8_bit" start="36" size="3"/>
1130    <va_mod name="unsigned" start="39" size="1"/>
1131    <slot/>
1132    <src size="32">Byte offset</src>
1133    <src size="32">Mode descriptor</src>
1134  </ins>
1135
1136  <ins name="LD_BUFFER.i16" title="Global memory load" message="load" opcode="0x6a" opcode2="1" unit="LS">
1137    <desc>
1138      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1139      all-ones, load from the buffer descriptors in the table indexed by the
1140      bottom byte of the mode descriptor. If they are all zeroes, load the
1141      contents of the buffer in the first table indexed by the bottom byte of
1142      the mode descriptor.
1143    </desc>
1144    <sr write="true"/>
1145    <sr_count count="1"/>
1146    <va_mod name="load_lane_16_bit" start="36" size="3"/>
1147    <va_mod name="unsigned" start="39" size="1"/>
1148    <slot/>
1149    <src size="32">Byte offset</src>
1150    <src size="32">Mode descriptor</src>
1151  </ins>
1152
1153  <ins name="LD_BUFFER.i24" title="Global memory load" message="load" opcode="0x6a" opcode2="2" unit="LS">
1154    <desc>
1155      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1156      all-ones, load from the buffer descriptors in the table indexed by the
1157      bottom byte of the mode descriptor. If they are all zeroes, load the
1158      contents of the buffer in the first table indexed by the bottom byte of
1159      the mode descriptor.
1160    </desc>
1161    <sr write="true"/>
1162    <sr_count count="1"/>
1163    <va_mod name="load_lane_24_bit" start="36" size="3"/>
1164    <va_mod name="unsigned" start="39" size="1"/>
1165    <slot/>
1166    <src size="32">Byte offset</src>
1167    <src size="32">Mode descriptor</src>
1168  </ins>
1169
1170  <ins name="LD_BUFFER.i32" title="Global memory load" message="load" opcode="0x6a" opcode2="3" unit="LS">
1171    <desc>
1172      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1173      all-ones, load from the buffer descriptors in the table indexed by the
1174      bottom byte of the mode descriptor. If they are all zeroes, load the
1175      contents of the buffer in the first table indexed by the bottom byte of
1176      the mode descriptor.
1177    </desc>
1178    <sr write="true"/>
1179    <sr_count count="1"/>
1180    <va_mod name="load_lane_32_bit" start="36" size="3"/>
1181    <va_mod name="unsigned" start="39" size="1"/>
1182    <slot/>
1183    <src size="32">Byte offset</src>
1184    <src size="32">Mode descriptor</src>
1185  </ins>
1186
1187  <ins name="LD_BUFFER.i48" title="Global memory load" message="load" opcode="0x6a" opcode2="4" unit="LS">
1188    <desc>
1189      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1190      all-ones, load from the buffer descriptors in the table indexed by the
1191      bottom byte of the mode descriptor. If they are all zeroes, load the
1192      contents of the buffer in the first table indexed by the bottom byte of
1193      the mode descriptor.
1194    </desc>
1195    <sr write="true"/>
1196    <sr_count count="2"/>
1197    <va_mod name="load_lane_48_bit" start="36" size="3"/>
1198    <va_mod name="unsigned" start="39" size="1"/>
1199    <slot/>
1200    <src size="32">Byte offset</src>
1201    <src size="32">Mode descriptor</src>
1202  </ins>
1203
1204  <ins name="LD_BUFFER.i64" title="Global memory load" message="load" opcode="0x6a" opcode2="5" unit="LS">
1205    <desc>
1206      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1207      all-ones, load from the buffer descriptors in the table indexed by the
1208      bottom byte of the mode descriptor. If they are all zeroes, load the
1209      contents of the buffer in the first table indexed by the bottom byte of
1210      the mode descriptor.
1211    </desc>
1212    <sr write="true"/>
1213    <sr_count count="2"/>
1214    <va_mod name="load_lane_64_bit" start="36" size="3"/>
1215    <va_mod name="unsigned" start="39" size="1"/>
1216    <slot/>
1217    <src size="32">Byte offset</src>
1218    <src size="32">Mode descriptor</src>
1219  </ins>
1220
1221  <ins name="LD_BUFFER.i96" title="Global memory load" message="load" opcode="0x6a" opcode2="6" unit="LS">
1222    <desc>
1223      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1224      all-ones, load from the buffer descriptors in the table indexed by the
1225      bottom byte of the mode descriptor. If they are all zeroes, load the
1226      contents of the buffer in the first table indexed by the bottom byte of
1227      the mode descriptor.
1228    </desc>
1229    <sr write="true"/>
1230    <sr_count count="3"/>
1231    <va_mod name="load_lane_96_bit" start="36" size="3"/>
1232    <va_mod name="unsigned" start="39" size="1"/>
1233    <slot/>
1234    <src size="32">Byte offset</src>
1235    <src size="32">Mode descriptor</src>
1236  </ins>
1237
1238  <ins name="LD_BUFFER.i128" title="Global memory load" message="load" opcode="0x6a" opcode2="7" unit="LS">
1239    <desc>
1240      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1241      all-ones, load from the buffer descriptors in the table indexed by the
1242      bottom byte of the mode descriptor. If they are all zeroes, load the
1243      contents of the buffer in the first table indexed by the bottom byte of
1244      the mode descriptor.
1245    </desc>
1246    <sr write="true"/>
1247    <sr_count count="4"/>
1248    <va_mod name="load_lane_128_bit" start="36" size="3"/>
1249    <va_mod name="unsigned" start="39" size="1"/>
1250    <slot/>
1251    <src size="32">Byte offset</src>
1252    <src size="32">Mode descriptor</src>
1253  </ins>
1254
1255  <ins name="LEA_BUF_IMM" title="Load buffer effective address" message="attribute" opcode="0x5E" unit="LS">
1256    <desc>
1257      Load effective address of a buffer with an immediate offset added.
1258    </desc>
1259    <sr write="true"/>
1260    <sr_count count="2"/>
1261    <slot/>
1262    <imm name="table" ir_name="" start="8" size="4"/>
1263    <imm name="index" ir_name="" start="12" size="8"/>
1264    <src>Linear ID</src>
1265  </ins>
1266
1267  <ins name="LOAD.i8" title="Global memory load" opcode="0x60" opcode2="0" unused="true" unit="LS">
1268    <desc>Loads from main memory</desc>
1269    <sr write="true"/>
1270    <memory_access/>
1271    <sr_count/>
1272    <va_mod name="load_lane_8_bit" start="36" size="3"/>
1273    <va_mod name="unsigned" start="39" size="1"/>
1274    <slot/>
1275    <src size="64">Address to load from after adding offset</src>
1276    <imm name="offset" start="8" size="16" signed="true"/>
1277  </ins>
1278
1279  <ins name="LOAD.i16" title="Global memory load" opcode="0x60" opcode2="1" unused="true" unit="LS">
1280    <desc>Loads from main memory</desc>
1281    <sr write="true"/>
1282    <memory_access/>
1283    <sr_count/>
1284    <va_mod name="load_lane_16_bit" start="36" size="3"/>
1285    <va_mod name="unsigned" start="39" size="1"/>
1286    <slot/>
1287    <src size="64">Address to load from after adding offset</src>
1288    <imm name="offset" start="8" size="16" signed="true"/>
1289  </ins>
1290
1291  <ins name="LOAD.i24" title="Global memory load" opcode="0x60" opcode2="2" unused="true" unit="LS">
1292    <desc>Loads from main memory</desc>
1293    <sr write="true"/>
1294    <memory_access/>
1295    <sr_count/>
1296    <va_mod name="load_lane_24_bit" start="36" size="3"/>
1297    <va_mod name="unsigned" start="39" size="1"/>
1298    <slot/>
1299    <src size="64">Address to load from after adding offset</src>
1300    <imm name="offset" start="8" size="16" signed="true"/>
1301  </ins>
1302
1303  <ins name="LOAD.i32" title="Global memory load" opcode="0x60" opcode2="3" unused="true" unit="LS">
1304    <desc>Loads from main memory</desc>
1305    <sr write="true"/>
1306    <memory_access/>
1307    <sr_count/>
1308    <va_mod name="load_lane_32_bit" start="36" size="3"/>
1309    <va_mod name="unsigned" start="39" size="1"/>
1310    <slot/>
1311    <src size="64">Address to load from after adding offset</src>
1312    <imm name="offset" start="8" size="16" signed="true"/>
1313  </ins>
1314
1315  <ins name="LOAD.i48" title="Global memory load" opcode="0x60" opcode2="4" unused="true" unit="LS">
1316    <desc>Loads from main memory</desc>
1317    <sr write="true"/>
1318    <memory_access/>
1319    <sr_count/>
1320    <va_mod name="load_lane_48_bit" start="36" size="3"/>
1321    <va_mod name="unsigned" start="39" size="1"/>
1322    <slot/>
1323    <src size="64">Address to load from after adding offset</src>
1324    <imm name="offset" start="8" size="16" signed="true"/>
1325  </ins>
1326
1327  <ins name="LOAD.i64" title="Global memory load" opcode="0x60" opcode2="5" unused="true" unit="LS">
1328    <desc>Loads from main memory</desc>
1329    <sr write="true"/>
1330    <memory_access/>
1331    <sr_count/>
1332    <va_mod name="load_lane_64_bit" start="36" size="3"/>
1333    <va_mod name="unsigned" start="39" size="1"/>
1334    <slot/>
1335    <src size="64">Address to load from after adding offset</src>
1336    <imm name="offset" start="8" size="16" signed="true"/>
1337  </ins>
1338
1339  <ins name="LOAD.i96" title="Global memory load" opcode="0x60" opcode2="6" unused="true" unit="LS">
1340    <desc>Loads from main memory</desc>
1341    <sr write="true"/>
1342    <memory_access/>
1343    <sr_count/>
1344    <va_mod name="load_lane_96_bit" start="36" size="3"/>
1345    <va_mod name="unsigned" start="39" size="1"/>
1346    <slot/>
1347    <src size="64">Address to load from after adding offset</src>
1348    <imm name="offset" start="8" size="16" signed="true"/>
1349  </ins>
1350
1351  <ins name="LOAD.i128" title="Global memory load" opcode="0x60" opcode2="7" unused="true" unit="LS">
1352    <desc>Loads from main memory</desc>
1353    <sr write="true"/>
1354    <memory_access/>
1355    <sr_count/>
1356    <va_mod name="load_lane_128_bit" start="36" size="3"/>
1357    <va_mod name="unsigned" start="39" size="1"/>
1358    <slot/>
1359    <src size="64">Address to load from after adding offset</src>
1360    <imm name="offset" start="8" size="16" signed="true"/>
1361  </ins>
1362
1363  <group name="STORE" title="Global memory store" opcode="0x61" unused="true" unit="LS">
1364    <desc>Stores to main memory</desc>
1365    <sr read="true"/>
1366    <ins name="STORE.i8" opcode2="0x0"/>
1367    <ins name="STORE.i16" opcode2="0x1"/>
1368    <ins name="STORE.i24" opcode2="0x2"/>
1369    <ins name="STORE.i32" opcode2="0x3"/>
1370    <ins name="STORE.i48" opcode2="0x4"/>
1371    <ins name="STORE.i64" opcode2="0x5"/>
1372    <ins name="STORE.i96" opcode2="0x6"/>
1373    <ins name="STORE.i128" opcode2="0x7"/>
1374    <sr_count/>
1375    <memory_access/>
1376    <slot/>
1377    <src size="64">Address to store to after adding offset</src>
1378    <imm name="offset" start="8" size="16" signed="true"/>
1379  </group>
1380
1381  <ins name="LEA_BUFFER" title="Load buffer effective address" message="attribute" opcode="0x6B" unit="LS">
1382    <desc>
1383      Load effective address of a simple buffer with an offset added.
1384    </desc>
1385    <sr write="true"/>
1386    <sr_count count="2"/>
1387    <slot/>
1388    <src>Offset</src>
1389    <src>Index</src>
1390  </ins>
1391
1392  <ins name="ST_CVT" title="Store with conversion" opcode="0x71" unused="true" unit="LS">
1393    <desc>
1394      Store to memory with data conversion. The address to store to is given in
1395      the first source, which must be a 64-bit register (a pair of 32-bit
1396      registers). The other source is the conversion descriptor used for the store.
1397
1398      Used with LEA_TEX_IMM to implement image stores.
1399    </desc>
1400    <slot/>
1401    <va_mod name="memory_access" start="37" size="3"/>
1402    <vecsize/>
1403    <regfmt/>
1404    <sr read="true"/>
1405    <sr_count/>
1406    <src size="64">64-bit address to store to</src>
1407    <imm name="offset" start="8" size="8"/>
1408    <src>Internal conversion descriptor</src>
1409  </ins>
1410
1411  <ins name="LD_TILE" title="Load from tilebuffer" opcode="0x78" unused="true" unit="NONE">
1412    <desc>
1413      Loads a given render target, specified in the pixel indices descriptor, at
1414      a given location and sample, and converts to the format specified in the
1415      internal conversion descriptor. Used to implement EXT_framebuffer_fetch
1416      and internally in blend shaders.
1417    </desc>
1418    <sr write="true"/>
1419    <sr_count/>
1420    <vecsize/>
1421    <regfmt/>
1422    <slot/>
1423    <src>Pixel indices descriptor</src>
1424    <src>Coverage mask</src>
1425    <src>Conversion descriptor</src>
1426  </ins>
1427
1428  <ins name="ST_TILE" title="Store to tilebuffer" opcode="0x79" unused="true" unit="NONE">
1429    <desc>
1430      Store to given render target, specified in the pixel indices descriptor, at
1431      a given location and sample, and convert to the format specified in the
1432      internal conversion descriptor. Used internally in blend shaders.
1433    </desc>
1434    <sr read="true"/>
1435    <sr_count/>
1436    <vecsize/>
1437    <regfmt/>
1438    <slot/>
1439    <src>Pixel indices descriptor</src>
1440    <src>Coverage mask</src>
1441    <src>Conversion descriptor</src>
1442  </ins>
1443
1444  <ins name="BLEND" title="Blend render target" opcode="0x7F" unused="true" unit="NONE">
1445    <desc>
1446      Blends a given render target. This loads the API-specified blend state for
1447      the render target from the first source. Blend descriptors are available
1448      as special immediates. It then reads the colour to be blended from the
1449      first staging register, with the specified vector size and register format
1450      as desired. The resulting coverage mask is stored to the second set of
1451      staging registers.
1452
1453      In the fixed-function path, `BLEND` sends the colour to the blender to be
1454      written to the tilebuffer. Then, if the instruction's flow control
1455      specifies termination, the fragment program is ended. If it does not
1456      specify termination, `BLEND` acts as a relative branch, branching with the
1457      offset specified as `target`. This allows the subsequent instructions to
1458      be skipped when fixed-function blending is used. Note this implicit branch
1459      can never introduce divergence, so `.reconverge` is not required.
1460
1461      In the blend shader path, `BLEND` ignores the specified flow control and
1462      does not branch to the specified offset. Instead, execution continues
1463      normally with the next instruction. The compiler should insert code for
1464      calling a blend shader after the `BLEND` instruction unless it is known
1465      that a blend shader will never be required.
1466
1467      The indirection is required to support both fixed-function and blend
1468      shaders efficiently and without shader variants.
1469    </desc>
1470    <sr read="true"/>
1471    <src size="64">Blend descriptor</src>
1472    <src>Sample coverage</src>
1473    <imm name="target" start="8" size="8"/>
1474    <slot/>
1475    <sr_count/>
1476    <vecsize/>
1477    <regfmt/>
1478  </ins>
1479
1480  <ins name="ATEST" title="Alpha test" opcode="0x7D" unused="true" unit="NONE">
1481    <desc>
1482      Does alpha-to-coverage testing, updating the sample coverage mask. ATEST
1483      does not do an implicit discard. It should be executed before the first
1484      ZS_EMIT or BLEND instruction.
1485    </desc>
1486    <sr write="true">Updated coverage mask</sr>
1487    <src>Input coverage mask</src>
1488    <src swizzle="true">Alpha value (render target 0)</src>
1489    <src/>
1490    <sr_count/>
1491  </ins>
1492
1493  <ins name="ZS_EMIT" title="Depth/stencil write" opcode="0x7E" unused="true" unit="NONE">
1494    <desc>
1495      Programmatically writes out depth, stencil, or both, depending on which
1496      modifiers are set. Used to implement gl_FragDepth and gl_FragStencil.
1497    </desc>
1498    <va_mod name="z" start="25" size="1"/>
1499    <va_mod name="stencil" start="24" size="1"/>
1500    <sr write="true">Updated coverage mask</sr>
1501    <src>Depth value</src>
1502    <src>Stencil value</src>
1503    <src>Input coverage mask</src>
1504    <sr_count/>
1505    <slot/>
1506  </ins>
1507
1508  <group name="CONVERT" title="Data conversions" dests="1" opcode="0x90" unused="true" unit="CVT">
1509    <desc>
1510      Performs the given data conversion. Note that floating-point rounding is
1511      handled via the same hardware and therefore shares an encoding. Round mode
1512      is specified where it makes sense.
1513    </desc>
1514
1515    <ins name="V2S16_TO_V2F16" opcode2="0x7"/>
1516
1517    <ins name="S32_TO_F32" opcode2="0x9"/>
1518
1519    <ins name="V2U16_TO_V2F16" opcode2="0x17"/>
1520
1521    <ins name="U32_TO_F32" opcode2="0x19"/>
1522
1523    <roundmode/>
1524    <src widen="true">Value to convert</src>
1525  </group>
1526
1527  <group name="CONVERT" title="16->32 integer data conversions" dests="1" opcode="0x90" unused="true" unit="CVT">
1528    <desc>
1529      Performs the given data conversion.
1530    </desc>
1531
1532    <ins name="S16_TO_S32" opcode2="0x4"/>
1533    <ins name="S16_TO_F32" opcode2="0x5"/>
1534    <ins name="U16_TO_U32" opcode2="0x14"/>
1535    <ins name="U16_TO_F32" opcode2="0x15"/>
1536
1537    <src swizzle="true" size="16">Value to convert</src>
1538  </group>
1539
1540  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90" unused="true" unit="CVT">
1541    <desc>Performs the given data conversion.</desc>
1542    <ins name="F32_TO_S32" opcode2="0xC"/>
1543    <ins name="F32_TO_U32" opcode2="0x1C"/>
1544    <roundmode/>
1545    <src absneg="true">Value to convert</src>
1546  </group>
1547
1548  <group name="CONVERT" title="16-bit float-to-int data conversions" dests="1" opcode="0x90" unused="true" unit="CVT">
1549    <desc>Performs the given data conversion.</desc>
1550    <ins name="V2F16_TO_V2S16" opcode2="0xE"/>
1551    <ins name="V2F16_TO_V2U16" opcode2="0x1E"/>
1552    <ins name="F16_TO_S32" opcode2="0xA"/>
1553    <ins name="F16_TO_U32" opcode2="0x1A"/>
1554    <roundmode/>
1555    <src swizzle="true" absneg="true" size="16">Value to convert</src>
1556  </group>
1557
1558  <ins name="F16_TO_F32" title="16-bit float to 32-bit float conversion" dests="1" opcode="0x90" opcode2="0xB" unused="true" unit="CVT">
1559    <desc>Converts up with the specified round mode.</desc>
1560    <roundmode/>
1561    <src lane="28" size="16" absneg="true">Value to convert</src>
1562  </ins>
1563
1564  <group name="CONVERT" title="8-bit to 32-bit data conversions" dests="1" opcode="0x90" unused="true" unit="CVT">
1565    <desc>
1566      Performs the given data conversion.
1567    </desc>
1568
1569    <ins name="S8_TO_S32" opcode2="0x0"/>
1570    <ins name="S8_TO_F32" opcode2="0x1"/>
1571
1572    <ins name="U8_TO_U32" opcode2="0x10"/>
1573    <ins name="U8_TO_F32" opcode2="0x11"/>
1574
1575    <src lane="28" size="8">Value to convert</src>
1576  </group>
1577
1578  <group name="CONVERT" title="8-bit to 16-bit data conversions" dests="1" opcode="0x90" unused="true" unit="CVT">
1579    <desc>
1580      Performs the given data conversion.
1581    </desc>
1582
1583    <ins name="V2S8_TO_V2S16" opcode2="0x2"/>
1584    <ins name="V2S8_TO_V2F16" opcode2="0x3"/>
1585
1586    <ins name="V2U8_TO_V2U16" opcode2="0x12"/>
1587    <ins name="V2U8_TO_V2F16" opcode2="0x13"/>
1588
1589    <src halfswizzle="true" size="8">Value to convert</src>
1590  </group>
1591
1592  <group name="FROUND" title="Floating-point rounding" dests="1" opcode="0x90" unused="true" unit="CVT">
1593    <desc>
1594      Performs the given rounding, using the convert unit.
1595    </desc>
1596
1597    <ins name="FROUND.f32" opcode2="0xD"/>
1598    <ins name="FROUND.v2f16" opcode2="0xF"/>
1599
1600    <roundmode/>
1601    <src swizzle="true" absneg="true">Value to convert</src>
1602  </group>
1603
1604  <ins name="MOV.i32" title="Register move" dests="1" opcode="0x91" opcode2="0x0" unused="true" unit="CVT">
1605    <desc>Canonical register-to-register move.</desc>
1606    <src/>
1607  </ins>
1608
1609  <ins name="CLZ.u32" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x4" unused="true" unit="CVT">
1610    <desc>
1611      Used as a primitive for various bitwise operations.
1612    </desc>
1613    <src/>
1614  </ins>
1615
1616  <ins name="CLZ.v2u16" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x5" unused="true" unit="CVT">
1617    <desc>
1618      Used as a primitive for various bitwise operations.
1619    </desc>
1620    <src swizzle="true"/>
1621  </ins>
1622
1623  <ins name="CLZ.v4u8" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x6" unused="true" unit="CVT">
1624    <desc>
1625      Used as a primitive for various bitwise operations.
1626    </desc>
1627    <src/>
1628  </ins>
1629
1630  <ins name="IABS.s32" title="Absolute value" dests="1" opcode="0x91" opcode2="0x8" unused="true" unit="CVT">
1631    <desc>
1632      64-bit abs may be constructed in 4 instructions (5 clocks) by checking the
1633      sign with `ICMP.s32.lt.m1 hi, 0` and negating based on the result with
1634      `IADD.s64` and `LSHIFT_XOR.i32` on each half.
1635    </desc>
1636    <src widen="true"/>
1637  </ins>
1638
1639  <ins name="IABS.v2s16" title="Absolute value" dests="1" opcode="0x91" opcode2="0x9" unused="true" unit="CVT">
1640    <src widen="true"/> <!-- No desc element on this entry. Shares opcode 0x91 with IABS.s32 (opcode2 0x8) and IABS.v4s8 (opcode2 0xa); the source sets widen, as IABS.s32 does. -->
1641  </ins>
1642
1643  <ins name="IABS.v4s8" title="Absolute value" dests="1" opcode="0x91" opcode2="0xa" unused="true" unit="CVT">
1644    <src/> <!-- No desc element on this entry. Shares opcode 0x91 with IABS.s32 (opcode2 0x8) and IABS.v2s16 (opcode2 0x9); unlike those two, the source does not set widen. -->
1645  </ins>
1646
1647  <ins name="POPCOUNT.i32" title="Population count" dests="1" opcode="0x91" opcode2="0xC" unused="true" unit="SFU">
1648    <desc>
1649      Only available as 32-bit. Smaller bitsizes require explicit conversions.
1650      64-bit popcount may be constructed in 3 clocks by separate 32-bit
1651      popcounts of each half and a 32-bit add, which is guaranteed not to
1652      overflow.
1653    </desc>
1654    <src/>
1655  </ins>
1656
1657  <ins name="BITREV.i32" title="Bitwise reverse" dests="1" opcode="0x91" opcode2="0xD" unused="true" unit="SFU">
1658    <desc>
1659      Only available as 32-bit. Other bitsizes may be derived with swizzles.
1660    </desc>
1661    <src/>
1662  </ins>
1663
1664  <ins name="NOT_OLD.i32" title="Bitwise complement" dests="1" opcode="0x91" opcode2="0xE" unused="true" unit="SFU">
1665    <desc>
1666      For fully featured bitwise operation, see the shift opcodes.
1667    </desc>
1668    <src/>
1669  </ins>
1670
1671  <ins name="NOT_OLD.i64" title="Bitwise complement" dests="1" opcode="0x191" opcode2="0xE" unused="true" unit="SFU">
1672    <desc>
1673      For fully featured bitwise operation, see the shift opcodes.
1674    </desc>
1675    <src/>
1676  </ins>
1677
1678  <ins name="WMASK" title="Warp mask" dests="1" opcode="0x95" unused="true" unit="CVT">
1679    <desc>
1680      Returns the mask of lanes ever active within the warp (subgroup), such
1681      that the source is nonzero. The number of work-items in a subgroup is
1682      given as the popcount of this value with a nonzero input.
1683
1684      An `all()` subgroup operation may be constructed as `WMASK` of the input
1685      compared for equality with `WMASK` of a nonzero value.
1686
1687      An `any()` subgroup operation may be constructed as `WMASK` of the input
1688      compared against zero.
1689    </desc>
1690    <src/>
1691    <subgroup/>
1692  </ins>
1693
1694  <group name="FREXP" title="Fraction/exponent extract" dests="1" opcode="0x99" unused="true" unit="CVT">
1695    <ins name="FREXPM.f32" opcode2="0"/>
1696    <ins name="FREXPM.v2f16" opcode2="1"/>
1697    <ins name="FREXPE.f32" opcode2="2"/>
1698    <ins name="FREXPE.v2f16" opcode2="3"/>
1699    <desc>
1700      Breaks up the floating-point input into its fractional (mantissa) and
1701      exponent parts. By default, this is compatible with the `frexp()` function
1702      in APIs. With the log/sqrt modifiers, the floating point format is
1703      adjusted to be compatible with Valhall's argument reduction for logarithm
1704      and square root computation respectively.
1705    </desc>
1706    <va_mod name="sqrt" start="24" size="1"/>
1707    <va_mod name="log" start="25" size="1"/>
1708    <src float="true" swizzle="true"/>
1709  </group>
1710
1711  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C" unused="true" unit="SFU">
1712    <ins name="FRCP.f32" opcode2="0"/>
1713    <ins name="FRCP.f16" opcode2="1"/>
1714    <ins name="FRSQ.f32" opcode2="2"/>
1715    <ins name="FRSQ.f16" opcode2="3"/>
1716    <ins name="FLOGD.f32" opcode2="8"/>
1717    <ins name="FPCLASS.f32" opcode2="10"/>
1718    <ins name="FPCLASS.f16" opcode2="11"/>
1719    <ins name="FLOG_TABLE.f32" opcode2="12"/>
1720    <ins name="FRCP_APPROX.f32" opcode2="14"/>
1721    <ins name="FRSQ_APPROX.f32" opcode2="15"/>
1722    <desc>
1723      Performs a given special function. The floating-point reciprocal (`FRCP`)
1724      and reciprocal square root (`FRSQ`) instructions may be freely used as-is.
1725      The logarithm instruction (`FLOGD.f32`) requires an argument
1726      reduction. See the transcendentals section for more information. Like the
1727      Bifrost op, `FRSQ_APPROX.f32` does an implicit `FREXPM.f32.sqrt` on the
1728      source.
1729    </desc>
1730    <src float="true" swizzle="true" absneg="true"/>
1731  </group>
1732
1733  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C" unused="true" unit="SFU">
1734    <ins name="FSIN_TABLE.u6" opcode2="4"/> <!-- opcode2 values in this group (4, 5, 6, 13) are disjoint from those of the other group that uses opcode 0x9C. -->
1735    <ins name="FCOS_TABLE.u6" opcode2="5"/>
1736    <ins name="FSINCOS_OFFSET.u6" opcode2="6"/>
1737    <ins name="FEXP_TABLE.u4" opcode2="13"/>
1738    <desc>
1739      Performs a given special function. The trigonometric tables
1740      (`FSIN_TABLE.u6` and `FCOS_TABLE.u6`) are crude, requiring both an
1741      argument reduction and postprocessing.
1742    </desc>
1743    <src/> <!-- Plain source: the float, swizzle and absneg attributes set on the other opcode 0x9C SFU group are absent here. -->
1744  </group>
1745
1746  <group name="FADD" title="Floating-point add" dests="1" opcode2="0" unused="true" unit="FMA">
1747    <ins name="FADD.f32" opcode="0xA4"/>
1748    <ins name="FADD.v2f16" opcode="0xA5"/>
1749    <desc>$A + B$</desc>
1750    <clamp/>
1751    <src absneg="true" swizzle="true">A</src>
1752    <src absneg="true" swizzle="true">B</src>
1753  </group>
1754
1755  <group name="FMIN" title="Floating-point minimum" dests="1" opcode2="2" unused="true" unit="CVT">
1756    <ins name="FMIN.f32" opcode="0xA4"/>
1757    <ins name="FMIN.v2f16" opcode="0xA5"/>
1758    <desc>$\min \{ A, B \}$</desc>
1759    <clamp/>
1760    <src absneg="true" swizzle="true">A</src>
1761    <src absneg="true" swizzle="true">B</src>
1762  </group>
1763
1764  <group name="FMAX" title="Floating-point maximum" dests="1" opcode2="3" unused="true" unit="CVT">
1765    <ins name="FMAX.f32" opcode="0xA4"/>
1766    <ins name="FMAX.v2f16" opcode="0xA5"/>
1767    <desc>$\max \{ A, B \}$</desc>
1768    <clamp/>
1769    <src absneg="true" swizzle="true">A</src>
1770    <src absneg="true" swizzle="true">B</src>
1771  </group>
1772
1773  <group name="V2F32_TO_V2F16" title="Vectorized floating-point conversion" dests="1" opcode2="4" unused="true" unit="CVT">
1774    <ins name="V2F32_TO_V2F16" opcode="0xA5"/>
1775    <desc>
1776      Given a pair of 32-bit floats, output a pair of 16-bit floats packed into
1777      a 32-bit destination.
1778    </desc>
1779    <clamp/>
1780    <roundmode/>
1781    <src absneg="true">A</src>
1782    <src absneg="true">B</src>
1783  </group>
1784
1785  <group name="LDEXP" title="Floating-point rescaling" dests="1" opcode2="6" unused="true" unit="FMA">
1786    <ins name="LDEXP.f32" opcode="0xA4"/>
1787    <ins name="LDEXP.v2f16" opcode="0xA5"/>
1788    <desc>
1789      Computes $A \cdot 2^B$ by adding B to the exponent of A. Used to calculate
1790      various special functions, particularly base-2 exponents. Special case
1791      handling differs from an actual floating-point multiply, so this should
1792      not be used outside fixed instruction sequences.
1793    </desc>
1794    <src absneg="true" swizzle="true">A</src>
1795    <src/>
1796    <roundmode/> <!-- Also has rtna -->
1797    <!-- Also has infinity handling for arctan -->
1798  </group>
1799
1800  <ins name="FEXP.f32" title="Floating-point exponent" dests="1" opcode="0xA4" opcode2="8" unused="true" unit="SFU">
1801    <desc>
1802      Calculates the base-2 exponent of an argument specified as an 8:24
1803      fixed-point. The original argument is passed as well for correct handling
1804      of special cases.
1805    </desc>
1806    <clamp/>
1807    <src>Input as 8:24 fixed-point</src>
1808    <src absneg="true">Input as 32-bit float</src>
1809  </ins>
1810
1811  <ins name="FADD_LSCALE.f32" title="Floating-point add with logarithm scale" dests="1" opcode="0xA4" opcode2="9" unused="true" unit="FMA">
1812    <desc>
1813      Performs a floating-point addition specialized for logarithm computation.
1814    </desc>
1815    <clamp/>
1816    <src absneg="true">A</src>
1817    <src absneg="true">B</src>
1818  </ins>
1819
1820  <ins name="FATAN_ASSIST.f32" title="ATAN calculation helper" dests="1" opcode="0xA4" opcode2="14" unused="true" unit="SFU">
1821    <desc>
1822      Used for `atan2()` implementation. Destination is two 16-bit
1823      values (int and float) for the first form, and a single 32-bit float when
1824      `.second` is set (indicating the FATAN_TABLE.f32 instruction).
1825    </desc>
1826    <va_mod name="second" start="24" size="1"/>
1827    <src>A</src>
1828    <src>B</src>
1829  </ins>
1830
1831  <group name="IADD" title="Integer addition" dests="1" opcode2="0" unused="true" unit="CVT">
1832    <desc>
1833      $A + B$ with optional saturation.
1834
1835      As Valhall lacks swizzle instructions, `IADD.v2i16` with zero is the
1836      canonical lowering for swizzles.
1837    </desc>
1838    <ins name="IADD.u32" opcode="0xA0"/>
1839    <ins name="IADD.v2u16" opcode="0xA1"/>
1840    <ins name="IADD.v4u8" opcode="0xA2"/>
1841    <ins name="IADD.s32" opcode="0xA8"/>
1842    <ins name="IADD.v2s16" opcode="0xA9"/>
1843    <ins name="IADD.v4s8" opcode="0xAA"/>
1844    <ins name="IADD.u64" opcode="0x1A3"/>
1845    <ins name="IADD.s64" opcode="0x1AB"/>
1846    <!-- <ins name="IADD.s32" opcode="0x1A0"/> -->
1847    <src widen="true">A</src>
1848    <src widen="true">B</src>
1849    <saturate/>
1850  </group>
1851
1852  <ins name="MKVEC.v2i16" title="Make 16-bit vector" dests="1" opcode="0xA1" opcode2="0x5" unused="true" unit="CVT">
1853    <desc>Calculates $A | (B \ll 16)$. Used to implement `(ushort2)(A, B)`</desc>
1854    <src swizzle="true">A</src>
1855    <src swizzle="true">B</src>
1856  </ins>
1857
1858  <group name="ISUB" title="Integer subtract" dests="1" opcode2="1" unused="true" unit="CVT">
1859    <ins name="ISUB.u32" opcode="0xA0"/>
1860    <ins name="ISUB.v2u16" opcode="0xA1"/>
1861    <ins name="ISUB.v4u8" opcode="0xA2"/>
1862    <ins name="ISUB.s32" opcode="0xA8"/>
1863    <ins name="ISUB.v2s16" opcode="0xA9"/>
1864    <ins name="ISUB.v4s8" opcode="0xAA"/>
1865    <ins name="ISUB.u64" opcode="0x1A3"/>
1866    <ins name="ISUB.s64" opcode="0x1AB"/>
1867    <desc>$A - B$ with optional saturation</desc>
1868    <src widen="true">A</src>
1869    <src widen="true">B</src>
1870    <saturate/>
1871  </group>
1872
1873  <group name="SEG_ADD" title="Segment addition" dests="1" opcode2="6" unused="true" unit="CVT">
1874    <desc>
1875      Similar to SHADDX, but especially used for loading offsets into
1876      WLS. Usually this is only required for atomic operations, which cannot
1877      directly use wls_pointer as an address.
1878
1879      .neg indicates SEG_SUB instead.
1880    </desc>
1881    <ins name="SEG_ADD.u64" opcode="0x1A3"/>
1882    <va_mod name="neg" start="38" size="1"/>
1883    <va_mod name="preserve_null" start="39" size="1"/>
1884    <src>A</src>
1885    <src widen="true">B</src>
1886  </group>
1887
1888  <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" opcode2="7" unused="true" unit="CVT">
1889    <desc>
1890      Sign- or zero-extend B to 64 bits, left-shift by `shift`, and add the
1891      64-bit value A. These instructions accelerate address arithmetic, but may
1892      be used in full generality for 64-bit integer arithmetic.
1893    </desc>
1894    <ins name="SHADDX.u64" opcode="0x1A3"/>
1895    <ins name="SHADDX.s64" opcode="0x1AB"/>
1896    <imm name="shift" start="20" size="3"/>
1897    <src>A</src>
1898    <src widen="true">B</src>
1899  </group>
1900
1901  <group name="IMUL" title="Integer multiply" dests="1" opcode2="0x0A" unused="true" unit="SFU">
1902    <ins name="IMUL.i32" opcode="0xA0"/>
1903    <ins name="IMUL.v2i16" opcode="0xA1"/>
1904    <ins name="IMUL.v4i8" opcode="0xA2"/>
1905    <ins name="IMUL.s32" opcode="0xA8"/>
1906    <ins name="IMUL.v2s16" opcode="0xA9"/>
1907    <ins name="IMUL.v4s8" opcode="0xAA"/>
1908    <ins name="IMULD.u64" opcode="0x1A3"/>
1909    <!-- <ins name="IMUL.s32" opcode="0x1A0"/> -->
1910    <desc>
1911      $A \cdot B$ with optional saturation. Note the multipliers can only handle up to
1912      32-bit by 32-bit multiplies. The 64-bit "multiply" acts like IMUL.u32 but
1913      additionally writes the high half of the product to the high half of the
1914      64-bit destination. Along with IADD.u32 and IADD.u64, this allows the
1915      construction of a 64-bit multiply in 5 instructions (6 clocks).
1916    </desc>
1917    <src widen="true">A</src>
1918    <src widen="true">B</src>
1919    <saturate/>
1920  </group>
1921
1922  <group name="HADD" title="Integer half-add" dests="1" opcode2="0x0B" unused="true" unit="CVT">
1923    <ins name="HADD.u32" opcode="0xA0"/>
1924    <ins name="HADD.v2u16" opcode="0xA1"/>
1925    <ins name="HADD.v4u8" opcode="0xA2"/>
1926    <ins name="HADD.s32" opcode="0xA8"/>
1927    <ins name="HADD.v2s16" opcode="0xA9"/>
1928    <ins name="HADD.v4s8" opcode="0xAA"/>
1929    <va_mod name="rhadd" start="30" size="1"/>
1930    <src widen="true">A</src>
1931    <src widen="true">B</src>
1932    <desc>
1933      $(A + B) \gg 1$ without intermediate overflow, corresponding to `hadd()` in
1934      OpenCL. With the `.rhadd` modifier set, it instead calculates
1935      $(A + B + 1) \gg 1$ corresponding to `rhadd()` in OpenCL.
1936    </desc>
1937  </group>
1938
1939  <group name="CLPER" title="Cross-lane permute" dests="1" opcode2="0xF" unused="true" unit="SFU">
1940    <ins name="CLPER.i32" opcode="0xA0"/>
1941    <ins name="CLPER.v2u16" opcode="0xA1"/>
1942    <ins name="CLPER.v4u8" opcode="0xA2"/>
1943    <ins name="CLPER.s32" opcode="0xA8"/>
1944    <ins name="CLPER.v2s16" opcode="0xA9"/>
1945    <ins name="CLPER.v4s8" opcode="0xAA"/>
1946    <ins name="CLPER.u64" opcode="0x1A3"/>
1947    <ins name="CLPER.s64" opcode="0x1AB"/>
1948    <!-- <ins name="CLPER.s32" opcode="0x1A0"/> -->
1949    <desc>
1950      Selects the value of A in the subgroup lane given by B. This implements
1951      subgroup broadcasts. It may be used as a primitive for screen space
1952      derivatives in fragment shaders.
1953    </desc>
1954    <src>A</src>
1955    <src widen="true">B</src>
1956    <subgroup/>
1957    <lane_op/>
1958    <inactive_result/>
1959  </group>
1960
1961  <group name="FMA" title="Fused floating-point multiply add" dests="1" unused="true" unit="FMA">
1962    <ins name="FMA.f32" opcode="0xB2"/>
1963    <ins name="FMA.v2f16" opcode="0xB3"/>
1964    <desc>$A \cdot B + C$</desc>
1965    <clamp/>
1966    <src absneg="true" swizzle="true">A</src>
1967    <src absneg="true" swizzle="true">B</src>
1968    <src absneg="true" swizzle="true">C</src>
1969  </group>
1970
1971  <group name="LSHIFT_AND" title="Left shift and bitwise AND" dests="1" opcode2="0x100" unused="true" unit="SFU">
1972    <ins name="LSHIFT_AND.i32" opcode="0xB4"/>
1973    <ins name="LSHIFT_AND.v2i16" opcode="0xB5"/>
1974    <ins name="LSHIFT_AND.v4i8" opcode="0xB6"/>
1975    <ins name="LSHIFT_AND.i64" opcode="0x1B7"/>
1976    <va_mod name="left" start="128" size="1" implied="true"/>
1977    <desc>
1978      Left shifts its first source by a specified amount and bitwise ANDs it with the
1979      second source, optionally inverting the second source or the result.
1980    </desc>
1981    <not_result/>
1982    <src widen="true">A</src>
1983    <src lanes="true" size="8">shift</src>
1984    <src not="true">B</src>
1985  </group>
1986
1987  <group name="RSHIFT_AND" title="Right shift and bitwise AND" dests="1" opcode2="0x000" unused="true" unit="SFU">
1988    <ins name="RSHIFT_AND.i32" opcode="0xB4"/>
1989    <ins name="RSHIFT_AND.v2i16" opcode="0xB5"/>
1990    <ins name="RSHIFT_AND.v4i8" opcode="0xB6"/>
1991    <ins name="RSHIFT_AND.i64" opcode="0x1B7"/>
1992    <va_mod name="left" start="128" size="1" implied="true"/>
1993    <desc>
1994      Right shifts its first source by a specified amount and bitwise ANDs it with the
1995      second source, optionally inverting the second source or the result. If
1996      `signed` is set, the hardware performs an arithmetic right shift; otherwise,
1997      it performs an unsigned right shift.
1998    </desc>
1999    <va_mod name="signed" start="34" size="1"/>
2000    <not_result/>
2001    <src widen="true">A</src>
2002    <src lanes="true" size="8">shift</src>
2003    <src not="true">B</src>
2004  </group>
2005
2006  <group name="LSHIFT_OR" title="Left shift and bitwise OR" dests="1" opcode2="0x101" unused="true" unit="SFU">
2007    <ins name="LSHIFT_OR.i32" opcode="0xB4"/>
2008    <ins name="LSHIFT_OR.v2i16" opcode="0xB5"/>
2009    <ins name="LSHIFT_OR.v4i8" opcode="0xB6"/>
2010    <ins name="LSHIFT_OR.i64" opcode="0x1B7"/>
2011    <va_mod name="left" start="128" size="1" implied="true"/>
2012    <desc>
2013      Left shifts its first source by a specified amount and bitwise ORs it with the
2014      second source, optionally inverting the second source or the result.
2015    </desc>
2016    <not_result/>
2017    <src widen="true">A</src>
2018    <src lanes="true" size="8">shift</src>
2019    <src not="true">B</src>
2020  </group>
2021
2022  <group name="RSHIFT_OR" title="Right shift and bitwise OR" dests="1" opcode2="0x001" unused="true" unit="SFU">
2023    <ins name="RSHIFT_OR.i32" opcode="0xB4"/>
2024    <ins name="RSHIFT_OR.v2i16" opcode="0xB5"/>
2025    <ins name="RSHIFT_OR.v4i8" opcode="0xB6"/>
2026    <ins name="RSHIFT_OR.i64" opcode="0x1B7"/>
2027    <va_mod name="left" start="128" size="1" implied="true"/>
2028    <desc>
2029      Right shifts its first source by a specified amount and bitwise ORs it with the
2030      second source, optionally inverting the second source or the result. If
2031      `signed` is set, the hardware performs an arithmetic right shift; otherwise,
2032      it performs an unsigned right shift.
2033    </desc>
2034    <va_mod name="signed" start="34" size="1"/>
2035    <not_result/>
2036    <src widen="true">A</src>
2037    <src lanes="true" size="8">shift</src>
2038    <src not="true">B</src>
2039  </group>
2040
2041  <group name="LSHIFT_XOR" title="Left shift and bitwise XOR" dests="1" opcode2="0x102" unused="true" unit="SFU">
2042    <ins name="LSHIFT_XOR.i32" opcode="0xB4"/>
2043    <ins name="LSHIFT_XOR.v2i16" opcode="0xB5"/>
2044    <ins name="LSHIFT_XOR.v4i8" opcode="0xB6"/>
2045    <ins name="LSHIFT_XOR.i64" opcode="0x1B7"/>
2046    <va_mod name="left" start="128" size="1" implied="true"/>
2047    <desc>
2048      Left shifts its first source by a specified amount and bitwise XORs it with the
2049      second source, optionally inverting the second source or the result.
2050    </desc>
2051    <not_result/>
2052    <src widen="true">A</src>
2053    <src lanes="true" size="8">shift</src>
2054    <src not="true">B</src>
2055  </group>
2056
2057  <group name="RSHIFT_XOR" title="Right shift and bitwise XOR" dests="1" opcode2="0x002" unused="true" unit="SFU">
2058    <ins name="RSHIFT_XOR.i32" opcode="0xB4"/>
2059    <ins name="RSHIFT_XOR.v2i16" opcode="0xB5"/>
2060    <ins name="RSHIFT_XOR.v4i8" opcode="0xB6"/>
2061    <ins name="RSHIFT_XOR.i64" opcode="0x1B7"/>
2062    <va_mod name="left" start="128" size="1" implied="true"/>
2063    <desc>
2064      Right shifts its first source by a specified amount and bitwise XORs it with the
2065      second source, optionally inverting the second source or the result. If
2066      `signed` is set, the hardware performs an arithmetic right shift; otherwise,
2067      it performs an unsigned right shift.
2068    </desc>
2069    <va_mod name="signed" start="34" size="1"/>
2070    <not_result/>
2071    <src widen="true">A</src>
2072    <src lanes="true" size="8">shift</src>
2073    <src not="true">B</src>
2074  </group>
2075
2076  <ins name="MUX.i32" title="Mux" dests="1" opcode="0xB8" unused="true" unit="SFU">
2077    <desc>
2078      Mux between A and B based on the provided mask. The condition specified
2079      as the `mux` modifier is evaluated on the mask. If true, `A` is chosen,
2080      else `B` is chosen. The `bit` modifier acts bitwise, equivalent to
2081      `bitselect()` in OpenCL, so `MUX.i32.bit A, B, mask` calculates
2082      `(A &amp; mask) | (B &amp; ~mask)`.
2083    </desc>
2084    <va_mod name="mux" start="32" size="2"/>
2085    <src>A</src>
2086    <src>B</src>
2087    <src>Mask</src>
2088  </ins>
2089
2090  <ins name="MUX.v2i16" title="Mux" dests="1" opcode="0xB9" unused="true" unit="SFU">
2091    <desc>
2092      Mux between A and B based on the provided mask. The condition specified
2093      as the `mux` modifier is evaluated on the mask. If true, `A` is chosen,
2094      else `B` is chosen. The `bit` modifier acts bitwise, equivalent to
2095      `bitselect()` in OpenCL, so `MUX.v2i16.bit A, B, mask` calculates
2096      `(A &amp; mask) | (B &amp; ~mask)`.
2097    </desc>
2098    <va_mod name="mux" start="32" size="2"/>
2099    <src swizzle="true">A</src>
2100    <src swizzle="true">B</src>
2101    <src swizzle="true">Mask</src>
2102  </ins>
2103
2104  <ins name="MUX.v4i8" title="Mux" dests="1" opcode="0xBA" unused="true" unit="SFU">
2105    <desc>
2106      Mux between A and B based on the provided mask. The condition specified
2107      as the `mux` modifier is evaluated on the mask. If true, `A` is chosen,
2108      else `B` is chosen. The `bit` modifier acts bitwise, equivalent to
2109      `bitselect()` in OpenCL, so `MUX.v4i8.bit A, B, mask` calculates
2110      `(A &amp; mask) | (B &amp; ~mask)`.
2111    </desc>
2112    <va_mod name="mux" start="32" size="2"/>
2113    <src>A</src>
2114    <src>B</src>
2115    <src>Mask</src>
2116  </ins>
2117
2118  <ins name="CUBE_SSEL" title="Cube S-coordinate select" dests="1" opcode="0xBC" opcode2="0" unused="true" unit="SFU">
2119    <desc>During a cube map transform, select the S coordinate given a selected face.</desc>
2120    <src absneg="true">Z coordinate as 32-bit floating point</src>
2121    <src absneg="true">X coordinate as 32-bit floating point</src>
2122    <src>Cube face index</src>
2123  </ins>
2124
2125  <ins name="CUBE_TSEL" title="Cube T-coordinate select" dests="1" opcode="0xBC" opcode2="1" unused="true" unit="SFU">
2126    <desc>During a cube map transform, select the T coordinate given a selected face.</desc>
2127    <src absneg="true">Y coordinate as 32-bit floating point</src>
2128    <src absneg="true">Z coordinate as 32-bit floating point</src>
2129    <src>Cube face index</src>
2130  </ins>
2131
2132  <ins name="MKVEC.v2i8" title="Make 8-bit vector" dests="1" opcode="0xBD" unit="CVT">
2133    <desc>
2134      Calculates $A | (B \ll 8) | (CD \ll 16)$ for 8-bit A and B and 16-bit CD.
2135
2136      To implement `(uchar4) (A, B, C, D)` in full generality, use the sequence
2137      `MKVEC.v2i8 CD, C, D, #0; MKVEC.v2i8 out, A, B, CD`
2138
2139      `MKVEC.v2i8` also allows zero extending arbitrary 8-bit lanes. For
2140      example, to extend `r0.b3` to `r1`, use `MKVEC.v2i8 r1, r0.b3, 0x0.b0, 0x0`.
2141    </desc>
2142    <src lane="true">A</src>
2143    <src lane="true">B</src>
2144    <src>CD</src>
2145  </ins>
2146
2147  <ins name="CUBEFACE1" title="Cube map transform step 1" dests="1" opcode="0xC0" unused="true" unit="SFU">
2148    <desc>Select the maximum absolute value of its arguments.</desc>
2149    <src absneg="true">X coordinate as 32-bit floating point</src>
2150    <src absneg="true">Y coordinate as 32-bit floating point</src>
2151    <src absneg="true">Z coordinate as 32-bit floating point</src>
2152  </ins>
2153
2154  <ins name="CUBEFACE2_V9" title="Cube map transform step 2" dests="1" opcode="0xC1" unit="SFU">
2155    <desc>Select the cube face index corresponding to the arguments.</desc>
2156    <src absneg="true">X coordinate as 32-bit floating point</src>
2157    <src absneg="true">Y coordinate as 32-bit floating point</src>
2158    <src absneg="true">Z coordinate as 32-bit floating point</src>
2159  </ins>
2160
2161  <group name="IDP" title="8-bit dot product" dests="1" opcode="0xC2" unused="true" unit="FMA">
2162    <desc>
2163      8-bit integer dot product between 4 channel vectors, intended for machine
2164      learning. Available in both unsigned and signed variants, controlling
2165      sign-extension/zero-extension behaviour to the final 32-bit destination.
2166      Saturation is available. Corresponds to the `cl_arm_integer_dot_product_*`
2167      family of OpenCL extensions. Not for actual use, just for completeness.
2168      Instead, use your platform's neural accelerator.
2169
2170      For $A, B \in \{ 0, \ldots, 255 \}^4$ and $\text{Accumulator} \in
2171      \mathbb{Z}$, calculates $(A \cdot B) + \text{Accumulator}$ and optionally
2172      saturates.
2173    </desc>
2174    <ins name="IDP.v4s8" opcode2="0"/>
2175    <ins name="IDP.v4u8" opcode2="1"/>
2176    <src>A</src>
2177    <src>B</src>
2178    <src>Accumulator</src>
2179    <saturate/>
2180  </group>
2181
2182  <group name="ICMP_OR" title="Unsigned integer compare" dests="1" unit="CVT" opcode2="0">
2183    <desc>
2184      Evaluates the given condition, performs a logical OR with the condition in
2185      the result source, and returns the outcome in the given result type (integer
2186      one, integer minus one, or floating-point one). The third source is useful
2187      for chaining together conditions without intermediate bitwise arithmetic;
2188      when this is not desired, tie it to zero.
2189    </desc>
2190    <ins name="ICMP_OR.u32" opcode="0xF0"/>
2191    <ins name="ICMP_OR.v2u16" opcode="0xF1"/>
2192    <ins name="ICMP_OR.v4u8" opcode="0xF2"/>
2193    <cmp int_only="true"/>
2194    <result_type/>
2195    <src widen="true">A</src>
2196    <src widen="true">B</src>
2197    <src>C</src>
2198  </group>
2199
2200  <group name="ICMP_AND" title="Unsigned integer compare" dests="1" unit="CVT" opcode2="1">
2201    <desc>
2202      Evaluates the given condition, performs a logical AND with the condition in
2203      the result source, and returns the outcome in the given result type (integer
2204      one, integer minus one, or floating-point one). The third source is useful
2205      for chaining together conditions without intermediate bitwise arithmetic.
2206    </desc>
2207    <ins name="ICMP_AND.u32" opcode="0xF0"/>
2208    <ins name="ICMP_AND.v2u16" opcode="0xF1"/>
2209    <ins name="ICMP_AND.v4u8" opcode="0xF2"/>
2210    <cmp int_only="true"/>
2211    <result_type/>
2212    <src widen="true">A</src>
2213    <src widen="true">B</src>
2214    <src>C</src>
2215  </group>
2216
2217  <group name="FCMP_OR" title="Floating-point compare" dests="1" unit="CVT" opcode2="0">
2218    <desc>
2219      Evaluates the given condition, performs a logical OR with the condition in
2220      the result source, and returns the outcome in the given result type (integer
2221      one, integer minus one, or floating-point one). The third source is useful
2222      for chaining together conditions without intermediate bitwise arithmetic;
2223      when this is not desired, tie it to zero.
2224    </desc>
2225    <ins name="FCMP_OR.f32" opcode="0xF4"/>
2226    <ins name="FCMP_OR.v2f16" opcode="0xF5"/>
2227    <cmp/>
2228    <result_type/>
2229    <src absneg="true" swizzle="true">A</src>
2230    <src absneg="true" swizzle="true">B</src>
2231    <src>C</src>
2232  </group>
2233
2234  <group name="FCMP_AND" title="Floating-point compare" dests="1" unit="CVT" opcode2="1">
2235    <desc>
2236      Evaluates the given condition, performs a logical AND with the condition in
2237      the result source, and returns the outcome in the given result type (integer
2238      one, integer minus one, or floating-point one). The third source is useful
2239      for chaining together conditions without intermediate bitwise arithmetic.
2240    </desc>
2241    <ins name="FCMP_AND.f32" opcode="0xF4"/>
2242    <ins name="FCMP_AND.v2f16" opcode="0xF5"/>
2243    <cmp/>
2244    <result_type/>
2245    <src absneg="true" swizzle="true">A</src>
2246    <src absneg="true" swizzle="true">B</src>
2247    <src>C</src>
2248  </group>
2249
2250  <group name="ICMP_OR" title="Signed integer compare" dests="1" unit="CVT" opcode2="0">
2251    <desc>
2252      Evaluates the given condition, performs a logical OR with the condition in
2253      the result source, and returns the outcome in the given result type (integer
2254      one, integer minus one, or floating-point one). The third source is useful
2255      for chaining together conditions without intermediate bitwise arithmetic.
2256    </desc>
2257    <ins name="ICMP_OR.s32" opcode="0xF8"/>
2258    <ins name="ICMP_OR.v2s16" opcode="0xF9"/>
2259    <ins name="ICMP_OR.v4s8" opcode="0xFA"/>
2260    <cmp int_only="true"/>
2261    <result_type/>
2262    <src widen="true">A</src>
2263    <src widen="true">B</src>
2264    <src>C</src>
2265  </group>
2266
2267  <group name="ICMP_AND" title="Signed integer compare" dests="1" unit="CVT" opcode2="1">
2268    <desc>
2269      Evaluates the given condition, performs a logical AND with the condition in
2270      the result source, and returns the outcome in the given result type (integer
2271      one, integer minus one, or floating-point one). The third source is useful
2272      for chaining together conditions without intermediate bitwise arithmetic.
2273    </desc>
2274    <ins name="ICMP_AND.s32" opcode="0xF8"/>
2275    <ins name="ICMP_AND.v2s16" opcode="0xF9"/>
2276    <ins name="ICMP_AND.v4s8" opcode="0xFA"/>
2277    <cmp int_only="true"/>
2278    <result_type/>
2279    <src widen="true">A</src>
2280    <src widen="true">B</src>
2281    <src>C</src>
2282  </group>
2283
2284  <group name="ICMP_MULTI" title="Integer compare" dests="1" unit="CVT" opcode2="2">
2285    <desc>
2286      Evaluates the given condition, performs a logical AND/OR with the condition in
2287      the result source, and returns the outcome in the given result type (integer
2288      one, integer minus one, or floating-point one). The third source is useful
2289      for chaining together conditions without intermediate bitwise arithmetic;
2290      when this is not desired, tie it to zero and use the OR combine mode (do
2291      not set the `.and` modifier).
2292
2293      Used to construct signed 64-bit compares
2294      in 1 `ICMP.u32` and 1 `ICMP.s32` instruction, in conjunction with the `u1`
2295      result type on the low half, the `m1` result type on the high half, and
2296      the result of the low half comparison passed as the third source.
2297    </desc>
2298    <ins name="ICMP_MULTI.u32" opcode="0xF0"/>
2299    <ins name="ICMP_MULTI.s32" opcode="0xF8"/>
2300    <cmp int_only="true"/>
2301    <result_type/>
2302    <src widen="true">A</src>
2303    <src widen="true">B</src>
2304    <src>C</src>
2305  </group>
2306
2307  <ins name="IADD_IMM.i32" title="Integer addition with immediate" dests="1" opcode="0x110" unit="CVT">
2308    <desc>
2309      Adds an arbitrary 32-bit immediate embedded within the instruction stream.
2310      If no modifiers are required, this is preferred to `IADD.i32` with a
2311      constant accessed as a uniform. However, if the constant is available
2312      inline, `IADD.i32` is preferred.
2313
2314      `IADD_IMM.i32` with the source tied to zero is the canonical immediate move.
2315    </desc>
2316    <src>A</src>
2317    <imm name="constant" ir_name="index" start="8" size="32"/>
2318  </ins>
2319
2320  <ins name="IADD_IMM.v2i16" title="Integer addition with immediate" dests="1" opcode="0x111" unit="CVT">
2321    <desc>
2322      Adds an arbitrary pair of 16-bit immediates embedded within the
2323      instruction stream. If no modifiers are required, this is preferred to
2324      `IADD.v2i16` with a constant accessed as a uniform. However, if the
2325      constant is available inline, `IADD.v2i16` is preferred. Adding only a
2326      single 16-bit constant requires replication of the constant.
2327    </desc>
        <!-- Same source/immediate layout as IADD_IMM.i32: the single 32-bit
             `constant` field is the only immediate, so it carries the pair of
             16-bit values described above. -->
2328    <src>A</src>
2329    <imm name="constant" ir_name="index" start="8" size="32"/>
2330  </ins>
2331
2332  <ins name="IADD_IMM.v4i8" title="Integer addition with immediate" dests="1" opcode="0x112" unit="CVT">
2333    <desc>
2334      Adds an arbitrary quad of 8-bit immediates embedded within the
2335      instruction stream. If no modifiers are required, this is preferred to
2336      `IADD.v4i8` with a constant accessed as a uniform. However, if the
2337      constant is available inline, `IADD.v4i8` is preferred. Adding only a
2338      single 8-bit constant requires replication of the constant.
2339    </desc>
        <!-- Same source/immediate layout as IADD_IMM.i32: the single 32-bit
             `constant` field is the only immediate, so it carries the quad of
             8-bit values described above. -->
2340    <src>A</src>
2341    <imm name="constant" ir_name="index" start="8" size="32"/>
2342  </ins>
2343
2344  <ins name="FADD_IMM.f32" title="Floating-point addition with immediate" dests="1" opcode="0x114" unit="FMA">
2345    <desc>
2346      Adds an arbitrary 32-bit immediate embedded within the instruction stream.
2347      If no modifiers are required, this is preferred to `FADD.f32` with a
2348      constant accessed as a uniform. However, if the constant is available
2349      inline, `FADD.f32` is preferred.
2350    </desc>
        <!-- One source (A) plus the 32-bit `constant` immediate field.
             NOTE(review): unlike FADD_IMM.v2f16, this source does not set
             float="true"; confirm whether the difference is intentional. -->
2351    <src>A</src>
2352    <imm name="constant" ir_name="index" start="8" size="32"/>
2353  </ins>
2354
2355  <ins name="FADD_IMM.v2f16" title="Floating-point addition with immediate" dests="1" opcode="0x115" unit="FMA">
2356    <desc>
2357      Adds an arbitrary pair of 16-bit immediates embedded within the
2358      instruction stream. If no modifiers are required, this is preferred to
2359      `FADD.v2f16` with a constant accessed as a uniform. However, if the
2360      constant is available inline, `FADD.v2f16` is preferred. Adding only a
2361      single 16-bit constant requires replication of the constant.
2362    </desc>
        <!-- Source A is flagged float="true"; the single 32-bit `constant`
             field is the only immediate, so it carries the pair of 16-bit
             values described above. -->
2363    <src float="true">A</src>
2364    <imm name="constant" ir_name="index" start="8" size="32"/>
2365  </ins>
2366
2367  <ins name="ATOM1_RETURN.i32" title="Atomic operations on memory with 1" opcode="0x69" opcode2="3" unused="true" unit="LS">
        <!-- 32-bit variant. Shares opcode 0x69 with ATOM1_RETURN.i64; the two
             are told apart by opcode2 (3 here, 5 for .i64). -->
2368    <slot/>
2369    <sr_count/>
2370    <atom_opc_1/>
2371    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2372
2373    <!-- The staging register below is optional for ATOM1.i32, in which case sr_count must be 0 -->
2374    <sr write="true"/>
2375    <src size="64">64-bit address to operate on</src>
2376    <imm name="offset" start="8" size="8"/>
2377  </ins>
2378
2379  <ins name="ATOM1_RETURN.i64" title="Atomic operations on memory with 1" opcode="0x69" opcode2="5" unused="true" unit="LS">
        <!-- 64-bit variant. Shares opcode 0x69 with ATOM1_RETURN.i32; the two
             are told apart by opcode2 (5 here, 3 for .i32). -->
2380    <slot/>
2381    <sr_count/>
2382    <atom_opc_1/>
2383    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2384
2385    <!-- The staging register below is optional for ATOM1.i64, in which case sr_count must be 0 -->
2386    <sr write="true"/>
2387    <src size="64">64-bit address to operate on</src>
2388    <imm name="offset" start="8" size="8"/>
2389  </ins>
2390
2391  <ins name="ATOM.i32" title="Atomic operations on memory" opcode="0x68" opcode2="3" unused="true" unit="LS">
        <!-- 32-bit variant (opcode2 3; ATOM.i64 uses 5 under the same opcode
             0x68). Only a read staging register is declared here, whereas
             ATOM_RETURN.i32 declares both a write and a read one. -->
2392    <slot/>
2393    <sr_count/>
2394    <atom_opc/>
2395    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2396
2397    <sr read="true"/>
2398    <src size="64">64-bit address to operate on</src>
2399    <imm name="offset" start="8" size="8"/>
2400  </ins>
2401
2402  <ins name="ATOM.i64" title="Atomic operations on memory" opcode="0x68" opcode2="5" unused="true" unit="LS">
        <!-- 64-bit variant (opcode2 5; ATOM.i32 uses 3 under the same opcode
             0x68). Only a read staging register is declared here, whereas
             ATOM_RETURN.i64 declares both a write and a read one. -->
2403    <slot/>
2404    <sr_count/>
2405    <atom_opc/>
2406    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2407
2408    <sr read="true"/>
2409    <src size="64">64-bit address to operate on</src>
2410    <imm name="offset" start="8" size="8"/>
2411  </ins>
2412
2413  <ins name="ATOM_RETURN.i32" title="Atomic operations on memory" opcode="0x120" opcode2="3" unused="true" unit="LS">
        <!-- 32-bit variant (opcode2 3; ATOM_RETURN.i64 uses 5 under the same
             opcode 0x120). Declares a write staging register and a read one,
             together with both sr_count and sr_write_count. -->
2414    <slot/>
2415    <sr_count/>
2416    <sr_write_count/>
2417
2418    <!-- The `compare` modifier is only valid with .xchg, where it implements ACMPXCHG -->
2419    <va_mod name="compare" start="26" size="1"/>
2420
2421    <atom_opc/>
2422    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2423
2424    <sr write="true" flags="false"/>
2425    <sr read="true" flags="rw"/>
2426    <src size="64">64-bit address to operate on</src>
2427    <imm name="offset" start="8" size="8"/>
2428  </ins>
2429
2430  <ins name="ATOM_RETURN.i64" title="Atomic operations on memory" opcode="0x120" opcode2="5" unused="true" unit="LS">
        <!-- 64-bit variant (opcode2 5; ATOM_RETURN.i32 uses 3 under the same
             opcode 0x120). Declares a write staging register and a read one,
             together with both sr_count and sr_write_count. -->
2431    <slot/>
2432    <sr_count/>
2433    <sr_write_count/>
        <!-- ATOM_RETURN.i32 documents `compare` as only valid with .xchg (to
             implement ACMPXCHG); presumably the same restriction applies to
             this variant. TODO confirm. -->
2434    <va_mod name="compare" start="26" size="1"/>
2435    <atom_opc/>
2436    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2437
2438    <sr write="true" flags="false"/>
2439    <sr read="true" flags="rw"/>
2440    <src size="64">64-bit address to operate on</src>
2441    <imm name="offset" start="8" size="8"/>
2442  </ins>
2443
2444  <ins name="TEX_FETCH" title="Texel fetch" opcode="0x125" message="tex" unit="T">
2445    <desc>Unfiltered texturing instruction.</desc>
        <!-- Shares its modifier list with TEX_SINGLE, minus <shadow/> and
             <lod_mode/>. -->
2446    <slot/>
2447    <skip/>
2448    <register_type/>
2449    <register_width/>
2450    <write_mask/>
2451    <dimension/>
2452    <wide_indices/>
2453    <array_enable/>
2454    <texel_offset/>
2455    <regfmt pseudo="true"/>
2456
2457    <!-- Leave secondary_register_width as 0 -->
2458    <sr_count/>
2459    <sr_write_count/>
2460
        <!-- One write and one read staging register, then the 64-bit image
             source. The entries marked pseudo="true" (regfmt above, the dummy
             source and the sr_count immediate below) presumably exist only
             for the compiler IR. TODO confirm against the generator. -->
2461    <sr write="true" flags="false"/>
2462    <sr read="true" flags="false"/>
2463    <src size="64">Image to read from</src>
2464    <src pseudo="true">Dummy for IR</src>
2465    <immediate name="sr_count" size="4" pseudo="true"/>
2466  </ins>
2467
2468  <ins name="TEX_SINGLE" title="Texture load" opcode="0x128" message="tex" unit="T">
2469    <desc>Ordinary texturing instruction using a sampler.</desc>
        <!-- Same modifier list as TEX_FETCH, with <shadow/> and <lod_mode/>
             added. -->
2470    <slot/>
2471    <skip/>
2472    <register_type/>
2473    <register_width/>
2474    <write_mask/>
2475    <dimension/>
2476    <wide_indices/>
2477    <array_enable/>
2478    <texel_offset/>
2479    <regfmt pseudo="true"/>
2480    <shadow/>
2481    <lod_mode/>
2482
2483    <!-- Leave secondary_register_width as 0 -->
2484    <sr_count/>
2485    <sr_write_count/>
2486
        <!-- One write and one read staging register, then the 64-bit image
             source. The entries marked pseudo="true" (regfmt above, the dummy
             source and the sr_count immediate below) presumably exist only
             for the compiler IR. TODO confirm against the generator. -->
2487    <sr write="true" flags="false"/>
2488    <sr read="true" flags="false"/>
2489    <src size="64">Image to read from</src>
2490    <src pseudo="true">Dummy for IR</src>
2491    <immediate name="sr_count" size="4" pseudo="true"/>
2492  </ins>
2493
2494  <ins name="TEX_GATHER" title="Texel gather" opcode="0x129" message="tex" unit="T">
2495    <desc>Texture gather instruction.</desc>
        <!-- Compared with TEX_SINGLE this adds <integer_coordinates/> and
             <fetch_component/> and has no <lod_mode/>. -->
2496    <slot/>
2497    <skip/>
2498    <register_type/>
2499    <register_width/>
2500    <write_mask/>
2501    <dimension/>
2502    <wide_indices/>
2503    <array_enable/>
2504    <texel_offset/>
2505    <integer_coordinates/>
2506    <fetch_component/>
2507    <regfmt pseudo="true"/>
2508    <shadow/>
2509
2510    <!-- Leave secondary_register_width as 0.
             NOTE(review): this is the only <sr_count/> in the texture
             instructions that carries count="sr_count"; confirm whether the
             attribute is consumed or is a leftover. -->
2511    <sr_count count="sr_count"/>
2512    <sr_write_count/>
2513
        <!-- One write and one read staging register, then the 64-bit image
             source. The entries marked pseudo="true" (regfmt above, the dummy
             source and the sr_count immediate below) presumably exist only
             for the compiler IR. TODO confirm against the generator. -->
2514    <sr write="true" flags="false"/>
2515    <sr read="true" flags="false"/>
2516    <src size="64">Image to read from</src>
2517    <src pseudo="true">Dummy source for IR</src>
2518    <immediate name="sr_count" size="4" pseudo="true"/>
2519  </ins>
2520
2521  <ins name="TEX_DUAL" title="Dual texture" opcode="0x12F" unused="true" unit="T">
2522    <desc>Pair of texture instructions.</desc>
        <!-- Differs from TEX_SINGLE in that it is marked unused="true", has no
             message attribute, lists <secondary_register_width/> explicitly,
             and declares no pseudo entries (regfmt, dummy source, sr_count
             immediate). -->
2523    <slot/>
2524    <skip/>
2525    <register_type/>
2526    <register_width/>
2527    <secondary_register_width/>
2528    <write_mask/>
2529    <dimension/>
2530    <wide_indices/>
2531    <array_enable/>
2532    <texel_offset/>
2533    <shadow/>
2534    <lod_mode/>
2535
2536    <sr_count/>
2537    <sr_write_count/>
2538
        <!-- One write and one read staging register, then the 64-bit image
             source. -->
2539    <sr write="true" flags="false"/>
2540    <sr read="true" flags="false"/>
2541    <src size="64">Image to read from</src>
2542  </ins>
2543
2544  <ins name="VAR_TEX_BUF_SINGLE" title="Fused varying-texturing" opcode="0x130" unused="true" unit="VT">
2545    <desc>
2546      Only works for FP32 varyings. Performance characteristics are similar
2547      to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2548    </desc>
        <!-- Modifier, staging and source lists match VAR_TEX_SINGLE; the two
             entries differ in opcode and in the varying load named in the
             description (LD_VAR_BUF_IMM_F32 here). -->
2549    <slot/>
2550    <skip/>
2551    <sample_and_update/>
2552    <register_type/>
2553    <vartex_register_width/>
2554    <dimension/>
2555    <array_enable/>
2556    <shadow/>
2557    <lod_mode/>
2558
2559    <sr_write_count/>
2560
        <!-- A single write staging register, then two sources: the 64-bit
             image and the varying offset. -->
2561    <sr write="true"/>
2562    <src size="64">Image to read from</src>
2563    <src>Varying offset</src>
2564  </ins>
2565
2566  <ins name="VAR_TEX_BUF_GATHER" title="Fused varying-texturing" opcode="0x131" unused="true" unit="VT">
2567    <desc>
2568      Only works for FP32 varyings. Performance characteristics are similar
2569      to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2570    </desc>
        <!-- Relative to VAR_TEX_BUF_SINGLE this adds <integer_coordinates/>
             and <fetch_component/> and has no <lod_mode/>. -->
2571    <slot/>
2572    <skip/>
2573    <sample_and_update/>
2574    <register_type/>
2575    <vartex_register_width/>
2576    <dimension/>
2577    <array_enable/>
2578    <integer_coordinates/>
2579    <fetch_component/>
2580    <shadow/>
2581
2582    <sr_write_count/>
2583
        <!-- A single write staging register, then two sources: the 64-bit
             image and the varying offset. -->
2584    <sr write="true"/>
2585    <src size="64">Image to read from</src>
2586    <src>Varying offset</src>
2587  </ins>
2588
2589  <ins name="VAR_TEX_BUF_GRADIENT" title="Fused varying-texturing" opcode="0x132" unused="true" unit="VT">
2590    <desc>
2591      Only works for FP32 varyings. Performance characteristics are similar
2592      to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2593    </desc>
        <!-- Relative to VAR_TEX_BUF_SINGLE this replaces <lod_mode/> with
             <lod_bias_disable/> and <lod_clamp_disable/>. -->
2594    <slot/>
2595    <skip/>
2596    <sample_and_update/>
2597    <register_type/>
2598    <vartex_register_width/>
2599    <dimension/>
2600    <array_enable/>
2601    <shadow/>
2602    <lod_bias_disable/>
2603    <lod_clamp_disable/>
2604
2605    <sr_write_count/>
2606
        <!-- A single write staging register, then two sources: the 64-bit
             image and the varying offset. -->
2607    <sr write="true"/>
2608    <src size="64">Image to read from</src>
2609    <src>Varying offset</src>
2610  </ins>
2611
2612  <ins name="VAR_TEX_BUF_DUAL" title="Fused varying-texturing" opcode="0x137" unused="true" unit="VT">
2613    <desc>
2614      Only works for FP32 varyings. Performance characteristics are similar
2615      to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units.
2616    </desc>
        <!-- Child elements are the same as VAR_TEX_BUF_SINGLE; the entries
             differ in opcode and in the description, which names TEX_DUAL
             rather than TEX. -->
2617    <slot/>
2618    <skip/>
2619    <sample_and_update/>
2620    <register_type/>
2621    <vartex_register_width/>
2622    <dimension/>
2623    <array_enable/>
2624    <shadow/>
2625    <lod_mode/>
2626
2627    <sr_write_count/>
2628
        <!-- A single write staging register, then two sources: the 64-bit
             image and the varying offset. -->
2629    <sr write="true"/>
2630    <src size="64">Image to read from</src>
2631    <src>Varying offset</src>
2632  </ins>
2633
2634  <ins name="VAR_TEX_SINGLE" title="Fused varying-texturing" opcode="0x138" unused="true" unit="VT">
2635    <desc>
2636      Only works for FP32 varyings. Performance characteristics are similar
2637      to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2638    </desc>
        <!-- Modifier, staging and source lists match VAR_TEX_BUF_SINGLE; the
             two entries differ in opcode and in the varying load named in the
             description (LD_VAR_IMM_F32 here). -->
2639    <slot/>
2640    <skip/>
2641    <sample_and_update/>
2642    <register_type/>
2643    <vartex_register_width/>
2644    <dimension/>
2645    <array_enable/>
2646    <shadow/>
2647    <lod_mode/>
2648
2649    <sr_write_count/>
2650
        <!-- A single write staging register, then two sources: the 64-bit
             image and the varying offset. -->
2651    <sr write="true"/>
2652    <src size="64">Image to read from</src>
2653    <src>Varying offset</src>
2654  </ins>
2655
2656  <ins name="VAR_TEX_GATHER" title="Fused varying-texturing" opcode="0x139" unused="true" unit="VT">
2657    <desc>
2658      Only works for FP32 varyings. Performance characteristics are similar
2659      to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2660    </desc>
        <!-- Relative to VAR_TEX_SINGLE this adds <integer_coordinates/> and
             <fetch_component/> and has no <lod_mode/>. -->
2661    <slot/>
2662    <skip/>
2663    <sample_and_update/>
2664    <register_type/>
2665    <vartex_register_width/>
2666    <dimension/>
2667    <array_enable/>
2668    <integer_coordinates/>
2669    <fetch_component/>
2670    <shadow/>
2671
2672    <sr_write_count/>
2673
        <!-- A single write staging register, then two sources: the 64-bit
             image and the varying offset. -->
2674    <sr write="true"/>
2675    <src size="64">Image to read from</src>
2676    <src>Varying offset</src>
2677  </ins>
2678
2679  <ins name="VAR_TEX_GRADIENT" title="Fused varying-texturing" opcode="0x13A" unused="true" unit="VT">
2680    <desc>
2681      Only works for FP32 varyings. Performance characteristics are similar
2682      to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2683    </desc>
        <!-- Relative to VAR_TEX_SINGLE this replaces <lod_mode/> with
             <lod_bias_disable/> and <lod_clamp_disable/>. -->
2684    <slot/>
2685    <skip/>
2686    <sample_and_update/>
2687    <register_type/>
2688    <vartex_register_width/>
2689    <dimension/>
2690    <array_enable/>
2691    <shadow/>
2692    <lod_bias_disable/>
2693    <lod_clamp_disable/>
2694
2695    <sr_write_count/>
2696
        <!-- A single write staging register, then two sources: the 64-bit
             image and the varying offset. -->
2697    <sr write="true"/>
2698    <src size="64">Image to read from</src>
2699    <src>Varying offset</src>
2700  </ins>
2701
2702  <ins name="VAR_TEX_DUAL" title="Fused varying-texturing" opcode="0x13F" unused="true" unit="VT">
2703    <desc>
2704      Only works for FP32 varyings. Performance characteristics are similar
2705      to LD_VAR_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units.
2706    </desc>
        <!-- Child elements are the same as VAR_TEX_SINGLE; the entries differ
             in opcode and in the description, which names TEX_DUAL rather
             than TEX. -->
2707    <slot/>
2708    <skip/>
2709    <sample_and_update/>
2710    <register_type/>
2711    <vartex_register_width/>
2712    <dimension/>
2713    <array_enable/>
2714    <shadow/>
2715    <lod_mode/>
2716
2717    <sr_write_count/>
2718
        <!-- A single write staging register, then two sources: the 64-bit
             image and the varying offset. -->
2719    <sr write="true"/>
2720    <src size="64">Image to read from</src>
2721    <src>Varying offset</src>
2722  </ins>
2723
2724  <ins name="FMA_RSCALE.f32" title="Fused floating-point multiply add with exponent bias" dests="1" opcode="0x160" unused="true" unit="FMA">
2725    <desc>
2726      First calculates $A \cdot B + C$ and then biases the exponent by D. Used in
2727      special transcendental function sequences. It should not be used for
2728      general code as its special case handling differs from two back-to-back
2729      `FMA.f32` operations. Equivalent to `FMA.f32` back-to-back with
2730      `LDEXP.f32`.
2731    </desc>
        <!-- A, B and C set absneg="true"; D, the exponent bias from the
             description, is a plain source. -->
2732    <clamp/>
2733    <src absneg="true">A</src>
2734    <src absneg="true">B</src>
2735    <src absneg="true">C</src>
2736    <src>D</src>
2737  </ins>
2738
2739  <ins name="FMA_RSCALE_N.f32" title="Fused floating-point multiply add with exponent bias and zero override" dests="1" opcode="0x161" unused="true" unit="FMA">
2740    <desc>
2741      First calculates $A \cdot B + C$ and then biases the exponent by D. If $A
2742      = 0$ or $B = 0$, the multiply $A \cdot B$ is treated as zero even if an
2743      ordinary multiply would return NaN. Used in special transcendental
2744      function sequences. It should not be used for general code as its special
2745      case handling differs from two back-to-back `FMA.f32` operations.
2746      Equivalent to `FMA.f32` back-to-back with `LDEXP.f32`.
2747    </desc>
        <!-- A, B and C set absneg="true"; D, the exponent bias from the
             description, is a plain source. -->
2748    <clamp/>
2749    <src absneg="true">A</src>
2750    <src absneg="true">B</src>
2751    <src absneg="true">C</src>
2752    <src>D</src>
2753  </ins>
2754
2755  <ins name="FMA_RSCALE_LEFT.f32" title="Fused floating-point multiply add with exponent bias and asymmetric zero handling" dests="1" opcode="0x162" unused="true" unit="FMA">
2756    <desc>
2757      First calculates $A \cdot B + C$ and then biases the exponent by D. If $A
2758      = 0$ or $B = 0$, the multiply is treated as $A$ even if an
2759      ordinary multiply would return NaN. Used in special transcendental
2760      function sequences. It should not be used for general code as its special
2761      case handling differs from two back-to-back `FMA.f32` operations.
2762      Equivalent to `FMA.f32` back-to-back with `LDEXP.f32`.
2763    </desc>
        <!-- A, B and C set absneg="true"; D, the exponent bias from the
             description, is a plain source. -->
2764    <clamp/>
2765    <src absneg="true">A</src>
2766    <src absneg="true">B</src>
2767    <src absneg="true">C</src>
2768    <src>D</src>
2769  </ins>
2770
2771  <ins name="FMA_RSCALE_SCALE16.f32" title="Fused floating-point multiply add with 16-bit exponent bias" dests="1" opcode="0x163" unused="true" unit="FMA">
2772    <desc>
2773      First calculates $A \cdot B + C$ and then biases the exponent by D,
2774      interpreted as a 16-bit value. Used in special transcendental function
2775      sequences. It should not be used for general code as its special case
2776      handling differs from two back-to-back `FMA.f32` operations. Equivalent
2777      to `FMA.f32` back-to-back with `LDEXP.f32`.
2778    </desc>
        <!-- A, B and C set absneg="true"; D, the exponent bias from the
             description (read as a 16-bit value), is a plain source. -->
2779    <clamp/>
2780    <src absneg="true">A</src>
2781    <src absneg="true">B</src>
2782    <src absneg="true">C</src>
2783    <src>D</src>
2784  </ins>
2785
2786</valhall>
2787