# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

import argparse
from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi
import math

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'
NAN = math.nan

has_fmulz = '(options->has_fmulz || \
              (options->has_fmulz_no_denorms && \
               !nir_is_denorm_preserve(info->float_controls_execution_mode, 32)))'
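# Note: fmulz/ffmaz are the multiply variants in which an exact 0.0 source
# produces 0.0 even when the other source is infinity or NaN (see the
# open-coded fmulz patterns further down).  has_fmulz is a C condition string
# attached to the rules that create those opcodes.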

ignore_exact = nir_algebraic.ignore_exact

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value.  An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value.  A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact.  Such operations will only get
# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
# "#" indicates that the given variable will only match constants,
# type indicates that the given variable will only match values from ALU
#    instructions with the given output type,
# (cond) specifies an additional condition function (see nir_search_helpers.h),
# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified.  For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size.  In the search half of the expression this indicates that it
# should only match that particular bit-size.  In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
#
# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle.  If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").
#
# Another set of special "conditions" are
# "nsz": sign of zero is not preserved
# "ninf": infinities are not preserved
# "nnan": nan is not preserved
# These relate to the float controls / FPFastMath modes and are really
# descriptions of the replacement expression rather than conditions.  That is,
# an expression with the "nsz" condition means that the replacement expression
# won't preserve the sign of zero of the result, and so it will be skipped if
# the matching instruction has the 'signed_zero_preserve' flag set.

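# As an illustrative (made-up) example, a rule that rewrites a 32-bit
# multiplication by 2.0 into an addition only when some backend flag asks for
# it could be written as
#
#    (('fmul', 'a@32', 2.0), ('fadd', a, a), 'options->some_hypothetical_flag'),
#
# where 'options->some_hypothetical_flag' stands in for a real
# nir_shader_compiler_options field.
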
# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
def lowered_sincos(c):
    x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
    x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
    return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)
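# The construction above maps the angle into the range [-1, 1] with ffract,
# approximates the sine with the parabola 4*x*(1 - |x|), and then applies the
# usual 0.225 refinement step (y -> y + 0.225*(y*|y| - y)) to reduce the
# maximum error.  The phase constant c selects which function is produced
# (e.g. c = 0.5 yields a sine approximation).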

def intBitsToFloat(i):
    return struct.unpack('!f', struct.pack('!I', i))[0]
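# For example, intBitsToFloat(0x3f800000) == 1.0 and
# intBitsToFloat(0x7f800000) == float('inf').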

# Takes a pattern as input and returns a list of patterns where each
# pattern has a different permutation of fneg/fabs(value) as the replacement
# for the key operands in replacements.
def add_fabs_fneg(pattern, replacements, commutative = True):
    def to_list(pattern):
        return [to_list(i) if isinstance(i, tuple) else i for i in pattern]

    def to_tuple(pattern):
        return tuple(to_tuple(i) if isinstance(i, list) else i for i in pattern)

    def replace_variable(pattern, search, replace):
        for i in range(len(pattern)):
            if pattern[i] == search:
                pattern[i] = replace
            elif isinstance(pattern[i], list):
                replace_variable(pattern[i], search, replace)

    if commutative:
        perms = itertools.combinations_with_replacement(range(4), len(replacements))
    else:
        perms = itertools.product(range(4), repeat=len(replacements))

    result = []

    for perm in perms:
        curr = to_list(pattern)

        for i, (search, base) in enumerate(replacements.items()):
            if perm[i] == 0:
                replace = ['fneg', ['fabs', base]]
            elif perm[i] == 1:
                replace = ['fabs', base]
            elif perm[i] == 2:
                replace = ['fneg', base]
            elif perm[i] == 3:
                replace = base

            replace_variable(curr, search, replace)

        result.append(to_tuple(curr))
    return result
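# For example (illustrative only), with a single key operand and the default
# commutative=True,
#
#    add_fabs_fneg((('fmul', 'ma', b), ('fmul', 'ma', b)), {'ma': a})
#
# returns four copies of the pattern in which 'ma' has been replaced by
# ('fneg', ('fabs', a)), ('fabs', a), ('fneg', a), and plain a respectively.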


optimizations = [
   # These will be recreated by late_algebraic if supported.
   # Lowering here means we don't have to duplicate all other optimization patterns.
   (('fgeu', a, b), ('inot', ('flt', a, b))),
   (('fltu', a, b), ('inot', ('fge', a, b))),
   (('fneo', 0.0, a), ('flt', 0.0, ('fabs', a))),
   (('fequ', 0.0, a), ('inot', ('flt', 0.0, ('fabs', a)))),


   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'),
   (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'),
   (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'),
   (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'),
   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),

   (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))),
    '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'),

   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('udiv', a, 1), a),
   (('idiv', a, 1), a),
   (('umod', a, 1), 0),
   (('imod', a, 1), 0),
   (('imod', a, -1), 0),
   (('irem', a, 1), 0),
   (('irem', a, -1), 0),
   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
   # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
   (('irem', a, '#b(is_pos_power_of_two)'),
    ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
    '!options->lower_bitops'),
   (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),

   (('~fmul', ('fsign', a), ('ffloor', ('fadd', ('fabs', a), 0.5))), ('ftrunc', ('fadd', a, ('fmul', ('fsign', a), 0.5))), '!options->lower_ftrunc || options->lower_ffloor'),

   (('~fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('fabs', ('u2f', a)), ('u2f', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
   (('~fadd', a, 0.0), a),
   # a+0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a
   # floating point instruction, they should flush any input denormals and we
   # can replace -0.0 with 0.0 if the float execution mode allows it.
   (('fadd(is_only_used_as_float,nsz)', 'a', 0.0), a),
   (('iadd', a, 0), a),
   (('iadd_sat', a, 0), a),
   (('isub_sat', a, 0), a),
   (('uadd_sat', a, 0), a),
   (('usub_sat', a, 0), a),
   (('usadd_4x8_vc4', a, 0), a),
   (('usadd_4x8_vc4', a, ~0), ~0),
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
   (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   (('iadd', ('ishl', b, a), ('ishl', c, a)), ('ishl', ('iadd', b, c), a)),
   (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
   (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ine', ('iand', a, b), 0)),
   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ieq', ('iand', a, b), 0)),
   (('ieq', ('ushr(is_used_once)', a, '#b'), 0), ('ult', a, ('ishl', 1, b))),
   (('ine', ('ushr(is_used_once)', a, '#b'), 0), ('uge', a, ('ishl', 1, b))),
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
   (('~fmul', a, 0.0), 0.0),
   # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
   (('fmul(nsz,nnan)', 'a', 0.0), 0.0),
   (('fmulz', a, 0.0), 0.0),
   (('fmulz(nsz)', a, 'b(is_finite_not_zero)'), ('fmul', a, b)),
   (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
   (('fmulz', a, a), ('fmul', a, a)),
   (('ffmaz(nsz)', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c)),
   (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
   (('ffmaz', a, a, b), ('ffma', a, a, b)),
   (('imul', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, ~0), a),
   (('~fmul', a, 1.0), a),
   (('~fmulz', a, 1.0), a),
   # The only effect a*1.0 can have is flushing denormals. If it's only used by
   # a floating point instruction, they should flush any input denormals and
   # this multiplication isn't needed.
   (('fmul(is_only_used_as_float)', a, 1.0), a),
   (('imul', a, 1), a),
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
   # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
   (('~ffma', 0.0, a, b), b),
   (('ffma(is_only_used_as_float,nsz,nnan,ninf)', 0.0, a, b), b),
   (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma(nsz)', a, b, 0.0), ('fmul', a, b)),
   (('ffmaz(nsz)', a, b, 0.0), ('fmulz', a, b)),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
   (('ffmaz(nsz)', 1.0, a, b), ('fadd', a, b)),
   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('ffmaz(nsz)', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
   (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
   (('~flrp', a, b, 0.0), a),
   (('~flrp', a, b, 1.0), b),
   (('~flrp', a, a, b), a),
   (('~flrp', 0.0, a, b), ('fmul', a, b)),

   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),

   (('sdot_4x8_iadd', a, 0, b), b),
   (('udot_4x8_uadd', a, 0, b), b),
   (('sdot_4x8_iadd_sat', a, 0, b), b),
   (('udot_4x8_uadd_sat', a, 0, b), b),
   (('sdot_2x16_iadd', a, 0, b), b),
   (('udot_2x16_uadd', a, 0, b), b),
   (('sdot_2x16_iadd_sat', a, 0, b), b),
   (('udot_2x16_uadd_sat', a, 0, b), b),

   # sudot_4x8_iadd is not commutative at all, so the patterns must be
   # duplicated with zeros on each of the first positions.
   (('sudot_4x8_iadd', a, 0, b), b),
   (('sudot_4x8_iadd', 0, a, b), b),
   (('sudot_4x8_iadd_sat', a, 0, b), b),
   (('sudot_4x8_iadd_sat', 0, a, b), b),

   (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
   (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),

   # Try to let constant folding eliminate the dot-product part.  These are
   # safe because the dot product cannot overflow 32 bits.
   (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
   (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
   (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
   (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
   (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
   (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
   (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
   (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
   (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
   (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),

   # Optimize open-coded fmulz.
   # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b)
   *add_fabs_fneg((('fmul@32(nsz)', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, 'ma'), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, 'mb')),
    ('fmulz', 'ma', 'mb'), has_fmulz), {'ma' : a, 'mb' : b}),
   *add_fabs_fneg((('fmul@32(nsz)', 'ma', ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')),
    ('fmulz', 'ma', b), has_fmulz), {'ma' : a}),

   # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c)
   *add_fabs_fneg((('ffma@32(nsz)', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, 'ma'), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, 'mb'), c),
    ('ffmaz', 'ma', 'mb', c), has_fmulz), {'ma' : a, 'mb' : b}),
   *add_fabs_fneg((('ffma@32(nsz)', 'ma', ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c),
    ('ffmaz', 'ma', b, c), has_fmulz), {'ma' : a}),

   # b == 0.0 ? 1.0 : fexp2(fmul(a, b)) -> fexp2(fmulz(a, b))
   *add_fabs_fneg((('bcsel(nsz,nnan,ninf)', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmul@32', a, 'mb'))),
    ('fexp2', ('fmulz', a, 'mb')),
    has_fmulz), {'mb': b}),
   *add_fabs_fneg((('bcsel', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmulz', a, 'mb'))),
    ('fexp2', ('fmulz', a, 'mb'))), {'mb': b}),
]

# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions.
sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
                                 ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
                        ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
                                 ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
                                 ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
                        ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
                                 ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
                                  ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
                         ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
                                  ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
                         ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
                         ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))

optimizations.extend([
   (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
   (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
   (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
   (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),

   # For the unsigned dot-product, the largest possible value 4*(255*255) =
   # 0x3f804, so we don't have to worry about that intermediate result
   # overflowing.  0x100000000 - 0x3f804 = 0xfffc07fc.  If c is a constant
   # that is less than 0xfffc07fc, then the result cannot overflow ever.
   (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
   (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->has_udot_4x8_sat'),

   # For the signed dot-product, the largest positive value is 4*(-128*-128) =
   # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00.  We
   # don't have to worry about that intermediate result overflowing or
   # underflowing.
   (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->has_sdot_4x8_sat'),

   (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->has_sudot_4x8_sat'),

   (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
])

# Float sizes
for s in [16, 32, 64]:
    optimizations.extend([
       (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

       (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
       (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
       (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),

       (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
       # These are the same as the previous three rules, but it depends on
       # 1-fsat(x) <=> fsat(1-x).  See below.
       (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
       (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),

       (('~fadd@{}'.format(s),    ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f',  c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
       (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

       (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),
       (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),

       # These two aren't flrp lowerings, but do appear in some shaders.
       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),

       # 1 - ((1 - a) * (1 - b))
       # 1 - (1 - a - b + a*b)
       # 1 - 1 + a + b - a*b
       # a + b - a*b
       # a + b*(1 - a)
       # b*(1 - a) + 1*a
       # flrp(b, 1, a)
       (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
    ])

optimizations.extend([
   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),

   (('ftrunc@16', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ftrunc@32', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ftrunc@64', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))),
    '(options->lower_ftrunc || (options->lower_doubles_options & nir_lower_dtrunc)) && !(options->lower_doubles_options & nir_lower_dfloor)'),

   (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'),
   (('fadd@16', a, ('fadd@16', b, ('fneg', ('ffract', a)))), ('fadd@16', b, ('ffloor', a)), '!options->lower_ffloor'),
   (('fadd@32', a, ('fadd@32', b, ('fneg', ('ffract', a)))), ('fadd@32', b, ('ffloor', a)), '!options->lower_ffloor'),
   (('fadd@64', a, ('fadd@64', b, ('fneg', ('ffract', a)))), ('fadd@64', b, ('ffloor', a)), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('fadd@16', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@32', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@64', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@64', a), ('fsub', a, ('ffloor', a)),
    '(options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)) && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),

   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

   (('fdot4', a, 0.0), 0.0),
   (('fdot3', a, 0.0), 0.0),
   (('fdot2', a, 0.0), 0.0),

   (('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d), '!options->lower_fdph'),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b,   0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b,   c,   0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b,   0.0), c), ('fdot2', ('vec2', a, b), c)),

   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

   # Lower fdot to fsum when it is available
   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
])

# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
# bits of the second source.  These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
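#
# For example, for 32-bit values ishl(ishl(a, 20), 20) must yield 0, but
# naively folding it to ishl(a, 40) would actually compute ishl(a, 40 & 31) ==
# ishl(a, 8).  The bcsel in the replacement detects the out-of-range sum and
# returns 0 (and ishr instead clamps the count so the sign keeps replicating).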
for s in [8, 16, 32, 64]:
   mask = s - 1

   ishl = "ishl@{}".format(s)
   ishr = "ishr@{}".format(s)
   ushr = "ushr@{}".format(s)

   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

   optimizations.extend([
       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),

       # To get -1 for large shifts of negative values, ishr must instead
       # clamp the shift count to the maximum value.
       ((ishr, (ishr, a, '#b'), '#c'),
        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
   ])

# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4. This can be turned into an iand and the
# additions before can be reassociated to CSE the iand instruction.

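# For example, for a 32-bit value the first rule below turns
# ishl(ushr(a, 2), 2), i.e. (a >> 2) << 2, into iand(a, ishl(0xffffffff, 2))
# == a & 0xfffffffc, which clears the low two bits just like dividing and then
# multiplying by 4.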
for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):
    a_sz = 'a@{}'.format(size)

    optimizations.extend([
       # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
       (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
       (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),

       # This does not trivially work with ishr.
       (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
    ])

optimizations.extend([
    (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)),
    (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)),
])

for log2 in range(1, 7): # powers of two from 2 to 64
   v = 1 << log2
   mask = 0xffffffff & ~(v - 1)
   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

   optimizations.extend([
       # Reassociate for improved CSE
       (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
   ])

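# For example, with log2 == 4 (v == 16): because b is a multiple of 16 it has
# no bits below bit 4 to lose, so (a + b) & 0xfffffff0 == (a & 0xfffffff0) + b,
# and the masked 'a' term can then be CSE'd across several such additions.
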
# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32).  In addition, a couple rules inside the
# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
# resolution.
for i in [1, 2, 16, 24]:
    lo_mask = 0xffffffff >> i
    hi_mask = (0xffffffff << i) & 0xffffffff

    optimizations.extend([
        # This pattern seems to only help in the soft-fp64 code.
        (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
#        (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
#        (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),

        (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
        (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
#        (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
    ])

optimizations.extend([
   # This is common for address calculations.  Reassociating may enable the
   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
   # instruction or a constant offset field in load / store instructions.
   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),

   # (a + #b) * #c => (a * #c) + (#b * #c)
   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),

   # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)
   (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),
   (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),

   # Comparison simplifications
   (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),
   (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),
   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
   (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),
   (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),
   (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),
   (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),
   (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),
   (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),

   (('iand', ('feq', a, b), ('fneu', a, b)), False),
   (('iand', ('flt', a, b), ('flt', b, a)), False),
   (('iand', ('ieq', a, b), ('ine', a, b)), False),
   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
   (('iand', ('ult', a, b), ('ult', b, a)), False),

   # This helps some shaders because, after some optimizations, they end up
   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
   # matching would be handled by CSE.
   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', 'a(is_not_const)'), '#b'), ('flt', ('fneg', b), a)),
   (('flt', '#b', ('fneg', 'a(is_not_const)')), ('flt', a, ('fneg', b))),
   (('fge', ('fneg', 'a(is_not_const)'), '#b'), ('fge', ('fneg', b), a)),
   (('fge', '#b', ('fneg', 'a(is_not_const)')), ('fge', a, ('fneg', b))),
   (('fneu', ('fneg', 'a(is_not_const)'), '#b'), ('fneu', ('fneg', b), a)),
   (('feq', '#b', ('fneg', 'a(is_not_const)')), ('feq', a, ('fneg', b))),
   (('flt', a, '#b(is_negative_zero)'), ('flt', a, 0.0)),
   (('flt', '#b(is_negative_zero)', a), ('flt', 0.0, a)),
   (('fge', a, '#b(is_negative_zero)'), ('fge', a, 0.0)),
   (('fge', '#b(is_negative_zero)', a), ('fge', 0.0, a)),
   (('fneu', a, '#b(is_negative_zero)'), ('fneu', 0.0, a)),
   (('feq', '#b(is_negative_zero)', a), ('feq', a, 0.0)),

   (('ieq', ('ineg', a), 0),  ('ieq', a, 0)),
   (('ine', ('ineg', a), 0),  ('ine', a, 0)),
   (('ieq', ('iabs', a), 0),  ('ieq', a, 0)),
   (('ine', ('iabs', a), 0),  ('ine', a, 0)),
   (('fneu', ('fabs', a), 0.0), ('fneu', a, 0.0)),
   (('feq', ('fabs', a), 0.0), ('feq', a, 0.0)),
   (('fneu', ('fabs', a), ('fabs', a)), ('fneu', a, a)),
   (('feq', ('fabs', a), ('fabs', a)), ('feq', a, a)),

   # b < fsat(NaN) -> b < 0 -> false, and b < Nan -> false.
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),

   # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),

   # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),

   # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.
   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

   # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.
   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('ior', a, b)),
   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('ior', a, b)),
   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('iand', a, b)),
   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fneu',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ixor', a, b)),
   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f.  This just
   # breaks that apart.
   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
    ('ior', ('flt', c, 0.0), ('ior', a, b))),

   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
   (('~flt',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
   (('~fge',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
   (('~feq',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
   (('~fneu',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),

   # Cannot remove the addition from ilt or ige due to overflow.
   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
   (('ine', ('iadd', a, b), a), ('ine', b, 0)),

   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),
   (('fneu', ('b2f', 'a@1'), 0.0), a),
   (('flt',  0.0, ('b2f', 'a@1')), a),
   (('ieq', ('b2i', 'a@1'), 0),   ('inot', a)),
   (('ine', ('b2i', 'a@1'), 0),   a),
   (('ieq', 'a@1', False), ('inot', a)),
   (('ieq', 'a@1', True), a),
   (('ine', 'a@1', False), a),
   (('ine', 'a@1', True), ('inot', a)),

   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('u2f', a), 0.0), True),
   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)),    # ieq instead?
   (('flt', ('u2f', a), 0.0), False),
   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)),    # ine instead?
   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

   # 0.0 < fabs(a)
   # fabs(a) > 0.0
   # fabs(a) != 0.0 because fabs(a) must be >= 0
   # a != 0.0
   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),

   # -fabs(a) < 0.0
   # fabs(a) > 0.0
   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),

   # 0.0 >= fabs(a)
   # 0.0 == fabs(a)   because fabs(a) must be >= 0
   # 0.0 == a
   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

   # -fabs(a) >= 0.0
   # 0.0 >= fabs(a)
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
   #
   # This should be NaN safe.
   #
   # NaN >= 0 && 1 >= NaN -> false && false -> false
   #
   # vs.
   #
   # NaN == fsat(NaN) -> NaN == 0 -> false
   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),

   # Note: fmin(-a, -b) == -fmax(a, b)
   (('fmax',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('ior', a, b))),
   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),
   (('fmin',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('iand', a, b))),
   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),

   # fmin(b2f(a), b)
   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
   #
   # Since b is a constant, constant folding will eliminate the fmin and the
   # fmax.  If b is > 1.0, the bcsel will be replaced with a b2f.
   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),

   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),

   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, True, b), ('ior', a, b)),
   (('bcsel', a, a, b), ('ior', a, b)),
   (('bcsel', a, b, False), ('iand', a, b)),
   (('bcsel', a, b, a), ('iand', a, b)),
   (('~fmin', a, a), a),
   (('~fmax', a, a), a),
   (('imin', a, a), a),
   (('imax', a, a), a),
   (('umin', a, a), a),
   (('umin', a, 0), 0),
   (('umin', a, -1), a),
   (('umax', a, a), a),
   (('umax', a, 0), a),
   (('umax', a, -1), -1),
   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
   (('umax', ('umax', a, b), b), ('umax', a, b)),
   (('imax', ('imax', a, b), b), ('imax', a, b)),
   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
   (('umin', ('umin', a, b), b), ('umin', a, b)),
   (('imin', ('imin', a, b), b), ('imin', a, b)),
   (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)),
   (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)),
   (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)),
   (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)),
   (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)),
   (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)),
])

for N in [8, 16, 32, 64]:
    b2iN = 'b2i{0}'.format(N)
    optimizations.extend([
        (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)),
        (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)),
    ])

for N in [16, 32, 64]:
    b2fN = 'b2f{0}'.format(N)
    optimizations.extend([
        (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)),
        (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)),
    ])

# Integer sizes
for s in [8, 16, 32, 64]:
    optimizations.extend([
       (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),

       # Simplify logic to detect sign of an integer.
       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ige', a, 0)),
       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),
       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ilt', a, 0)),
       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),
       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),
       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),
       (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),
       (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),
    ])
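# For example, for s == 32 the rules above turn (a & 0x80000000) == 0 into
# a >= 0 (signed) and ushr(a, 31) != 0 into a < 0.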

optimizations.extend([
   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
   (('~fmin', a, ('fabs', a)), a),
   (('imin', a, ('iabs', a)), a),
   (('~fmax', a, ('fneg', ('fabs', a))), a),
   (('imax', a, ('ineg', ('iabs', a))), a),
   (('fmax', a, ('fabs', a)), ('fabs', a)),
   (('imax', a, ('iabs', a)), ('iabs', a)),
   (('fmax', a, ('fneg', a)), ('fabs', a)),
   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
   # fsat(a) returns 0.0.
   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
   (('~fmin', ('fmax', a, -1.0),  0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
   # representing this in the optimizations other than the usual ~.
   (('~fmax', ('fmin', a,  0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. Mark
   # the new comparison precise to prevent it being changed to 'a != 0'.
   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
   (('fsat', ('b2f', a)), ('b2f', a)),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat(nsz)', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
   # Both the left and right patterns are "b" when isnan(a), so this is exact.
   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
   (('fmax', ('fsat(is_used_once)', a), ('fsat(is_used_once)', b)), ('fsat', ('fmax', a, b))),
   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),

   # max(-min(b, a), b) -> max(abs(b), -a)
   # min(-max(b, a), b) -> min(-abs(b), -a)
   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),

   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
   # fsat(b-a).
   #
   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
   #
   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
    ('fsat', ('fadd', ('fneg',  a), b)), '!options->lower_fsat'),

   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),

   # The ior versions are exact because fmin and fmax will always pick a
   # non-NaN value, if one exists.  Therefore (a < NaN) || (a < c) == a <
   # fmax(NaN, c) == a < c.  Mark the fmin or fmax in the replacement as exact
   # to prevent other optimizations from ruining the "NaN cleansing" property
   # of the fmin or fmax.
   (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),
   (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),
   (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),
   (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),
   (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),
   (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),
   (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),
   (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),
   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
   (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),
   (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),
   (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),
   (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),

   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),

   # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 ||
   # a.y < 0.0 || a.y > 1.0 || ...  These patterns rearrange and replace in a
   # single step.  Doing just the replacement can lead to an infinite loop as
   # the pattern is repeatedly applied to the result of the previous
   # application of the pattern.
   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
   (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
   (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),

   # This is how SpvOpFOrdNotEqual might be implemented.  If both values are
   # numbers, then it can be replaced with fneu.
   (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),

   # Other patterns may optimize the resulting iand tree further.
   (('umin', ('iand', a, '#b(is_pos_power_of_two)'), ('iand', c, b)),
    ('iand', ('iand', a, b), ('iand', c, b))),
])
985
986# Float sizes
987for s in [16, 32, 64]:
988    if s == 64:
989        match_fsign_cond = "!options->lower_fsign && !(options->lower_doubles_options & nir_lower_dsign)"
990    else:
991        match_fsign_cond = "!options->lower_fsign"
992    optimizations.extend([
993       # These derive from the previous patterns with the application of b < 0 <=>
994       # 0 < -b.  The transformation should be applied if either comparison is
995       # used once as this ensures that the number of comparisons will not
996       # increase.  The sources to the ior and iand are not symmetric, so the
997       # rules have to be duplicated to get this behavior.
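       # For example, the first rule below turns (0.0 < a) || (b < 0.0) into
       # 0.0 < fmax(a, -b).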
998       (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
999       (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
1000       (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
1001       (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
1002       (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
1003       (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
1004       (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
1005       (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
1006
1007       (('ior', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fmin', ('fabs', a), ('fabs', b)), 0.0)),
1008       (('ior', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fadd', ('fabs', a), ('fabs', b)), 0.0)),
1009       (('iand', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fadd', ('fabs', a), ('fabs', b)), 0.0)),
1010       (('iand', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fmin', ('fabs', a), ('fabs', b)), 0.0)),
1011
1012       # The (i2f32, ...) part is an open-coded fsign.  When that is combined
1013       # with the bcsel, it's basically copysign(1.0, a).  There are some
1014       # behavior differences between this pattern and copysign w.r.t. ±0 and
1015       # NaN.  copysign(x, y) blindly takes the sign bit from y and applies it
1016       # to x, regardless of whether either or both values are NaN.
1017       #
1018       # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,
1019       #            int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0
1020       # If a == ±0: bcsel(True, 1.0, ...) = 1.0,
1021       #            int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1
1022       #
1023       # For all other values of 'a', the original and replacement behave as
1024       # copysign.
1025       #
1026       # Marking the replacement comparisons as precise prevents any future
1027       # optimizations from replacing either of the comparisons with the
1028       # logical-not of the other.
1029       #
1030       # Note: Use b2i32 in the replacement because some platforms that
1031       # support fp16 don't support int16.
1032       (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),
1033        ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),
1034
1035       (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),
1036
1037       # The C spec says, "If the value of the integral part cannot be represented
1038       # by the integer type, the behavior is undefined."  "Undefined" can mean
1039       # "the conversion doesn't happen at all."
1040       (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),
1041
1042       # Ironically, mark these as imprecise because removing the conversions may
1043       # preserve more precision than doing the conversions (e.g.,
1044       # uint(float(0x81818181u)) == 0x81818200).
1045       (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
1046       (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
1047       (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
1048       (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
1049
1050       (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), match_fsign_cond),
1051       (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), match_fsign_cond),
1052
1053       # float? -> float? -> floatS ==> float? -> floatS
1054       (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),
1055
1056       # int? -> float? -> floatS ==> int? -> floatS
1057       (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),
1058       (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),
1059
1060       # float? -> float? -> intS ==> float? -> intS
1061       (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),
1062       (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),
1063
1064       # HLSL's sign function returns an integer
1065       (('i2f{}'.format(s), ('f2i', ('fsign', 'a@{}'.format(s)))), ('fsign', a)),
1066    ])
1067
1068    for B in [32, 64]:
1069        if s < B:
1070            optimizations.extend([
1071               # S = smaller, B = bigger
1072               # floatS -> floatB -> floatS ==> identity
1073               (('~f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),
1074
1075               # floatS -> floatB -> intB ==> floatS -> intB
1076               (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),
1077               (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),
1078
1079               # int? -> floatB -> floatS ==> int? -> floatS
1080               (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)),
1081               (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)),
1082            ])
1083
1084for S in [1, 8, 16, 32]:
1085    for B in [8, 16, 32, 64]:
1086        if B <= S:
1087            continue
1088        optimizations.extend([
1089            # intS -> intB -> intS ==> identity
1090            (('i2i{}'.format(S), ('i2i{}'.format(B), 'a@{}'.format(S))), a),
1091            (('u2u{}'.format(S), ('u2u{}'.format(B), 'a@{}'.format(S))), a),
1092        ])
1093
1094        if B < 16:
1095            continue
1096        for C in [8, 16, 32, 64]:
1097            if C <= S:
1098                continue
1099            optimizations.extend([
1100                # intS -> intC -> floatB ==> intS -> floatB
1101                (('u2f{}'.format(B), ('u2u{}'.format(C), 'a@{}'.format(S))), ('u2f{}'.format(B), a)),
1102                (('i2f{}'.format(B), ('i2i{}'.format(C), 'a@{}'.format(S))), ('i2f{}'.format(B), a)),
1103            ])
1104
1105# mediump variants of the above
1106optimizations.extend([
1107    # int32 -> float32 -> float16 ==> int32 -> float16
1108    (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)),
1109    (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)),
1110
1111    # float32 -> float16 -> int16 ==> float32 -> int16
1112    (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)),
1113    (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)),
1114
1115    # float32 -> int32 -> int16 ==> float32 -> int16
1116    (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)),
1117    (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)),
1118
1119    # int32 -> int16 -> float16 ==> int32 -> float16
1120    (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)),
1121    (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)),
1122])
1123
1124# Clean up junk left from 8-bit integer to 16-bit integer lowering.
1125optimizations.extend([
1126    # The u2u16(u2u8(X)) just masks off the upper 8 bits of X.  This can be
1127    # accomplished by masking off the upper 8 bits of the immediate operand to the
1128    # iand instruction.  Oftentimes, both patterns will end up being applied
1129    # to the same original expression tree.
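    # For example, with b == 0x1234 the first rule below rewrites
    # iand(u2u16(u2u8(a)), 0x1234) as iand(a, 0x0034).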
1130    (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'),               ('iand', a, ('iand', b, 0xff))),
1131    (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))),
1132])
1133
1134for op in ['iand', 'ior', 'ixor']:
1135    optimizations.extend([
1136        (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))),
1137        (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))),
1138
1139        # Undistribute extract from a logic op
1140        ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)),
1141        ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)),
1142        ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)),
1143        ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)),
1144
1145        # Undistribute shifts from a logic op
1146        ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)),
1147        ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)),
1148        ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)),
1149    ])
1150
1151# Integer sizes
1152for s in [8, 16, 32, 64]:
1153    last_shift_bit = int(math.log2(s)) - 1
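    # e.g. for s == 32, last_shift_bit == 4, so (b & 1) << last_shift_bit is
    # either 0 or 16 (half the bit size); see the ushr rule at the end of this
    # block.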
1154
1155    lower_umin = 'options->lower_umin'
1156    lower_umax = 'options->lower_umax'
1157    lower_imin = 'false'
1158    lower_imax = 'false'
1159    lower_ior = 'options->lower_bitops'
1160    if s == 64:
1161       lower_umin = '(options->lower_umin || (options->lower_int64_options & nir_lower_minmax64) != 0)'
1162       lower_umax = '(options->lower_umax || (options->lower_int64_options & nir_lower_minmax64) != 0)'
1163       lower_imin = '((options->lower_int64_options & nir_lower_minmax64) != 0)'
1164       lower_imax = '((options->lower_int64_options & nir_lower_minmax64) != 0)'
1165       lower_ior = '(options->lower_bitops || (options->lower_int64_options & nir_lower_logic64) != 0)'
1166
1167    optimizations.extend([
1168       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), lower_umax + ' && !' + lower_ior),
1169       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), lower_umin + ' && !' + lower_ior),
1170       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!'+lower_umax),
1171       (('ior',  ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!'+lower_umin),
1172       (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!'+lower_umin),
1173       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!'+lower_umax),
1174
1175       (('bcsel', ('ult', 'b@{}'.format(s), a), b, a), ('umin', a, b), '!'+lower_umin),
1176       (('bcsel', ('ult', 'a@{}'.format(s), b), b, a), ('umax', a, b), '!'+lower_umax),
1177       (('bcsel', ('uge', 'a@{}'.format(s), b), b, a), ('umin', a, b), '!'+lower_umin),
1178       (('bcsel', ('uge', 'b@{}'.format(s), a), b, a), ('umax', a, b), '!'+lower_umax),
1179       (('bcsel', ('ilt', 'b@{}'.format(s), a), b, a), ('imin', a, b), '!'+lower_imin),
1180       (('bcsel', ('ilt', 'a@{}'.format(s), b), b, a), ('imax', a, b), '!'+lower_imax),
1181       (('bcsel', ('ige', 'a@{}'.format(s), b), b, a), ('imin', a, b), '!'+lower_imin),
1182       (('bcsel', ('ige', 'b@{}'.format(s), a), b, a), ('imax', a, b), '!'+lower_imax),
1183
1184       # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
1185       (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a),
1186
1187       # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits)
1188       (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)),
1189       (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)),
1190       (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)),
1191       (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 1), last_shift_bit)), ('ushr', a, ('ishl', b, last_shift_bit))),
1192    ])
1193
1194optimizations.extend([
1195   # Common pattern like 'if (i == 0 || i == 1 || ...)'
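   # The rules below chain, e.g. (i == 0 || i == 1 || i == 2) first becomes
   # uge(1, i) || (i == 2) and then uge(2, i).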
1196   (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
1197   (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
1198   (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
1199   (('ior', a, ('ieq', a, False)), True),
1200
1201   (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)),
1202   (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))),
1203
1204   (('ishl', ('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), '#c'),
1205    ('bcsel', ('ige', ('iand', c, 31), ('find_lsb', b)),
1206              ('ishl', ('iand', a, b), ('iadd', ('iand', c, 31), ('ineg', ('find_lsb', b)))),
1207              ('ushr', ('iand', a, b), ('iadd', ('ineg', ('iand', c, 31)), ('find_lsb', b)))
1208    )
1209   ),
1210
1211   (('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)),
1212    ('ushr', ('iand', a, b), ('find_lsb', b)), '!options->lower_bitops'),
1213
1214   (('ior',  ('b2i', a), ('iand', b, 1)), ('iand', ('ior', ('b2i', a), b), 1)),
1215   (('iand', ('b2i', a), ('iand', b, 1)), ('iand', ('b2i', a), b)),
1216
1217   # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
1218   # The first part of the iand comes from the !__feq64_nonnan.
1219   #
1220   # The second pattern is a reformulation of the first based on the relation
1221   # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
1222   # happens to be y == 0.
1223   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0),  b), c)), ('ilt', a, 0)),
1224    ('iand', ('inot', ('iand',                         b , c)), ('ilt', a, 0))),
1225   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
1226    ('iand', ('inot', ('iand', ('ieq',             b , 0), c)), ('ilt', a, 0))),
1227
1228   # These patterns can result when (a < b || a < c) => (a < min(b, c))
1229   # transformations occur before constant propagation and loop-unrolling.
1230   #
1231   # The flt versions are exact.  If isnan(a), the original pattern is
1232   # trivially false, and the replacements are false too.  If isnan(b):
1233   #
1234   #    a < fmax(NaN, a) => a < a => false vs a < NaN => false
1235   (('flt', a, ('fmax', b, a)), ('flt', a, b)),
1236   (('flt', ('fmin', a, b), a), ('flt', b, a)),
1237   (('~fge', a, ('fmin', b, a)), True),
1238   (('~fge', ('fmax', a, b), a), True),
1239   (('flt', a, ('fmin', b, a)), False),
1240   (('flt', ('fmax', a, b), a), False),
1241   (('~fge', a, ('fmax', b, a)), ('fge', a, b)),
1242   (('~fge', ('fmin', a, b), a), ('fge', b, a)),
1243
1244   (('ilt', a, ('imax', b, a)), ('ilt', a, b)),
1245   (('ilt', ('imin', a, b), a), ('ilt', b, a)),
1246   (('ige', a, ('imin', b, a)), True),
1247   (('ige', ('imax', a, b), a), True),
1248   (('ult', a, ('umax', b, a)), ('ult', a, b)),
1249   (('ult', ('umin', a, b), a), ('ult', b, a)),
1250   (('uge', a, ('umin', b, a)), True),
1251   (('uge', ('umax', a, b), a), True),
1252   (('ilt', a, ('imin', b, a)), False),
1253   (('ilt', ('imax', a, b), a), False),
1254   (('ige', a, ('imax', b, a)), ('ige', a, b)),
1255   (('ige', ('imin', a, b), a), ('ige', b, a)),
1256   (('ult', a, ('umin', b, a)), False),
1257   (('ult', ('umax', a, b), a), False),
1258   (('uge', a, ('umax', b, a)), ('uge', a, b)),
1259   (('uge', ('umin', a, b), a), ('uge', b, a)),
1260   (('ult', a, ('iand', b, a)), False),
1261   (('ult', ('ior', a, b), a), False),
1262   (('uge', a, ('iand', b, a)), True),
1263   (('uge', ('ior', a, b), a), True),
1264
1265   (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
1266   (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
1267   (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),
1268   (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),
1269   (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),
1270   (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),
1271   (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),
1272   (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),
1273   (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),
1274   (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),
1275   (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),
1276   (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))),
1277   (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))),
1278   (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))),
1279   (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
1280   (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),
1281
1282   # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
1283   # negative.
1284   (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
1285    ('iabs', ('ishr', a, b))),
1286   (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),
1287
1288   (('fabs', ('slt', a, b)), ('slt', a, b)),
1289   (('fabs', ('sge', a, b)), ('sge', a, b)),
1290   (('fabs', ('seq', a, b)), ('seq', a, b)),
1291   (('fabs', ('sne', a, b)), ('sne', a, b)),
1292   (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
1293   (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
1294   (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
1295   (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'),
1296   (('seq', ('seq', a, b), 1.0), ('seq', a, b)),
1297   (('seq', ('sne', a, b), 1.0), ('sne', a, b)),
1298   (('seq', ('slt', a, b), 1.0), ('slt', a, b)),
1299   (('seq', ('sge', a, b), 1.0), ('sge', a, b)),
1300   (('sne', ('seq', a, b), 0.0), ('seq', a, b)),
1301   (('sne', ('sne', a, b), 0.0), ('sne', a, b)),
1302   (('sne', ('slt', a, b), 0.0), ('slt', a, b)),
1303   (('sne', ('sge', a, b), 0.0), ('sge', a, b)),
1304   (('seq', ('seq', a, b), 0.0), ('sne', a, b)),
1305   (('seq', ('sne', a, b), 0.0), ('seq', a, b)),
1306   (('seq', ('slt', a, b), 0.0), ('sge', a, b)),
1307   (('seq', ('sge', a, b), 0.0), ('slt', a, b)),
1308   (('sne', ('seq', a, b), 1.0), ('sne', a, b)),
1309   (('sne', ('sne', a, b), 1.0), ('seq', a, b)),
1310   (('sne', ('slt', a, b), 1.0), ('sge', a, b)),
1311   (('sne', ('sge', a, b), 1.0), ('slt', a, b)),
1312   (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1313   (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),
1314   (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),
1315   (('fall_equal8', a, b), ('seq', ('fany_nequal8', a, b), 0.0), 'options->lower_vector_cmp'),
1316   (('fall_equal16', a, b), ('seq', ('fany_nequal16', a, b), 0.0), 'options->lower_vector_cmp'),
1317   (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1318   (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1319   (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1320   (('fany_nequal8', a, b), ('fsat', ('fdot8', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1321   (('fany_nequal16', a, b), ('fsat', ('fdot16', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1322])
1323
1324def vector_cmp(reduce_op, cmp_op, comps):
1325   if len(comps) == 1:
1326      return (cmp_op, 'a.' + comps[0], 'b.' + comps[0])
1327   else:
1328      mid = len(comps) // 2
1329      return (reduce_op, vector_cmp(reduce_op, cmp_op, comps[:mid]),
1330                         vector_cmp(reduce_op, cmp_op, comps[mid:]))
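# vector_cmp recursively splits the component list in half, e.g.
# vector_cmp('iand', 'ieq', 'xyz') expands to
# ('iand', ('ieq', 'a.x', 'b.x'), ('iand', ('ieq', 'a.y', 'b.y'), ('ieq', 'a.z', 'b.z')))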
1331
1332for op in [
1333   ('ball_iequal', 'ieq', 'iand'),
1334   ('ball_fequal', 'feq', 'iand'),
1335   ('bany_inequal', 'ine', 'ior'),
1336   ('bany_fnequal', 'fneu', 'ior'),
1337]:
1338   optimizations.extend([
1339      ((op[0] + '2', a, b), vector_cmp(op[2], op[1], 'xy'), 'options->lower_vector_cmp'),
1340      ((op[0] + '3', a, b), vector_cmp(op[2], op[1], 'xyz'), 'options->lower_vector_cmp'),
1341      ((op[0] + '4', a, b), vector_cmp(op[2], op[1], 'xyzw'), 'options->lower_vector_cmp'),
1342      ((op[0] + '8', a, b), vector_cmp(op[2], op[1], 'abcdefgh'), 'options->lower_vector_cmp'),
1343      ((op[0] + '16', a, b), vector_cmp(op[2], op[1], 'abcdefghijklmnop'), 'options->lower_vector_cmp'),
1344   ])
1345
1346optimizations.extend([
1347   (('feq', ('seq', a, b), 1.0), ('feq', a, b)),
1348   (('feq', ('sne', a, b), 1.0), ('fneu', a, b)),
1349   (('feq', ('slt', a, b), 1.0), ('flt', a, b)),
1350   (('feq', ('sge', a, b), 1.0), ('fge', a, b)),
1351   (('fneu', ('seq', a, b), 0.0), ('feq', a, b)),
1352   (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)),
1353   (('fneu', ('slt', a, b), 0.0), ('flt', a, b)),
1354   (('fneu', ('sge', a, b), 0.0), ('fge', a, b)),
1355   (('feq', ('seq', a, b), 0.0), ('fneu', a, b)),
1356   (('feq', ('sne', a, b), 0.0), ('feq', a, b)),
1357   (('feq', ('slt', a, b), 0.0), ('fge', a, b)),
1358   (('feq', ('sge', a, b), 0.0), ('flt', a, b)),
1359   (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)),
1360   (('fneu', ('sne', a, b), 1.0), ('feq', a, b)),
1361   (('fneu', ('slt', a, b), 1.0), ('fge', a, b)),
1362   (('fneu', ('sge', a, b), 1.0), ('flt', a, b)),
1363
1364   (('fneu', ('fneg', a), a), ('fneu', a, 0.0)),
1365   (('feq', ('fneg', a), a), ('feq', a, 0.0)),
1366   # Emulating booleans
1367   (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1368   (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1369   (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))),
1370   (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
1371   (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
1372   (('iand', 'a@bool16', 1.0), ('b2f', a)),
1373   (('iand', 'a@bool32', 1.0), ('b2f', a)),
1374   (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
1375   # Comparison with the same args.  Note that these are only done for the
1376   # float versions when the source must be a number.  Generally, NaN cmp NaN
1377   # produces the opposite result of X cmp X.  flt is the outlier.  NaN < NaN
1378   # is false, and, for any number X, X < X is also false.
1379   (('ilt', a, a), False),
1380   (('ige', a, a), True),
1381   (('ieq', a, a), True),
1382   (('ine', a, a), False),
1383   (('ult', a, a), False),
1384   (('uge', a, a), True),
1385   (('flt', a, a), False),
1386   (('fge', 'a(is_a_number)', a), True),
1387   (('feq', 'a(is_a_number)', a), True),
1388   (('fneu', 'a(is_a_number)', a), False),
1389   # Logical and bit operations
1390   (('iand', a, a), a),
1391   (('iand', a, 0), 0),
1392   (('iand', a, -1), a),
1393   (('iand', a, ('inot', a)), 0),
1394   (('ior', a, a), a),
1395   (('ior', a, 0), a),
1396   (('ior', a, -1), -1),
1397   (('ior', a, ('inot', a)), -1),
1398   (('ixor', a, a), 0),
1399   (('ixor', a, 0), a),
1400   (('ixor', a, ('ixor', a, b)), b),
1401   (('ixor', a, -1), ('inot', a)),
1402   (('inot', ('inot', a)), a),
1403   (('ior', ('iand', a, b), b), b),
1404   (('ior', ('ior', a, b), b), ('ior', a, b)),
1405   (('iand', ('ior', a, b), b), b),
1406   (('iand', ('iand', a, b), b), ('iand', a, b)),
1407
1408   # It is common for sequences of (x & 1) to occur in large trees.  Replacing
1409   # an expression like ((a & 1) & (b & 1)) with ((a & b) & 1) allows the "&
1410   # 1" to eventually bubble up to the top of the tree.
1411   (('iand', ('iand(is_used_once)', a, b), ('iand(is_used_once)', a, c)),
1412    ('iand', a, ('iand', b, c))),
1413
1414   (('iand@64', a, '#b(is_lower_half_zero)'),
1415    ('pack_64_2x32_split', 0,
1416                           ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1417     '!options->lower_pack_64_2x32_split'),
1418   (('iand@64', a, '#b(is_upper_half_zero)'),
1419    ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1420                           0),
1421     '!options->lower_pack_64_2x32_split'),
1422   (('iand@64', a, '#b(is_lower_half_negative_one)'),
1423    ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1424                           ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1425     '!options->lower_pack_64_2x32_split'),
1426   (('iand@64', a, '#b(is_upper_half_negative_one)'),
1427    ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1428                           ('unpack_64_2x32_split_y', a)),
1429     '!options->lower_pack_64_2x32_split'),
1430
1431   (('ior@64', a, '#b(is_lower_half_zero)'),
1432    ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1433                           ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1434     '!options->lower_pack_64_2x32_split'),
1435   (('ior@64', a, '#b(is_upper_half_zero)'),
1436    ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1437                           ('unpack_64_2x32_split_y', a)),
1438     '!options->lower_pack_64_2x32_split'),
1439   (('ior@64', a, '#b(is_lower_half_negative_one)'),
1440    ('pack_64_2x32_split', -1,
1441                           ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1442     '!options->lower_pack_64_2x32_split'),
1443   (('ior@64', a, '#b(is_upper_half_negative_one)'),
1444    ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1445                           -1),
1446     '!options->lower_pack_64_2x32_split'),
1447
1448   (('ixor@64', a, '#b(is_lower_half_zero)'),
1449    ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1450                           ('ixor', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))),
1451     '!options->lower_pack_64_2x32_split'),
1452   (('ixor@64', a, '#b(is_upper_half_zero)'),
1453    ('pack_64_2x32_split', ('ixor', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)),
1454                           ('unpack_64_2x32_split_y', a)),
1455     '!options->lower_pack_64_2x32_split'),
1456
1457   # DeMorgan's Laws
1458   (('iand', ('inot', a), ('inot', b)), ('inot', ('ior',  a, b))),
1459   (('ior',  ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
1460   # Shift optimizations
1461   (('ishl', 0, a), 0),
1462   (('ishl', a, 0), a),
1463   (('ishr', 0, a), 0),
1464   (('ishr', -1, a), -1),
1465   (('ishr', a, 0), a),
1466   (('ushr', 0, a), 0),
1467   (('ushr', a, 0), a),
1468   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), 'options->has_rotate16'),
1469   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), 'options->has_rotate16'),
1470   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), 'options->has_rotate32'),
1471   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), 'options->has_rotate32'),
1472   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), 'options->has_rotate16'),
1473   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), 'options->has_rotate16'),
1474   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), 'options->has_rotate32'),
1475   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), 'options->has_rotate32'),
1476   (('urol@8',  a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub',  8, b))), '!options->has_rotate8'),
1477   (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), '!options->has_rotate16'),
1478   (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), '!options->has_rotate32'),
1479   (('urol@64', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 64, b)))),
1480   (('uror@8',  a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub',  8, b))), '!options->has_rotate8'),
1481   (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), '!options->has_rotate16'),
1482   (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), '!options->has_rotate32'),
1483   (('uror@64', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 64, b)))),
1484
1485   (('bitfield_select', 0xff000000, ('ishl', 'b@32', 24), ('ushr', a, 8)), ('shfr', b, a, 8), 'options->has_shfr32'),
1486   (('bitfield_select', 0xffff0000, ('ishl', 'b@32', 16), ('extract_u16', a, 1)), ('shfr', b, a, 16), 'options->has_shfr32'),
1487   (('bitfield_select', 0xffffff00, ('ishl', 'b@32', 8), ('extract_u8', a, 3)), ('shfr', b, a, 24), 'options->has_shfr32'),
1488   (('ior', ('ishl', 'b@32', 24), ('ushr', a, 8)), ('shfr', b, a, 8), 'options->has_shfr32'),
1489   (('ior', ('ishl', 'b@32', 16), ('extract_u16', a, 1)), ('shfr', b, a, 16), 'options->has_shfr32'),
1490   (('ior', ('ishl', 'b@32', 8), ('extract_u8', a, 3)), ('shfr', b, a, 24), 'options->has_shfr32'),
1491   (('ior', ('ishl', 'b@32', ('iadd', 32, ('ineg', c))), ('ushr@32', a, c)), ('shfr', b, a, c), 'options->has_shfr32'),
1492
1493   # bfi(X, a, b) = (b & ~X) | (a & X)
1494   # If X = ~0: (b & 0) | (a & 0xffffffff) = a
1495   # If X = 0:  (b & 0xffffffff) | (a & 0) = b
1496   (('bfi', 0xffffffff, a, b), a),
1497   (('bfi', 0x00000000, a, b), b),
1498
1499   # The result of -int(some_bool) is 0 or 0xffffffff, so the result of the
1500   # bfi is either b or c.
1501   (('bfi', ('ineg', ('b2i', 'a@1')), b, c), ('bcsel', a, b, c)),
1502
1503   # bfi(a, a, b) = ((a << find_lsb(a)) & a) | (b & ~a)
1504   #              = (a & a) | (b & ~a)    If a is odd, find_lsb(a) == 0
1505   #              = a | (b & ~a)
1506   #              = a | b
1507   (('bfi', '#a(is_odd)', a, b), ('ior', a, b)),
1508
1509   # bfi(a, b, 0) = ((b << find_lsb(a)) & a) | (0 & ~a)
1510   #              = ((b << find_lsb(a)) & a)
1511   #              = (b & a)               If a is odd, find_lsb(a) == 0
1512   (('bfi', '#a(is_odd)', b, 0), ('iand', a, b)),
1513
1514   # Because 'a' is a positive power of two, the result of the bfi is either 0
1515   # or 'a' depending on whether or not 'b' is odd.  Use 'b&1' for the zero
1516   # value to help platforms that can't have two constants in a bcsel.
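   # e.g. with a == 8, bfi(8, b, 0) is 8 when b is odd and 0 when b is even,
   # so u2f of it is either 8.0 or 0.0.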
1517   (('u2f32', ('bfi', '#a(is_pos_power_of_two)', b, 0)),
1518    ('bcsel', ('ieq', ('iand', b, 1), 0), ('iand', b, 1), ('u2f', a))),
1519   (('u2f', ('bfi', '#a(is_pos_power_of_two)', b, 0)),
1520    ('bcsel', ('ieq', ('iand', b, 1), 0), 0, ('u2f', a))),
1521
1522   # Exponential/logarithmic identities
1523   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
1524   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
1525   # 32-bit fpow should use fmulz to fix https://gitlab.freedesktop.org/mesa/mesa/-/issues/11464 (includes apitrace)
1526   (('fpow@32', a, b), ('fexp2', ('fmulz', ('flog2', a), b)), 'options->lower_fpow && ' + has_fmulz), # a^b = 2^(lg2(a)*b)
1527   (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
1528   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
1529   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
1530    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
1531   (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
1532   (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
1533   (('~fexp2', ('fmul', ('flog2', a), 3.0)), ('fmul', ('fmul', a, a), a)),
1534   (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1535   (('~fexp2', ('fmul', ('flog2', a), 5.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), a)),
1536   (('~fexp2', ('fmul', ('flog2', a), 6.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', a, a))),
1537   (('~fexp2', ('fmul', ('flog2', a), 8.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', ('fmul', a, a), ('fmul', a, a)))),
1538   (('~fpow', a, 1.0), a),
1539   (('~fpow', a, 2.0), ('fmul', a, a)),
1540   (('~fpow', a, 3.0), ('fmul', ('fmul', a, a), a)),
1541   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1542   (('~fpow', 2.0, a), ('fexp2', a)),
1543   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
1544   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
1545   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
1546   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
1547   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
1548   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
1549   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
1550   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
1551   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
1552   (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
1553   (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
1554   (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
1555   (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)),
1556   # Division and reciprocal
1557   (('~fdiv', 1.0, a), ('frcp', a)),
1558   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
1559   (('~frcp', ('frcp', a)), a),
1560   (('~frcp', ('fsqrt', a)), ('frsq', a)),
1561   (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
1562   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
1563   # Trig
1564   (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
1565   (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
1566   # Boolean simplifications
1567   (('ieq', a, True), a),
1568   (('ine(is_not_used_by_if)', a, True), ('inot', a)),
1569   (('ine', a, False), a),
1570   (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
1571   (('bcsel', a, True, False), a),
1572   (('bcsel', a, False, True), ('inot', a)),
1573   (('bcsel', True, b, c), b),
1574   (('bcsel', False, b, c), c),
1575
1576   (('bcsel@16', a, 1.0, 0.0), ('b2f', a)),
1577   (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))),
1578   (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1579   (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1580   (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
1581   (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),
1582   (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1583   (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1584   (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1585   (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1586   (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1587   (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1588
1589   (('bcsel', a, b, b), b),
1590   (('~fcsel', a, b, b), b),
1591
1592   # D3D Boolean emulation
1593   (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
1594   (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
1595   (('bcsel', a, 1, 0), ('b2i', 'a@1')),
1596   (('bcsel', a, 0, 1), ('b2i', ('inot', a))),
1597   (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1598    ('ineg', ('b2i', ('iand', a, b)))),
1599   (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
1600    ('ineg', ('b2i', ('ior', a, b)))),
1601   (('ieq', ('ineg', ('b2i', 'a@1')), -1), a),
1602   (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
1603   (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
1604   (('ilt', ('ineg', ('b2i', 'a@1')), 0), a),
1605   (('ult', 0, ('ineg', ('b2i', 'a@1'))), a),
1606   (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
1607   (('iand', ('ineg', ('b2i', a)), 1),   ('b2i', a)),
1608
1609   # With D3D booleans, imax is AND and umax is OR
1610   (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1611    ('ineg', ('b2i', ('iand', a, b)))),
1612   (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1613    ('ineg', ('b2i', ('ior', a, b)))),
1614   (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1615    ('ineg', ('b2i', ('ior', a, b)))),
1616   (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1617    ('ineg', ('b2i', ('iand', a, b)))),
1618   (('umax', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior',  a, b))),
1619   (('umin', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1620
1621   # Clean up LLVM booleans. b2i output is 0/1 so iand is a no-op.
1622   (('iand', ('b2i', a), 1), ('b2i', a)),
1623
1624   (('ine', ('umin', ('ineg', ('b2i', 'a@1')), b), 0), ('iand', a, ('ine', b, 0))),
1625   (('ine', ('umax', ('ineg', ('b2i', 'a@1')), b), 0), ('ior' , a, ('ine', b, 0))),
1626
1627   # Conversions
1628   (('f2i', ('ftrunc', a)), ('f2i', a)),
1629   (('f2u', ('ftrunc', a)), ('f2u', a)),
1630
1631   # Conversions from 16 bits to 32 bits and back can always be removed
1632   (('f2fmp', ('f2f32', 'a@16')), a),
1633   (('i2imp', ('i2i32', 'a@16')), a),
1634   (('i2imp', ('u2u32', 'a@16')), a),
1635
1636   (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)),
1637   (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)),
1638   (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)),
1639   (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)),
1640
1641   (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)),
1642   (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
1644
1645   (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)),
1646   (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)),
1647   (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1648   (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1649
1650   # Conversions to 16 bits would be lossy so they should only be removed if
1651   # the instruction was generated by the precision lowering pass.
1652   (('f2f32', ('f2fmp', 'a@32')), a),
1653   (('i2i32', ('i2imp', 'a@32')), a),
1654   (('u2u32', ('i2imp', 'a@32')), a),
1655
1656   # typeA@32 -> typeB@16 -> typeB@32 ==> typeA@32 -> typeB@32
1657   (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)),
1658   (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)),
1659   (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)),
1660   (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)),
1661
1662   # typeA@32 -> typeA@16 -> typeB@32 ==> typeA@32 -> typeB@32
1663   (('f2i32', ('f2fmp', 'a@32')), ('f2i32', a)),
1664   (('f2u32', ('f2fmp', 'a@32')), ('f2u32', a)),
1665   (('i2f32', ('i2imp', 'a@32')), ('i2f32', a)),
1666
1667   (('ffloor', 'a(is_integral)'), a),
1668   (('fceil', 'a(is_integral)'), a),
1669   (('ftrunc', 'a(is_integral)'), a),
1670   (('fround_even', 'a(is_integral)'), a),
1671
1672   # fract(x) = x - floor(x), so fract(NaN) = NaN
1673   (('~ffract', 'a(is_integral)'), 0.0),
1674   (('fabs', 'a(is_not_negative)'), a),
1675   (('iabs', 'a(is_not_negative)'), a),
1676   (('fsat', 'a(is_not_positive)'), 0.0),
1677
1678   (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
1679
1680   # The result of the multiply must be in [-1, 0], so the result of the ffma
1681   # must be in [0, 1].
1682   (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),
1683   (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),
1684   (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
1685   (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
1686
1687   (('fneu', 'a(is_not_zero)', 0.0), True),
1688   (('feq', 'a(is_not_zero)', 0.0), False),
1689
1690   # In this chart, + means value > 0 and - means value < 0.
1691   #
1692   # + >= + -> unknown  0 >= + -> false    - >= + -> false
1693   # + >= 0 -> true     0 >= 0 -> true     - >= 0 -> false
1694   # + >= - -> true     0 >= - -> true     - >= - -> unknown
1695   #
1696   # Using grouping conceptually similar to a Karnaugh map...
1697   #
1698   # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true
1699   # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false
1700   # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false
1701   #
1702   # The flt / ilt cases just invert the expected result.
1703   #
1704   # The results expecting true must be marked imprecise.  The results
1705   # expecting false are fine because NaN compared >= or < anything is false.
1706
1707   (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True),
1708   (('fge', 'a(is_not_positive)',          'b(is_gt_zero)'),               False),
1709   (('fge', 'a(is_lt_zero)',               'b(is_not_negative)'),          False),
1710
1711   (('flt', 'a(is_not_negative)',          'b(is_not_positive)'),          False),
1712   (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'),      True),
1713   (('flt', 'a(is_a_number_lt_zero)',      'b(is_a_number_not_negative)'), True),
1714
1715   (('ine', 'a(is_not_zero)', 0), True),
1716   (('ieq', 'a(is_not_zero)', 0), False),
1717
1718   (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
1719   (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'),      False),
1720   (('ige', 'a(is_lt_zero)',      'b(is_not_negative)'), False),
1721
1722   (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
1723   (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'),      True),
1724   (('ilt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
1725
1726   (('ult', 0, 'a(is_gt_zero)'), True),
1727   (('ult', a, 0), False),
1728
1729   # Packing and then unpacking does nothing
1730   (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
1731   (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
1732   (('unpack_64_2x32_split_x', ('pack_64_2x32', a)), 'a.x'),
1733   (('unpack_64_2x32_split_y', ('pack_64_2x32', a)), 'a.y'),
1734   (('unpack_64_2x32_split_x', ('u2u64', 'a@32')), a),
1735   (('unpack_64_2x32_split_y', ('u2u64', a)), 0),
1736   (('unpack_64_2x32_split_x', ('i2i64', 'a@32')), a),
1737   (('unpack_64_2x32_split_y', ('i2i64(is_used_once)', 'a@32')), ('ishr', a, 31)),
1738   (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)),
1739   (('unpack_64_2x32', ('pack_64_2x32', a)), a),
1740   (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a),
1741   (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1742                           ('unpack_64_2x32_split_y', a)), a),
1743   (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a),
1744                              ('unpack_64_2x32_split_y', a))), a),
1745   (('pack_64_2x32', ('unpack_64_2x32', a)), a),
1746   (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a),
1747
1748   (('unpack_64_4x16', ('pack_64_4x16', a)), a),
1749   (('unpack_64_4x16', ('pack_64_2x32', ('vec2', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d)))), ('vec4', a, b, c, d)),
1750   (('unpack_64_4x16', ('pack_64_2x32_split', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d))), ('vec4', a, b, c, d)),
1751
1752   # Comparing two halves of an unpack separately.  While this optimization
1753   # should be correct for non-constant values, it's less obvious that it's
1754   # useful in that case.  For constant values, the pack will fold and we're
1755   # guaranteed to reduce the whole tree to one instruction.
1756   (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'),
1757             ('ieq', ('unpack_32_2x16_split_y', a), '#c')),
1758    ('ieq', a, ('pack_32_2x16_split', b, c))),
1759
1760   # Byte extraction
1761   (('ushr', 'a@16',  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1762   (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1763   (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
1764   (('ishr', 'a@16',  8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1765   (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1766   (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
1767   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1768   (('ishr', ('iand', a, 0x0000ff00),  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1769   (('ishr', ('iand', a, 0x00ff0000), 16), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1770
1771   # Common pattern in many Vulkan CTS tests that read 8-bit integers from a
1772   # storage buffer.
1773   (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'),
1774   (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'),
1775
1776   # Common pattern after lowering 8-bit integers to 16-bit.
1777   (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))),
1778   (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))),
1779
1780   (('ubfe', a,  0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1781   (('ubfe', a,  8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1782   (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1783   (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1784   (('ibfe', a,  0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'),
1785   (('ibfe', a,  8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1786   (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'),
1787   (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1788
1789   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
1790   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
1791
1792   # The extract_X8(a & 0xff) patterns aren't included because the iand will
1793   # already be converted to extract_u8.
1794   (('extract_i8', ('iand', a, 0x0000ff00), 1), ('extract_i8', a, 1)),
1795   (('extract_i8', ('iand', a, 0x00ff0000), 2), ('extract_i8', a, 2)),
1796   (('extract_i8', ('iand', a, 0xff000000), 3), ('extract_i8', a, 3)),
1797
1798   (('extract_u8', ('iand', a, 0x0000ff00), 1), ('extract_u8', a, 1)),
1799   (('extract_u8', ('iand', a, 0x00ff0000), 2), ('extract_u8', a, 2)),
1800   (('extract_u8', ('iand', a, 0xff000000), 3), ('extract_u8', a, 3)),
1801
1802   (('iand', ('extract_u8',  a, 0), '#b'), ('iand', a, ('iand', b, 0x00ff))),
1803   (('iand', ('extract_u16', a, 0), '#b'), ('iand', a, ('iand', b, 0xffff))),
1804
1805   (('ieq', ('iand', ('extract_u8',  a, '#b'), '#c'), 0), ('ieq', ('iand', a, ('ishl', ('iand', c, 0x00ff), ('imul', ('i2i32', b),  8))), 0)),
1806   (('ine', ('iand', ('extract_u8',  a, '#b'), '#c'), 0), ('ine', ('iand', a, ('ishl', ('iand', c, 0x00ff), ('imul', ('i2i32', b),  8))), 0)),
1807   (('ieq', ('iand', ('extract_u16(is_used_once)', a, '#b'), '#c'), 0), ('ieq', ('iand', a, ('ishl', ('iand', c, 0xffff), ('imul', ('i2i32', b), 16))), 0)),
1808   (('ine', ('iand', ('extract_u16(is_used_once)', a, '#b'), '#c'), 0), ('ine', ('iand', a, ('ishl', ('iand', c, 0xffff), ('imul', ('i2i32', b), 16))), 0)),
1809
1810   # Word extraction
1811   (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1812   (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1813   (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1814   (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1815   (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
1816
1817   (('ubfe', a,  0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1818   (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1819   (('ibfe', a,  0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1820   (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1821
1822   # Collapse nop packing.
1823   (('unpack_32_4x8', ('pack_32_4x8', a)), a),
1824   (('unpack_32_2x16', ('pack_32_2x16', a)), a),
1825   (('unpack_64_4x16', ('pack_64_4x16', a)), a),
1826   (('unpack_64_2x32', ('pack_64_2x32', a)), a),
1827   (('pack_32_4x8', ('unpack_32_4x8', a)), a),
1828   (('pack_32_2x16', ('unpack_32_2x16', a)), a),
1829   (('pack_64_4x16', ('unpack_64_4x16', a)), a),
1830   (('pack_64_2x32', ('unpack_64_2x32', a)), a),
1831
1832   # Packing a u8vec4 to write to an SSBO.
1833   (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))),
1834    ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'),
1835
1836   (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)),
1837   (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)),
1838
1839   # The extract_X16(a & 0xff) patterns aren't included because the iand will
1840   # already be converted to extract_u8.
1841   (('extract_i16', ('iand', a, 0x00ff0000), 1), ('extract_u8', a, 2), '!options->lower_extract_byte'), # extract_u8 is correct
1842   (('extract_u16', ('iand', a, 0x00ff0000), 1), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1843
1844   # Lower pack/unpack
1845   (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'),
1846   (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split || options->lower_pack_split'),
1847   (('pack_half_2x16_split', a, b), ('pack_half_2x16_rtz_split', a, b), 'options->has_pack_half_2x16_rtz'),
1848   (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'),
1849   (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'),
1850   (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'),
1851   (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'),
1852
1853   (('unpack_64_2x32_split_x', ('ushr', a, 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'),
1854   (('u2u32', ('ushr', 'a@64', 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'),
1855
1856   # Useless masking before unpacking
1857   (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
1858   (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
1859   (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
1860   (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
1861   (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
1862   (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
1863
1864   (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)),
1865   (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)),
1866   (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)),
1867   (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)),
1868   (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)),
1869
1870   # Optimize half packing
1871   (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
1872   (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
1873
1874   (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1875    ('pack_half_2x16', ('vec2', a, b))),
1876   (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1877    ('pack_half_2x16', ('vec2', a, b))),
1878
1879   (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)),
1880   (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)),
1881   (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)),
1882
1883   (('ishl', ('pack_half_2x16_rtz_split', a, 0), 16), ('pack_half_2x16_rtz_split', 0, a)),
1884   (('ushr', ('pack_half_2x16_rtz_split', 0, a), 16), ('pack_half_2x16_rtz_split', a, 0)),
1885   (('extract_u16', ('pack_half_2x16_rtz_split', 0, a), 1), ('pack_half_2x16_rtz_split', a, 0)),
1886
1887   (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1888   (('ior',  ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1889
1890   (('iadd', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)),
1891   (('ior',  ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)),
1892
1893   (('pack_uint_2x16', ('vec2', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', b, 0))), ('pack_half_2x16_rtz_split', a, b)),
1894
1895   (('bfi', 0xffff0000, ('pack_half_2x16_split', a, b), ('pack_half_2x16_split', c, d)),
1896    ('pack_half_2x16_split', c, a)),
1897
1898   # The important part here is that ~0xf & 0xfffffffc = ~0xf.
1899   (('iand', ('bfi', 0x0000000f, '#a', b), 0xfffffffc),
1900    ('bfi', 0x0000000f, ('iand', a, 0xfffffffc), b)),
1901   (('iand', ('bfi', 0x00000007, '#a', b), 0xfffffffc),
1902    ('bfi', 0x00000007, ('iand', a, 0xfffffffc), b)),
1903
1904   # 0x0f << 3 == 0x78, so that's already the maximum possible value.
1905   (('umin', ('ishl', ('iand', a, 0xf), 3), 0x78), ('ishl', ('iand', a, 0xf), 3)),
1906
1907   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)),
1908   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)),
1909   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)),
1910   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)),
1911   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)),
1912   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)),
1913   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)),
1914   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)),
1915
1916   # Reduce intermediate precision with int64.
1917   (('u2u32', ('iadd(is_used_once)', 'a@64', b)),
1918    ('iadd', ('u2u32', a), ('u2u32', b))),
1919
1920   # Lowered pack followed by lowered unpack, for the high bits
1921   (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', b)), 32)), ('u2u32', a)),
1922   (('u2u16', ('ushr', ('ior', ('ishl', a, 16), ('u2u32', b)), 16)), ('u2u16', a)),
1923])
1924
1925# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
1926# patterns like those below.
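# For example, one rule the loop below generates is
# (('extract_u8', ('ushr', 'a@32', 8), 0), ('extract_u8', a, 1)).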
1927for op in ('ushr', 'ishr'):
1928   optimizations.extend([(('extract_u8', (op, 'a@16',  8),     0), ('extract_u8', a, 1))])
1929   optimizations.extend([(('extract_u8', (op, 'a@32',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
1930   optimizations.extend([(('extract_u8', (op, 'a@64',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
1931
1932optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
1933
1934# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
1935# patterns like those below.
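# For example, one rule the loop below generates is
# (('extract_u8', ('ishl', 'a@32', 24), 3), ('extract_u8', a, 0)).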
1936for op in ('extract_u8', 'extract_i8'):
1937   optimizations.extend([((op, ('ishl', 'a@16',      8),     1), (op, a, 0))])
1938   optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
1939   optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
1940
1941for op, repl in [('ieq', 'ieq'), ('ine', 'ine'),
1942                 ('ult', 'ult'), ('ilt', 'ult'),
1943                 ('uge', 'uge'), ('ige', 'uge')]:
1944   optimizations.extend([
1945      ((op, ('pack_64_2x32_split', a, 0), ('pack_64_2x32_split', b, 0)), (repl, a, b)),
1946      ((op, ('pack_64_2x32_split', a, 0), '#b(is_upper_half_zero)'), (repl, a, ('unpack_64_2x32_split_x', b))),
1947      ((op, '#a(is_upper_half_zero)', ('pack_64_2x32_split', b, 0)), (repl, ('unpack_64_2x32_split_x', a), b)),
1948
1949      ((op, ('pack_64_2x32_split', 0, a), ('pack_64_2x32_split', 0, b)), (op, a, b)),
1950      ((op, ('pack_64_2x32_split', 0, a), '#b(is_lower_half_zero)'), (op, a, ('unpack_64_2x32_split_y', b))),
1951      ((op, '#a(is_lower_half_zero)', ('pack_64_2x32_split', 0, b)), (op, ('unpack_64_2x32_split_y', a), b)),
1952   ])
1953
1954optimizations.extend([
1955   # Subtracts
1956   (('ussub_4x8_vc4', a, 0), a),
1957   (('ussub_4x8_vc4', a, ~0), 0),
1958   # Lower all Subtractions first - they can get recombined later
1959   (('fsub', a, b), ('fadd', a, ('fneg', b))),
1960   (('isub', a, b), ('iadd', a, ('ineg', b))),
1961   (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1962   # This is correct.  We don't need isub_sat because the result type is unsigned, so it cannot overflow.
1963   (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1964   (('bitz', a, b), ('inot', ('bitnz', a, b))),
1965
1966   # Propagate negation up multiplication chains
1967   (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
1968   (('fmulz(is_used_by_non_fsat,nsz)', ('fneg', a), b), ('fneg', ('fmulz', a, b))),
1969   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
1970   (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)),
1971   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
1972
1973   # Propagate constants up multiplication chains
1974   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
1975   (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)),
1976   (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)),
1977   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
1978   (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)),
1979   (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)),
1980   (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)),
1981   # Prefer moving out a multiplication for more MAD/FMA-friendly code
1982   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)),
1983   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
1984   (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)),
1985   (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)),
1986   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),
1987
1988   # Reassociate constants in add/mul chains so they can be folded together.
1989   # For now, we mostly only handle cases where the constants are separated by
1990   # a single non-constant.  We could do better eventually.
1991   (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
1992   (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)),
1993   (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)),
1994   (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)),
1995   (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)),
1996   (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)),
1997   (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
1998   (('~fadd', '#a',          ('fadd', 'b(is_not_const)', '#c')),  ('fadd', ('fadd', a,          c),           b)),
1999   (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
2000   (('~fadd', '#a',          ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffma',          b,  c, ('fadd', a,          d))),
2001   (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
2002   (('~fadd', '#a',          ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffmaz',          b,  c, ('fadd', a,          d))),
2003   (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
2004   (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
2005   (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
2006   (('ior',  '#a', ('ior',  'b(is_not_const)', '#c')), ('ior',  ('ior',  a, c), b)),
2007   (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
2008   (('ior', ('iand', a, '#c'), ('ior', b, ('iand', a, '#d'))), ('ior', b, ('iand', a, ('ior', c, d)))),
2009
2010   # Reassociate add chains for more MAD/FMA-friendly code
2011   (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)),
2012
2013   # Drop mul-div by the same value when there's no wrapping.
2014   (('idiv', ('imul(no_signed_wrap)', a, b), b), a),
2015
2016   # By definition...
2017   (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
2018   (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
2019   (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
2020   (('bcsel', ('ige', ('ifind_msb_rev', a), 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)),
2021   (('bcsel', ('ige', ('ufind_msb_rev', a), 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)),
2022
2023   (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
2024   (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
2025   (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
2026   (('bcsel', ('ine', a, 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)),
2027   (('bcsel', ('ine', a, 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)),
2028
2029   (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
2030   (('bcsel', ('ine', a, -1), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)),
2031
2032   (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), -1), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
2033   (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2034   (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
2035   (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2036   (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), ('ifind_msb', a)), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
2037   (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2038   (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), ('ifind_msb', a), ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'),
2039   (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2040   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2041   (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2042   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2043   (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2044
2045   (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), -1), ('ifind_msb', a), '!options->lower_ifind_msb'),
2046   (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'),
2047   (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'),
2048   (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
2049   (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), ('ifind_msb_rev', a)), ('ifind_msb', a), '!options->lower_ifind_msb'),
2050   (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'),
2051   (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), ('ifind_msb_rev', a), ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'),
2052   (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
2053   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'),
2054   (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
2055   (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'),
2056   (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'),
2057
2058   # Clear the LSB
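   # Note: ~(-a) == a - 1, so the replacement is the familiar a & (a - 1)
   # idiom for clearing the lowest set bit.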
2059   (('iand', a, ('inot', ('ishl', 1, ('find_lsb', a)))), ('iand', a, ('inot', ('ineg', a)))),
2060
2061   # This is safe. Both ufind_msb_rev and bitfield_reverse can only have
2062   # 32-bit sources, so the transformation can only generate correct NIR.
2063   (('find_lsb', ('bitfield_reverse', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'),
2064   (('ufind_msb_rev', ('bitfield_reverse', a)), ('find_lsb', a), '!options->lower_find_lsb'),
2065
2066   (('ifind_msb', ('f2i32(is_used_once)', a)), ('ufind_msb', ('f2i32', ('fabs', a)))),
2067   (('ifind_msb', ('extract_u8', a, b)),       ('ufind_msb', ('extract_u8', a, b))),
2068   (('ifind_msb', ('extract_u16', a, b)),      ('ufind_msb', ('extract_u16', a, b))),
2069   (('ifind_msb', ('imax', a, 1)),             ('ufind_msb', ('imax', a, 1))),
2070
2071   (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
2072   (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
2073   (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
2074   (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
2075   (('fabs', ('bcsel(is_used_once)', b, ('fneg', a), a)), ('fabs', a)),
2076   (('fabs', ('bcsel(is_used_once)', b, a, ('fneg', a))), ('fabs', a)),
2077   (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),
2078
2079   (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
2080   (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)),
2081
2082   # Misc. lowering
2083   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
2084   (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
2085   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
2086   (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
2087
2088   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
2089    ('bcsel', ('ult', 31, 'bits'), 'insert',
2090              ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
2091    'options->lower_bitfield_insert && options->has_bfm && options->has_bfi'),
2092   (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
2093   (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
2094   (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
2095   (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
2096   (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
2097   (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
2098   (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
2099   (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
2100
2101   (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'),
2102   (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'),
2103
2104   (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)),
2105    'options->lower_uadd_sat || (options->lower_int64_options & (nir_lower_iadd64 | nir_lower_uadd_sat64)) != 0'),
2106   (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'),
2107   (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'),
2108   (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'),
2109
2110   # int64_t sum = a + b;
2111   #
2112   # if (a < 0 && b < 0 && a < sum) {
2113   #    sum = INT64_MIN;
2114   # } else if (a >= 0 && b >= 0 && sum < a) {
2115   #    sum = INT64_MAX;
2116   # }
2117   #
2118   # A couple optimizations are applied.
2119   #
2120   # 1. a < sum => sum >= 0.  This replacement works because it is known that
2121   #    a < 0 and b < 0, so sum should also be < 0 unless there was
2122   #    underflow.
2123   #
2124   # 2. sum < a => sum < 0.  This replacement works because it is known that
2125   #    a >= 0 and b >= 0, so sum should also be >= 0 unless there was
2126   #    overflow.
2127   #
2128   # 3. Invert the second if-condition and swap the order of parameters for
2129   #    the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
2130   #    0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
2131   #
2132   # On Intel Gen11, this saves ~11 instructions.
2133   (('iadd_sat@64', a, b), ('bcsel',
2134                            ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
2135                            0x8000000000000000,
2136                            ('bcsel',
2137                             ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
2138                             ('iadd', a, b),
2139                             0x7fffffffffffffff)),
2140    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),
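   # Worked example: with a == b == -0x5000000000000000, the wrapped sum is
   # 0x6000000000000000 >= 0, so the first condition fires and INT64_MIN is
   # selected, which is the correctly saturated result.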
2141
2142   # int64_t sum = a - b;
2143   #
2144   # if (a < 0 && b >= 0 && a < sum) {
2145   #    sum = INT64_MIN;
2146   # } else if (a >= 0 && b < 0 && a >= sum) {
2147   #    sum = INT64_MAX;
2148   # }
2149   #
2150   # Optimizations similar to the iadd_sat case are applied here.
2151   (('isub_sat@64', a, b), ('bcsel',
2152                            ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
2153                            0x8000000000000000,
2154                            ('bcsel',
2155                             ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
2156                             ('isub', a, b),
2157                             0x7fffffffffffffff)),
2158    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),
2159
2160   # These are done here instead of in the backend because the int64 lowering
2161   # pass will make a mess of the patterns.  The first patterns are
2162   # conditioned on nir_lower_minmax64 because it was not clear that it was
2163   # always an improvement on platforms that have real int64 support.  No
2164   # shaders in shader-db hit this, so it was hard to say one way or the
2165   # other.
2166   (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2167   (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2168   (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2169   (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
2170   (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2171   (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2172
2173   (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2174   (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2175   # 0u < uint(a) <=> uint(a) != 0u
2176   (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
2177
2178   # Alternative lowering that doesn't rely on bfi.
2179   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
2180    ('bcsel', ('ult', 31, 'bits'),
2181     'insert',
2182    (('ior',
2183     ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
2184     ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))),
2185    'options->lower_bitfield_insert && (!options->has_bfm || (!options->has_bfi && !options->has_bitfield_select))'),
2186
2187   # Alternative lowering that uses bitfield_select.
2188   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
2189    ('bcsel', ('ult', 31, 'bits'), 'insert',
2190              ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),
2191    'options->lower_bitfield_insert && options->has_bfm && options->has_bitfield_select'),
2192
2193   (('ibitfield_extract', 'value', 'offset', 'bits'),
2194    ('bcsel', ('ult', 31, 'bits'), 'value',
2195              ('ibfe', 'value', 'offset', 'bits')),
2196    'options->lower_bitfield_extract && options->has_bfe'),
2197
2198   (('ubitfield_extract', 'value', 'offset', 'bits'),
2199    ('bcsel', ('ult', 31, 'bits'), 'value',
2200              ('ubfe', 'value', 'offset', 'bits')),
2201    'options->lower_bitfield_extract && options->has_bfe'),
2202
2203   # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0.
2204   (('bitfield_select', a, b, 0), ('iand', a, b)),
2205   (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)),
2206
2207   # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'
2208   (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),
2209   (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),
2210   (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),
2211   (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),
2212   (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
2213   (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
2214
2215   # Optimizations for ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f))) and such
2216   (('ult', a, ('umin', ('iand', a, b), c)), False),
2217   (('ult', 31, ('umin', '#bits(is_ult_32)', a)), False),
2218   (('ubfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))),
2219    ('ubfe', 'value', 'offset', 'width')),
2220   (('ibfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))),
2221    ('ibfe', 'value', 'offset', 'width')),
2222   (('bfm', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset')))), 'offset'),
2223    ('bfm', 'width', 'offset')),
2224
2225   # open-coded BFM
2226   (('iadd@32', ('ishl', 1, a), -1), ('bfm', a, 0), 'options->has_bfm'),
2227   (('ishl', ('bfm', a, 0), b), ('bfm', a, b)),
2228
2229   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
2230   #
2231   #    If bits is zero, the result will be zero.
2232   #
2233   # These patterns prevent other patterns from generating invalid results
2234   # when count is zero.
2235   (('ubfe', a, b, 0), 0),
2236   (('ibfe', a, b, 0), 0),
2237
2238   (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))),
2239
2240   (('b2i32', ('ine', ('ubfe', a, b, 1), 0)), ('ubfe', a, b, 1)),
2241   (('b2i32', ('ine', ('ibfe', a, b, 1), 0)), ('ubfe', a, b, 1)), # ubfe in the replacement is correct
2242   (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2243   (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2244   (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2245   (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
2246
2247   (('ibitfield_extract', 'value', 'offset', 'bits'),
2248    ('bcsel', ('ieq', 0, 'bits'),
2249     0,
2250     ('ishr',
2251       ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
2252       ('isub', 32, 'bits'))),
2253    'options->lower_bitfield_extract && !options->has_bfe'),
2254
2255   (('ubitfield_extract', 'value', 'offset', 'bits'),
2256    ('iand',
2257     ('ushr', 'value', 'offset'),
2258     ('bcsel', ('ieq', 'bits', 32),
2259      0xffffffff,
2260      ('isub', ('ishl', 1, 'bits'), 1))),
2261    'options->lower_bitfield_extract && !options->has_bfe'),
2262
2263   (('ifind_msb', 'value'),
2264    ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
2265    'options->lower_ifind_msb && !options->has_find_msb_rev && !options->has_uclz'),
2266
2267   (('ifind_msb', 'value'),
2268    ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0),
2269     ('isub', 31, ('ifind_msb_rev', 'value')),
2270     ('ifind_msb_rev', 'value')),
2271    'options->lower_ifind_msb && options->has_find_msb_rev'),
2272
2273   # uclz of an absolute value source almost always does the right thing.
2274   # There are a couple problem values:
2275   #
2276   # * 0x80000000.  Since abs(0x80000000) == 0x80000000, uclz returns 0.
2277   #   However, findMSB(int(0x80000000)) == 30.
2278   #
2279   # * 0xffffffff.  Since abs(0xffffffff) == 1, uclz returns 31.  Section 8.8
2280   #   (Integer Functions) of the GLSL 4.50 spec says:
2281   #
2282   #    For a value of zero or negative one, -1 will be returned.
2283   #
2284   # * Negative powers of two.  uclz(abs(-(1<<x))) returns x, but
2285   #   findMSB(-(1<<x)) should return x-1.
2286   #
2287   # For all negative number cases, including 0x80000000 and 0xffffffff, the
2288   # correct value is obtained from uclz if, instead of negating the (already
2289   # negative) value, the bitwise-not is used.  A conditional bitwise-not can
2290   # be achieved by (x ^ (x >> 31)).
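   # Worked example (assuming uclz(0) == 32): value == -1 gives -1 ^ -1 == 0,
   # and 31 - uclz(0) == -1, matching findMSB(-1).  value == 0x80000000 gives
   # 0x7fffffff, and 31 - uclz(0x7fffffff) == 31 - 1 == 30, as noted above.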
2291   (('ifind_msb', 'value'),
2292    ('isub', 31, ('uclz', ('ixor', 'value', ('ishr', 'value', 31)))),
2293    'options->lower_ifind_msb && options->has_uclz'),
2294
2295   (('ufind_msb', 'value@32'),
2296    ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0),
2297     ('isub', 31, ('ufind_msb_rev', 'value')),
2298     ('ufind_msb_rev', 'value')),
2299    'options->lower_ufind_msb && options->has_find_msb_rev'),
2300
2301   (('ufind_msb', 'value@32'),
2302    ('isub', 31, ('uclz', 'value')),
2303    'options->lower_ufind_msb && options->has_uclz'),
2304
2305   (('uclz', a), ('umin', 32, ('ufind_msb_rev', a)), '!options->has_uclz && options->has_find_msb_rev'),
2306
2307   (('find_lsb', 'value@64'),
2308    ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
2309    'options->lower_find_lsb'),
2310
2311   (('find_lsb', 'value'),
2312    ('ufind_msb', ('u2u32', ('iand', 'value', ('ineg', 'value')))),
2313    'options->lower_find_lsb'),
2314
2315   (('extract_i8', a, 'b@32'),
2316    ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
2317    'options->lower_extract_byte'),
2318
2319   (('extract_u8', a, 'b@32'),
2320    ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
2321    'options->lower_extract_byte'),
2322
2323   (('extract_i16', a, 'b@32'),
2324    ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
2325    'options->lower_extract_word'),
2326
2327   (('extract_u16', a, 'b@32'),
2328    ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
2329    'options->lower_extract_word'),
2330
2331    (('pack_unorm_2x16', 'v'),
2332     ('pack_uvec2_to_uint',
2333        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
2334     'options->lower_pack_unorm_2x16'),
2335
2336    (('pack_unorm_4x8', 'v'),
2337     ('pack_uvec4_to_uint',
2338        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
2339     'options->lower_pack_unorm_4x8 && !options->has_pack_32_4x8'),
2340
2341    (('pack_unorm_4x8', 'v'),
2342     ('pack_32_4x8',
2343        ('f2u8', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
2344     'options->lower_pack_unorm_4x8 && options->has_pack_32_4x8'),
2345
2346    (('pack_snorm_2x16', 'v'),
2347     ('pack_uvec2_to_uint',
2348        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
2349     'options->lower_pack_snorm_2x16'),
2350
2351    (('pack_snorm_4x8', 'v'),
2352     ('pack_uvec4_to_uint',
2353        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
2354     'options->lower_pack_snorm_4x8 && !options->has_pack_32_4x8'),
2355
2356    (('pack_snorm_4x8', 'v'),
2357     ('pack_32_4x8',
2358        ('f2i8', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
2359     'options->lower_pack_snorm_4x8 && options->has_pack_32_4x8'),
2360
2361    (('unpack_unorm_2x16', 'v'),
2362     ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),
2363                                  ('extract_u16', 'v', 1))),
2364              65535.0),
2365     'options->lower_unpack_unorm_2x16'),
2366
2367    (('unpack_unorm_4x8', 'v'),
2368     ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),
2369                                  ('extract_u8', 'v', 1),
2370                                  ('extract_u8', 'v', 2),
2371                                  ('extract_u8', 'v', 3))),
2372              255.0),
2373     'options->lower_unpack_unorm_4x8'),
2374
2375    (('unpack_snorm_2x16', 'v'),
2376     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
2377                                                            ('extract_i16', 'v', 1))),
2378                                           32767.0))),
2379     'options->lower_unpack_snorm_2x16'),
2380
2381    (('unpack_snorm_4x8', 'v'),
2382     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
2383                                                            ('extract_i8', 'v', 1),
2384                                                            ('extract_i8', 'v', 2),
2385                                                            ('extract_i8', 'v', 3))),
2386                                           127.0))),
2387     'options->lower_unpack_snorm_4x8'),
2388
2389   (('pack_half_2x16_split', 'a@32', 'b@32'),
2390    ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
2391    'options->lower_pack_split'),
2392
2393   (('unpack_half_2x16_split_x', 'a@32'),
2394    ('f2f32', ('u2u16', a)),
2395    'options->lower_pack_split && !nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2396
2397   (('unpack_half_2x16_split_x', 'a@32'),
2398    ('f2f32', ('fmul', 1.0, ('u2u16', a))),
2399    'options->lower_pack_split && nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2400
2401   (('unpack_half_2x16_split_y', 'a@32'),
2402    ('f2f32', ('u2u16', ('ushr', a, 16))),
2403    'options->lower_pack_split && !nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2404
2405   (('unpack_half_2x16_split_y', 'a@32'),
2406    ('f2f32', ('fmul', 1.0, ('u2u16', ('ushr', a, 16)))),
2407    'options->lower_pack_split && nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2408
2409   (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
2410   (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),
2411   (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),
2412   # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0
2413   # Mark the new comparisons precise to prevent them being changed to 'a !=
2414   # 0' or 'a == 0'.
2415   (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'),
2416   (('fsign', 'a@64'), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_doubles_options & nir_lower_dsign'),
2417
2418   # Address/offset calculations:
2419   # Drivers supporting imul24 should use the nir_lower_amul() pass, this
2420   # rule converts everyone else to imul:
2421   (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
2422
2423   (('umul24', a, b),
2424    ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
2425    '!options->has_umul24'),
2426   (('umad24', a, b, c),
2427    ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
2428    '!options->has_umad24'),
2429
2430   # Relaxed 24bit ops
2431   (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'),
2432   (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'),
2433   (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'),
2434   (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'),
2435   (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'),
2436   (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'),
2437
2438   (('imad24_ir3', a, b, 0), ('imul24', a, b)),
2439   (('imad24_ir3', a, 0, c), (c)),
2440   (('imad24_ir3', a, 1, c), ('iadd', a, c)),
2441
2442   # if first two srcs are const, crack apart the imad so constant folding
2443   # can clean up the imul:
2444   # TODO ffma should probably get a similar rule:
2445   (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
2446
2447   # These will turn 24b address/offset calc back into 32b shifts, but
2448   # it should be safe to get back some of the bits of precision that we
2449   # already decided were not necessary:
2450   (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
2451   (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
2452   (('imul24', a, 0), (0)),
2453])
2454
2455for bit_size in [8, 16, 32, 64]:
2456   cond = '!options->lower_uadd_sat'
2457   if bit_size == 64:
2458      cond += ' && !(options->lower_int64_options & (nir_lower_iadd64 | nir_lower_uadd_sat64))'
2459   add = 'iadd@' + str(bit_size)
2460
2461   optimizations += [
2462      (('bcsel', ('ult', ('iadd', a, b), a), -1, (add, a, b)), ('uadd_sat', a, b), cond),
2463      (('bcsel', ('uge', ('iadd', a, b), a), (add, a, b), -1), ('uadd_sat', a, b), cond),
2464      (('bcsel', ('ieq', ('uadd_carry', a, b), 0), (add, a, b), -1), ('uadd_sat', a, b), cond),
2465      (('bcsel', ('ine', ('uadd_carry', a, b), 0), -1, (add, a, b)), ('uadd_sat', a, b), cond),
2466   ]
2467
2468for bit_size in [8, 16, 32, 64]:
2469   cond = '!options->lower_usub_sat'
2470   if bit_size == 64:
2471      cond += ' && !(options->lower_int64_options & nir_lower_usub_sat64)'
2472   add = 'iadd@' + str(bit_size)
2473
2474   optimizations += [
2475      (('bcsel', ('ult', a, b), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond),
2476      (('bcsel', ('uge', a, b), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond),
2477      (('bcsel', ('ieq', ('usub_borrow', a, b), 0), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond),
2478      (('bcsel', ('ine', ('usub_borrow', a, b), 0), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond),
2479   ]
2480
2481# bit_size dependent lowerings
2482for bit_size in [8, 16, 32, 64]:
2483   # convenience constants
2484   intmax = (1 << (bit_size - 1)) - 1
2485   intmin = 1 << (bit_size - 1)
2486
2487   optimizations += [
2488      (('iadd_sat@' + str(bit_size), a, b),
2489       ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
2490                                ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'),
2491      (('isub_sat@' + str(bit_size), a, b),
2492       ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
2493                                ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'),
2494   ]
2495
2496invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])
2497
2498for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
2499   optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
2500                         ('iand', (invert[left], a, b), (invert[right], c, d))))
2501   optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
2502                         ('ior', (invert[left], a, b), (invert[right], c, d))))
2503
2504# Optimize x2yN(b2x(x)) -> b2y
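# For example, one rule this generates is (('f2u', ('b2f', a)), ('b2i', a)).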
2505for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
2506   if x != 'f' and y != 'f' and x != y:
2507      continue
2508
2509   b2x = 'b2f' if x == 'f' else 'b2i'
2510   b2y = 'b2f' if y == 'f' else 'b2i'
2511   x2yN = '{}2{}'.format(x, y)
2512   optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
2513
2514# Optimize away x2xN(a@N)
2515for t in ['int', 'uint', 'float', 'bool']:
2516   for N in type_sizes(t):
2517      x2xN = '{0}2{0}{1}'.format(t[0], N)
2518      aN = 'a@{0}'.format(N)
2519      optimizations.append(((x2xN, aN), a))
2520
2521# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
2522# In particular, we can optimize away everything except upcast of downcast and
2523# upcasts where the type differs from the other cast
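# For example, this generates (('u2u16', ('i2i32', a)), ('i2i16', a)) for the
# down-cast case and (('i2i32', ('i2i16', 'a@8')), ('i2i32', a)) for the
# up-cast of up-cast case.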
2524for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
2525   if N < M:
2526      # The outer cast is a down-cast.  It doesn't matter what the size of the
2527      # argument of the inner cast is because we'll never be in the upcast
2528      # of downcast case.  Regardless of types, we'll always end up with y2yN
2529      # in the end.
2530      for x, y in itertools.product(['i', 'u'], ['i', 'u']):
2531         x2xN = '{0}2{0}{1}'.format(x, N)
2532         y2yM = '{0}2{0}{1}'.format(y, M)
2533         y2yN = '{0}2{0}{1}'.format(y, N)
2534         optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
2535   elif N > M:
2536      # If the outer cast is an up-cast, we have to be more careful about the
2537      # size of the argument of the inner cast and with types.  In this case,
2538      # the type is always the type of the up-cast, which is given by the
2539      # outer cast.
2540      for P in type_sizes('uint'):
2541         # We can't optimize away up-cast of down-cast.
2542         if M < P:
2543            continue
2544
2545         # Because we're doing up-cast of up-cast, the types always have
2546         # to match between the two casts
2547         for x in ['i', 'u']:
2548            x2xN = '{0}2{0}{1}'.format(x, N)
2549            x2xM = '{0}2{0}{1}'.format(x, M)
2550            aP = 'a@{0}'.format(P)
2551            optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
2552   else:
2553      # The N == M case is handled by other optimizations
2554      pass
2555
2556# Downcast operations should be able to see through pack
2557for t in ['i', 'u']:
2558    for N in [8, 16, 32]:
2559        x2xN = '{0}2{0}{1}'.format(t, N)
2560        optimizations += [
2561            ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
2563        ]
2564
2565# Optimize comparisons with up-casts
2566for t in ['int', 'uint', 'float']:
2567    for N, M in itertools.product(type_sizes(t), repeat=2):
2568        if N == 1 or N >= M:
2569            continue
2570
2571        cond = 'true'
2572        if N == 8:
2573            cond = 'options->support_8bit_alu'
2574        elif N == 16:
2575            cond = 'options->support_16bit_alu'
2576        x2xM = '{0}2{0}{1}'.format(t[0], M)
2577        x2xN = '{0}2{0}{1}'.format(t[0], N)
2578        aN = 'a@' + str(N)
2579        bN = 'b@' + str(N)
2580        xeq = 'feq' if t == 'float' else 'ieq'
2581        xne = 'fneu' if t == 'float' else 'ine'
2582        xge = '{0}ge'.format(t[0])
2583        xlt = '{0}lt'.format(t[0])
2584
2585        # Up-casts are lossless so for correctly signed comparisons of
2586        # up-casted values we can do the comparison at the largest of the two
2587        # original sizes and drop one or both of the casts.  (We have
2588        # optimizations to drop the no-op casts which this may generate.)
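        # For example, for 16-bit unsigned sources this generates
        # (('ieq', ('u2u32', 'a@16'), ('u2u32', 'b@16')),
        #  ('ieq', a, ('u2u16', b)), 'options->support_16bit_alu').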
2589        for P in type_sizes(t):
2590            if P == 1 or P > N:
2591                continue
2592
2593            bP = 'b@' + str(P)
2594            optimizations += [
2595                ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),
2596                ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),
2597                ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),
2598                ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),
2599                ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),
2600                ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),
2601            ]
2602
2603        # The next bit doesn't work on floats because the range checks would
2604        # get way too complicated.
2605        if t in ['int', 'uint']:
2606            if t == 'int':
2607                xN_min = -(1 << (N - 1))
2608                xN_max = (1 << (N - 1)) - 1
2609            elif t == 'uint':
2610                xN_min = 0
2611                xN_max = (1 << N) - 1
2612            else:
2613                assert False
2614
2615            # If we're up-casting and comparing to a constant, we can unfold
2616            # the comparison into a comparison with the shrunk down constant
2617            # and a check that the constant fits in the smaller bit size.
2618            optimizations += [
2619                ((xeq, (x2xM, aN), '#b'),
2620                 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
2621                ((xne, (x2xM, aN), '#b'),
2622                 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
2623                ((xlt, (x2xM, aN), '#b'),
2624                 ('iand', (xlt, xN_min, b),
2625                          ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
2626                ((xlt, '#a', (x2xM, bN)),
2627                 ('iand', (xlt, a, xN_max),
2628                          ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
2629                ((xge, (x2xM, aN), '#b'),
2630                 ('iand', (xge, xN_max, b),
2631                          ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
2632                ((xge, '#a', (x2xM, bN)),
2633                 ('iand', (xge, a, xN_min),
2634                          ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
2635            ]
2636
2637# Convert masking followed by signed downcast to just unsigned downcast
2638optimizations += [
2639    (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),
2640    (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),
2641    (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
2642    (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),
2643    (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
2644    (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
2645]
2646
2647# Some operations such as iadd have the property that the bottom N bits of the
2648# output only depend on the bottom N bits of each of the inputs, so we can
2649# remove casts
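# For example, one rule this generates is
# (('iadd(only_lower_16_bits_used)', ('u2u32', ('u2u16', 'a@32')), b),
#  ('iadd', a, b)).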
2650for N in [16, 32]:
2651    for M in [8, 16]:
2652        if M >= N:
2653            continue
2654
2655        aN = 'a@' + str(N)
2656        u2uM = 'u2u{0}'.format(M)
2657        i2iM = 'i2i{0}'.format(M)
2658
2659        for x in ['u', 'i']:
2660            x2xN = '{0}2{0}{1}'.format(x, N)
2661            extract_xM = 'extract_{0}{1}'.format(x, M)
2662
2663            x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M)
2664            extract_xM_M_bits = \
2665                '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M)
2666            optimizations += [
2667                ((x2xN_M_bits, (u2uM, aN)), a),
2668                ((extract_xM_M_bits, aN, 0), a),
2669            ]
2670
2671            bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M)
2672            optimizations += [
2673                ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)),
2674                ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)),
2675                ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)),
2676            ]
2677
2678            for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:
2679                op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M)
2680                optimizations += [
2681                    ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)),
2682                    ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)),
2683                    ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)),
2684                ]
2685
2686def fexp2i(exp, bits):
2687   # Generate an expression which constructs value 2.0^exp or 0.0.
2688   #
2689   # We assume that exp is already in a valid range:
2690   #
2691   #   * [-15, 15] for 16-bit float
2692   #   * [-127, 127] for 32-bit float
2693   #   * [-1023, 1023] for 64-bit float
2694   #
2695   # If exp is the lowest value in the valid range, a value of 0.0 is
2696   # constructed.  Otherwise, the value 2.0^exp is constructed.
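   # For example, with bits == 32 and exp == 3 this builds
   # (3 + 127) << 23 == 0x41000000, the bit pattern of 8.0f, while exp == -127
   # builds 0x00000000 == 0.0f.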
2697   if bits == 16:
2698      return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
2699   elif bits == 32:
2700      return ('ishl', ('iadd', exp, 127), 23)
2701   elif bits == 64:
2702      return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
2703   else:
2704      assert False
2705
2706def ldexp(f, exp, bits):
2707   # The maximum possible range for a normal exponent is [-126, 127] and,
2708   # throwing in denormals, you get a maximum range of [-149, 127].  This
2709   # means that we can potentially have a swing of +-276.  If you start with
2710   # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
2711   # all the way to zero.  The GLSL spec only requires that we handle a subset
2712   # of this range.  From version 4.60 of the spec:
2713   #
2714   #    "If exp is greater than +128 (single-precision) or +1024
2715   #    (double-precision), the value returned is undefined. If exp is less
2716   #    than -126 (single-precision) or -1022 (double-precision), the value
2717   #    returned may be flushed to zero. Additionally, splitting the value
2718   #    into a significand and exponent using frexp() and then reconstructing
2719   #    a floating-point value using ldexp() should yield the original input
2720   #    for zero and all finite non-denormalized values."
2721   #
2722   # The SPIR-V spec has similar language.
2723   #
2724   # In order to handle the maximum value +128 using the fexp2i() helper
2725   # above, we have to split the exponent in half and do two multiply
2726   # operations.
2727   #
2728   # First, we clamp exp to a reasonable range.  Specifically, we clamp to
2729   # twice the full range that is valid for the fexp2i() function above.  If
2730   # exp/2 is the bottom value of that range, the fexp2i() expression will
2731   # yield 0.0f which, when multiplied by f, will flush it to zero which is
2732   # allowed by the GLSL and SPIR-V specs for low exponent values.  If the
2733   # value is clamped from above, then it must have been above the supported
2734   # range of the GLSL built-in and therefore any return value is acceptable.
2735   if bits == 16:
2736      exp = ('imin', ('imax', exp, -30), 30)
2737   elif bits == 32:
2738      exp = ('imin', ('imax', exp, -254), 254)
2739   elif bits == 64:
2740      exp = ('imin', ('imax', exp, -2046), 2046)
2741   else:
2742      assert False
2743
2744   # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
2745   # (We use ishr which isn't the same for -1, but the -1 case still works
2746   # since we use exp-exp/2 as the second exponent.)  While the spec
2747   # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
2748   # work with denormals and doesn't allow for the full swing in exponents
2749   # that you can get with normalized values.  Instead, we create two powers
2750   # of two and multiply by them each in turn.  That way the effective range
2751   # of our exponent is doubled.
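   # For example, a clamped exp of 254 splits into 127 and 127 (both within
   # fexp2i's valid range), while -254 splits into -127 and -127, where
   # fexp2i(-127, 32) is 0.0 and the result flushes to zero as intended.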
2752   pow2_1 = fexp2i(('ishr', exp, 1), bits)
2753   pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)
2754   return ('fmul', ('fmul', f, pow2_1), pow2_2)
2755
2756optimizations += [
2757   (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
2758   (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
2759   (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
2760]
2761
2762# XCOM 2 (OpenGL) open-codes bitfieldReverse()
2763def bitfield_reverse_xcom2(u):
2764    step1 = ('iadd', ('ishl', u, 16), ('ushr', u, 16))
2765    step2 = ('iadd', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555))
2766    step3 = ('iadd', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333))
2767    step4 = ('iadd', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f))
2768    step5 = ('iadd(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff))
2769
2770    return step5
2771
2772# Unreal Engine 4 demo applications open-code bitfieldReverse()
2773def bitfield_reverse_ue4(u):
2774    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
2775    step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
2776    step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
2777    step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
2778    step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
2779
2780    return step5
2781
2782# Cyberpunk 2077 open-codes bitfieldReverse()
2783def bitfield_reverse_cp2077(u):
2784    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
2785    step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555))
2786    step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333))
2787    step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f))
2788    step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff))
2789
2790    return step5
2791
2792optimizations += [(bitfield_reverse_xcom2('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2793optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2794optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2795
2796# VKD3D-Proton DXBC f32 to f16 conversion implements a float conversion using PackHalf2x16.
2797# Because the spec does not specify a rounding mode or behaviour regarding infinity,
2798# it emits a sequence to ensure D3D-like behaviour for infinity.
2799# When we know the current backend already behaves like we need, we can eliminate the extra sequence.
2800#
2801# Input is f32, output is u32 that has the f16 packed into its low bits.
2802def vkd3d_proton_packed_f2f16_rtz_lo(a, abs_a):
2803    packed_half = ('pack_half_2x16_rtz_split', a, 0)
2804    packed_half_minus1 = ('iadd', packed_half, 0xffffffff)
2805    f32_was_not_inf = ('ine', abs_a, 0x7f800000)
2806    f16_is_now_inf = ('ieq', ('iand', packed_half, 0x7fff), 0x7c00)
2807    return ('bcsel', ('iand', f32_was_not_inf, f16_is_now_inf), packed_half_minus1, packed_half)
2808
2809optimizations += [
2810   (vkd3d_proton_packed_f2f16_rtz_lo('x', ('fabs', 'x')), ('pack_half_2x16_rtz_split', 'x', 0)),
2811   (vkd3d_proton_packed_f2f16_rtz_lo('x(is_not_negative)', 'x'), ('pack_half_2x16_rtz_split', 'x', 0)),
2812   (vkd3d_proton_packed_f2f16_rtz_lo(('fneg', 'x'), ('fabs', 'x')), ('pack_half_2x16_rtz_split', ('fneg', 'x'), 0)),
2813]
2814
2815def vkd3d_proton_msad():
2816   pattern = None
2817   for i in range(4):
2818      ref = ('extract_u8', 'a@32', i)
2819      src = ('extract_u8', 'b@32', i)
2820      sad = ('iabs', ('iadd', ref, ('ineg', src)))
2821      msad = ('bcsel', ('ieq', ref, 0), 0, sad)
2822      if pattern is None:
2823         pattern = msad
2824      else:
2825         pattern = ('iadd', pattern, msad)
2826   pattern = (pattern[0] + '(many-comm-expr)', *pattern[1:])
2827   return pattern
2828
2829optimizations += [
2830   (vkd3d_proton_msad(), ('msad_4x8', a, b, 0), 'options->has_msad'),
2831   (('iadd', ('msad_4x8', a, b, 0), c), ('msad_4x8', a, b, c)),
2832]
2833
2834
2835# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)"
2836# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)"
2837for ncomp in [2, 3, 4, 8, 16]:
2838   optimizations += [
2839      (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)),
2840      (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)),
2841      (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)),
2842      (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)),
2843   ]
2844
2845# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
2846# then the "a == a" is redundant because it's equivalent to "a is not NaN"
2847# and, if a is a NaN then the second comparison will fail anyway.
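# For example, one rule this generates is
# (('iand', ('feq', a, a), ('flt', a, b)), ('!flt', a, b)).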
2848for op in ['flt', 'fge', 'feq']:
2849   optimizations += [
2850      (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),
2851      (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),
2852   ]
2853
2854# Add optimizations to handle the case where the result of a ternary is
2855# compared to a constant.  This way we can take things like
2856#
2857# (a ? 0 : 1) > 0
2858#
2859# and turn it into
2860#
2861# a ? (0 > 0) : (1 > 0)
2862#
2863# which constant folding will eat for lunch.  The resulting ternary will
2864# further get cleaned up by the boolean reductions above and we will be
2865# left with just the original variable "a".
2866for op in ['feq', 'fneu', 'ieq', 'ine']:
2867   optimizations += [
2868      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
2869       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
2870   ]
2871
2872for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']:
2873   optimizations += [
2874      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
2875       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
2876      ((op, '#d', ('bcsel', a, '#b', '#c')),
2877       ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
2878   ]
2879
2880
2881# For example, this converts things like
2882#
2883#    1 + mix(0, a - 1, condition)
2884#
2885# into
2886#
2887#    mix(1, (a-1)+1, condition)
2888#
2889# Other optimizations will rearrange the constants.
2890for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']:
2891   optimizations += [
2892      ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
2893   ]
2894
2895# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
2896# states:
2897#
2898#     If neither layout qualifier is specified, derivatives in compute shaders
2899#     return zero, which is consistent with the handling of built-in texture
2900#     functions like texture() in GLSL 4.50 compute shaders.
2901for op in ['fddx', 'fddx_fine', 'fddx_coarse',
2902           'fddy', 'fddy_fine', 'fddy_coarse']:
2903   optimizations += [
2904      ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->derivative_group == DERIVATIVE_GROUP_NONE')
2905   ]
2906
2907# Some optimizations for ir3-specific instructions.
2908optimizations += [
2909   # 'al * bl': If either 'al' or 'bl' is zero, return zero.
2910   (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
2911   # '(al * bh) << 16 + c': If either 'al' or 'bh' is zero, return 'c'.
2912   (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
2913   (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
2914]
2915
2916# These kinds of sequences can occur after nir_opt_peephole_select.
2917#
2918# NOTE: fadd is not handled here because that gets in the way of ffma
2919# generation in the i965 driver.  Instead, fadd and ffma are handled in
2920# late_optimizations.
2921
2922for op in ['flrp']:
2923    optimizations += [
2924        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2925        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2926        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2927        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2928        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
2929        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
2930    ]
2931
2932for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
2933    optimizations += [
2934        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
2935        (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
2936        (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
2937        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
2938    ]
2939
2940for op in ['fpow']:
2941    optimizations += [
2942        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
2943        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
2944        (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
2945        (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
2946    ]
2947
2948for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fsin_mdg', 'fcos_mdg', 'fsin_agx', 'fneg', 'fabs']:
2949    optimizations += [
2950        (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),
2951    ]
2952
2953for op in ['ineg', 'iabs', 'inot', 'isign']:
2954    optimizations += [
2955        ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))),
2956    ]
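
# fisnormal(x) is true when the biased exponent of x is neither all zeros
# (zero/denormal) nor all ones (Inf/NaN).  A sketch of why the lowerings below
# work, assuming the usual IEEE-754 layouts: 'ishl a, 1' drops the sign bit,
# the iadd constant (a 1 in the exponent's lowest bit position) bumps the
# exponent so that an all-ones exponent wraps to zero, and the ult then
# requires the bumped exponent to be at least 2.  For fp32, "normal" means an
# exponent in [1, 254]; after the shift and add this becomes [2, 255], while
# exponents 0 and 255 map to 1 and 0, so '0x1ffffff < v' (i.e. v >= 2 << 24)
# is exactly the normality test.  The fp16 and fp64 constants follow the same
# pattern for their exponent positions.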
2957
2958optimizations.extend([
2959    (('fisnormal', 'a@16'), ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)), 'options->lower_fisnormal'),
2960    (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal'),
2961    (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal')
2962    ])
2963
2964
2965"""
2966  if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16)
2967     return (val & SIGN_BIT) /* +0.0 or -0.0 as appropriate */;
2968  else
2969     return f2f32(f2f16(val));
2970"""
2971optimizations.extend([
2972    (('fquantize2f16', 'a@32'),
2973     ('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)),
2974               ('iand', a, 1 << 31),
2975               ('!f2f32', ('!f2f16_rtne', a))),
2976     'options->lower_fquantize2f16')
2977    ])
2978
2979for s in range(0, 31):
2980    mask = 0xffffffff << s
2981
2982    # bfi is ((mask & ...) | (~mask & ...)). Since the two sources of the ior
2983    # will never both have the same bits set, replacing the ior with an iadd
2984    # is safe (i.e., a carry out of a bit can never be generated). The iadd is
2985    # more likely to participate in other optimization patterns (e.g., iadd of
2986    # constant reassociation)
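    #
    # For example (an illustration, not an extra rule), with s == 8 the mask
    # is 0xffffff00 and bfi(0xffffff00, a, b) == (a << 8) | (b & 0xff), which
    # equals (a << 8) + (b & 0xff) because the two operands have no set bits
    # in common.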
2987    optimizations.extend([
2988        (('bfi', mask, a, '#b'), ('iadd', ('ishl', a, s), ('iand', b, ~mask)),
2989         'options->avoid_ternary_with_two_constants'),
2990    ])
2991
2992# NaN propagation: Binary opcodes. If any operand is NaN, replace it with NaN.
2993# (unary opcodes with NaN are evaluated by nir_opt_constant_folding, not here)
2994for op in ['fadd', 'fdiv', 'fmod', 'fmul', 'fpow', 'frem', 'fsub']:
2995    optimizations += [((op, '#a(is_nan)', b), NAN)]
2996    optimizations += [((op, a, '#b(is_nan)'), NAN)] # some opcodes are not commutative
2997
2998# NaN propagation: Trinary opcodes. If any operand is NaN, replace it with NaN.
2999for op in ['ffma', 'flrp']:
3000    optimizations += [((op, '#a(is_nan)', b, c), NAN)]
3001    optimizations += [((op, a, '#b(is_nan)', c), NAN)] # some opcodes are not commutative
3002    optimizations += [((op, a, b, '#c(is_nan)'), NAN)]
3003
3004# NaN propagation: FP min/max. Pick the non-NaN operand.
3005for op in ['fmin', 'fmax']:
3006    optimizations += [((op, '#a(is_nan)', b), b)] # commutative
3007
3008# NaN propagation: ldexp is NaN if the first operand is NaN.
3009optimizations += [(('ldexp', '#a(is_nan)', b), NAN)]
3010
3011# NaN propagation: Dot opcodes. If any component is NaN, replace it with NaN.
3012for op in ['fdot2', 'fdot3', 'fdot4', 'fdot5', 'fdot8', 'fdot16']:
3013    optimizations += [((op, '#a(is_any_comp_nan)', b), NAN)] # commutative
3014
3015# NaN propagation: FP comparison opcodes except !=. Replace it with false.
3016for op in ['feq', 'fge', 'flt']:
3017    optimizations += [((op, '#a(is_nan)', b), False)]
3018    optimizations += [((op, a, '#b(is_nan)'), False)] # some opcodes are not commutative
3019
3020# NaN propagation: FP comparison opcodes using !=. Replace it with true.
3021# Operator != is the only opcode where a comparison with NaN returns true.
3022for op in ['fneu']:
3023    optimizations += [((op, '#a(is_nan)', b), True)] # commutative
3024
3025# NaN propagation: FP comparison opcodes except != returning FP 0 or 1.
3026for op in ['seq', 'sge', 'slt']:
3027    optimizations += [((op, '#a(is_nan)', b), 0.0)]
3028    optimizations += [((op, a, '#b(is_nan)'), 0.0)] # some opcodes are not commutative
3029
3030# NaN propagation: FP comparison opcodes using != returning FP 0 or 1.
3031# Operator != is the only opcode where a comparison with NaN returns true.
3032optimizations += [(('sne', '#a(is_nan)', b), 1.0)] # commutative
3033
3034# This section contains optimizations to propagate downsizing conversions of
3035# constructed vectors into vectors of downsized components. Whether this is
3036# useful depends on the SIMD semantics of the backend. On a true SIMD machine,
3037# this reduces the register pressure of the vector itself and often enables the
3038# conversions to be eliminated via other algebraic rules or constant folding.
3039# In the worst case on a SIMD architecture, the propagated conversions may be
3040# revectorized via nir_opt_vectorize so instruction count is minimally
3041# impacted.
3042#
3043# On a machine with only SIMD-within-a-register, this counterintuitively
3044# hurts instruction count. These machines are the same ones that require
3045# vectorize_vec2_16bit, so we predicate the optimizations on that flag not
3046# being set.
3047#
3048# Finally, for scalar architectures, there should be no difference in generated
3049# code since it all ends up scalarized at the end, but it might minimally help
3050# compile-times.
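#
# As an illustration (not an extra rule), for i == 3 and T == 'f' the loop
# below emits roughly:
#
#    (('f2f16', ('vec3', 'a@32', 'b@32', 'c@32')),
#     ('vec3', ('f2f16', 'a'), ('f2f16', 'b'), ('f2f16', 'c')),
#     '!options->vectorize_vec2_16bit')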
3051
3052for i in range(2, 4 + 1):
3053   for T in ('f', 'u', 'i'):
3054      vec_inst = ('vec' + str(i),)
3055
3056      indices = ['a', 'b', 'c', 'd']
3057      suffix_in = tuple((indices[j] + '@32') for j in range(i))
3058
3059      to_16 = '{}2{}16'.format(T, T)
3060      to_mp = '{}2{}mp'.format(T, T)
3061
3062      out_16 = tuple((to_16, indices[j]) for j in range(i))
3063      out_mp = tuple((to_mp, indices[j]) for j in range(i))
3064
3065      optimizations  += [
3066         ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'),
3067      ]
3068      # u2ump doesn't exist, because it's equal to i2imp
3069      if T in ['f', 'i']:
3070          optimizations  += [
3071             ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')
3072          ]
3073
3074# This section contains "late" optimizations that should be run before
3075# creating ffmas and calling regular optimizations for the final time.
3076# Optimizations should go here if they help code generation and conflict
3077# with the regular optimizations.
3078before_ffma_optimizations = [
3079   # Propagate constants down multiplication chains
3080   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
3081   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
3082   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
3083   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),
3084
3085   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
3086   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
3087   (('~fadd', ('fneg', a), a), 0.0),
3088   (('iadd', ('ineg', a), a), 0),
3089   (('iadd', ('ineg', a), ('iadd', a, b)), b),
3090   (('iadd', a, ('iadd', ('ineg', a), b)), b),
3091   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
3092   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
3093
3094   (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
3095   (('~flrp', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
3096   (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
3097]
3098
3099# This section contains "late" optimizations that should be run after the
3100# regular optimizations have finished.  Optimizations should go here if
3101# they help code generation but do not necessarily produce code that is
3102# more easily optimizable.
3103late_optimizations = [
3104   # The rearrangements are fine w.r.t. NaN.  However, they produce incorrect
3105   # results if one operand is +Inf and the other is -Inf.
3106   #
3107   # 1. Inf + -Inf = NaN
3108   # 2. ∀x: x + NaN = NaN and x - NaN = NaN
3109   # 3. ∀x: x != NaN = true
3110   # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false
3111   #
3112   #               a=Inf, b=-Inf   a=-Inf, b=Inf    a=NaN    b=NaN
3113   #  (a+b) < 0        false            false       false    false
3114   #      a < -b       false            false       false    false
3115   # -(a+b) < 0        false            false       false    false
3116   #     -a < b        false            false       false    false
3117   #  (a+b) >= 0       false            false       false    false
3118   #      a >= -b      true             true        false    false
3119   # -(a+b) >= 0       false            false       false    false
3120   #     -a >= b       true             true        false    false
3121   #  (a+b) == 0       false            false       false    false
3122   #      a == -b      true             true        false    false
3123   #  (a+b) != 0       true             true        true     true
3124   #      a != -b      false            false       true     true
3125   (('flt',                        ('fadd(is_used_once)', a, b),  0.0), ('flt',          a, ('fneg', b))),
3126   (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a),         b)),
3127   (('flt', 0.0,                        ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a),         b)),
3128   (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt',          a, ('fneg', b))),
3129   (('~fge',                        ('fadd(is_used_once)', a, b),  0.0), ('fge',          a, ('fneg', b))),
3130   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a),         b)),
3131   (('~fge', 0.0,                        ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a),         b)),
3132   (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge',          a, ('fneg', b))),
3133   (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))),
3134   (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))),
3135
3136   # If either source must be finite, then the original (a+b) cannot produce
3137   # NaN due to Inf-Inf.  The patterns and the replacements produce the same
3138   # result if b is NaN. Therefore, the replacements are exact.
3139   (('fge',                        ('fadd(is_used_once)', 'a(is_finite)', b),  0.0), ('fge',          a, ('fneg', b))),
3140   (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a),         b)),
3141   (('fge', 0.0,                        ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a),         b)),
3142   (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge',          a, ('fneg', b))),
3143   (('feq',  ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq',  a, ('fneg', b))),
3144   (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))),
3145
3146   # This is how SpvOpFOrdNotEqual might be implemented.  Replace it with
3147   # SpvOpLessOrGreater.
3148   *add_fabs_fneg((('iand', ('fneu', 'ma', 'mb'), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', 'ma', 'mb'), ('!flt', 'mb', 'ma'))), {'ma' : a, 'mb' : b}),
3149   (('iand', ('fneu', a, 0.0), ('feq', a, a)), ('!flt', 0.0, ('fabs', a))),
3150
3151   # This is how SpvOpFUnordEqual might be implemented.  Replace it with
3152   # !SpvOpLessOrGreater.
3153   *add_fabs_fneg((('ior', ('feq', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', 'ma', 'mb'), ('!flt', 'mb', 'ma')))), {'ma' : a, 'mb' : b}),
3154   (('ior', ('feq', a, 0.0), ('fneu', a, a)), ('inot', ('!flt', 0.0, ('fabs', a)))),
3155
3156   *add_fabs_fneg((('ior', ('flt', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('fge', 'ma', 'mb'))), {'ma' : a, 'mb' : b}, False),
3157   *add_fabs_fneg((('ior', ('fge', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('flt', 'ma', 'mb'))), {'ma' : a, 'mb' : b}, False),
3158   *add_fabs_fneg((('ior', ('flt', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('inot', ('fge', 'ma', b))), {'ma' : a}),
3159   *add_fabs_fneg((('ior', ('fge', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('inot', ('flt', 'ma', b))), {'ma' : a}),
3160   *add_fabs_fneg((('ior', ('flt', 'a(is_a_number)', 'mb'), ('fneu', b, b)), ('inot', ('fge', a, 'mb'))), {'mb' : b}),
3161   *add_fabs_fneg((('ior', ('fge', 'a(is_a_number)', 'mb'), ('fneu', b, b)), ('inot', ('flt', a, 'mb'))), {'mb' : b}),
3162   *add_fabs_fneg((('iand', ('fneu', 'ma', 'b(is_a_number)'), ('feq', a, a)), ('fneo', 'ma', b), 'options->has_fneo_fcmpu'), {'ma' : a}),
3163   *add_fabs_fneg((('ior', ('feq', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('fequ', 'ma', b), 'options->has_fneo_fcmpu'), {'ma' : a}),
3164
3165   (('ior', ('flt', a, b), ('flt', b, a)), ('fneo', a, b), 'options->has_fneo_fcmpu'),
3166   (('flt', 0.0, ('fabs', a)), ('fneo', 0.0, a), 'options->has_fneo_fcmpu'),
3167
3168
3169   # These don't interfere with the previous optimizations which include this
3170   # in the search expression, because nir_algebraic_impl visits instructions
3171   # in reverse order.
3172   (('ior', ('fneu', 'a@16', a), ('fneu', 'b@16', b)), ('funord', a, b), 'options->has_ford_funord'),
3173   (('iand', ('feq', 'a@16', a), ('feq', 'b@16', b)), ('ford', a, b), 'options->has_ford_funord'),
3174   (('ior', ('fneu', 'a@32', a), ('fneu', 'b@32', b)), ('funord', a, b), 'options->has_ford_funord'),
3175   (('iand', ('feq', 'a@32', a), ('feq', 'b@32', b)), ('ford', a, b), 'options->has_ford_funord'),
3176   (('ior', ('fneu', 'a@64', a), ('fneu', 'b@64', b)), ('funord', a, b), 'options->has_ford_funord'),
3177   (('iand', ('feq', 'a@64', a), ('feq', 'b@64', b)), ('ford', a, b), 'options->has_ford_funord'),
3178
3179   (('inot', ('ford(is_used_once)', a, b)), ('funord', a, b)),
3180   (('inot', ('funord(is_used_once)', a, b)), ('ford', a, b)),
3181   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
3182   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
3183   (('inot', ('fequ(is_used_once)', a, b)), ('fneo', a, b)),
3184   (('inot', ('fneo(is_used_once)', a, b)), ('fequ', a, b)),
3185   (('inot', ('flt(is_used_once)', a, b)), ('fgeu', a, b), 'options->has_fneo_fcmpu'),
3186   (('inot', ('fgeu(is_used_once)', a, b)), ('flt', a, b)),
3187   (('inot', ('fge(is_used_once)', a, b)), ('fltu', a, b), 'options->has_fneo_fcmpu'),
3188   (('inot', ('fltu(is_used_once)', a, b)), ('fge', a, b)),
3189
3190   # nir_lower_to_source_mods will collapse this, but its existence during the
3191   # optimization loop can prevent other optimizations.
3192   (('fneg', ('fneg', a)), a),
3193
3194   # combine imul and iadd to imad
3195   (('iadd@32', ('imul(is_only_used_by_iadd)', a, b), c), ('imad', a, b, c), 'options->has_imad32'),
3196]
3197
3198# re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c
3199# gets combined to fma(a, b, -c).
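# As an illustration (not an extra rule), for sz == 32 and mulz == False the
# first entry generated below is roughly:
#
#    (('~fadd@32', ('fmul(is_only_used_by_fadd)', a, b), c),
#     ('ffma', a, b, c), 'options->fuse_ffma32')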
3200for sz, mulz in itertools.product([16, 32, 64], [False, True]):
3201    # fmulz/ffmaz only for fp32
3202    if mulz and sz != 32:
3203        continue
3204
3205    # Fuse the correct fmul. Only consider fmuls where the only users are fadd
3206    # (or fneg/fabs which are assumed to be propagated away), as a heuristic to
3207    # avoid fusing in cases where it's harmful.
3208    fmul = ('fmulz' if mulz else 'fmul') + '(is_only_used_by_fadd)'
3209    ffma = 'ffmaz' if mulz else 'ffma'
3210
3211    fadd = '~fadd@{}'.format(sz)
3212    option = 'options->fuse_ffma{}'.format(sz)
3213
3214    late_optimizations.extend([
3215        ((fadd, (fmul, a, b), c), (ffma, a, b, c), option),
3216
3217        ((fadd, ('fneg(is_only_used_by_fadd)', (fmul, a, b)), c),
3218         (ffma, ('fneg', a), b, c), option),
3219
3220        ((fadd, ('fabs(is_only_used_by_fadd)', (fmul, a, b)), c),
3221         (ffma, ('fabs', a), ('fabs', b), c), option),
3222
3223        ((fadd, ('fneg(is_only_used_by_fadd)', ('fabs', (fmul, a, b))), c),
3224         (ffma, ('fneg', ('fabs', a)), ('fabs', b), c), option),
3225    ])
3226
3227late_optimizations.extend([
3228   # Subtractions get lowered during optimization, so we need to recombine them
3229   (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
3230   (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
3231   (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
3232   (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'),
3233
3234   (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'),
3235   (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'),
3236   (('ineg', a), ('isub', 0, a), 'options->lower_ineg'),
3237   (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
3238])
3239
3240for s in [8, 16, 32, 64]:
3241   cond = 'options->has_iadd3'
3242   if s == 64:
3243      cond += ' && !(options->lower_int64_options & nir_lower_iadd3_64)'
3244
3245   iadd = "iadd@{}".format(s)
3246
3247   # On Intel GPUs, the constant field for an ADD3 instruction must be either
3248   # int16_t or uint16_t.
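   # That is why the constant sources in the patterns below are restricted
   # with is_16_bits (and, for the shifted forms, is_2x_16_bits and
   # is_neg2x_16_bits).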
3249   late_optimizations.extend([
3250      ((iadd, ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), cond),
3251      ((iadd, ('iadd(is_used_once)', '#a(is_16_bits)',  'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), cond),
3252      ((iadd, ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_16_bits)'),   ('iadd3', a, b, c), cond),
3253      ((iadd, ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond),
3254      ((iadd, ('ineg', ('iadd(is_used_once)', '#a(is_16_bits)',  'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond),
3255      ((iadd, ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), '#c(is_16_bits)'),  ('iadd3', ('ineg', a), ('ineg', b), c), cond),
3256
3257      ((iadd, ('ishl', a, 1), 'b(is_not_const)'), ('iadd3', a, a, b), cond),
3258      ((iadd, ('ishl', a, 1), '#b(is_16_bits)' ), ('iadd3', a, a, b), cond),
3259      ((iadd, ('ineg', ('ishl', a, 1)), 'b(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', a), b), cond),
3260      ((iadd, ('ineg', ('ishl', a, 1)), '#b(is_16_bits)' ), ('iadd3', ('ineg', a), ('ineg', a), b), cond),
3261
3262      # Use special checks to ensure (b+b) or -(b+b) fit in 16 bits.
3263      (('ishl@{}'.format(s), ('iadd', a, '#b(is_2x_16_bits)'), 1), ('iadd3', a, a, ('iadd', b, b)), cond),
3264      (('ishl@{}'.format(s), ('ineg', ('iadd', a, '#b(is_neg2x_16_bits)')), 1), ('iadd3', ('ineg', a), ('ineg', a), ('ineg', ('iadd', b, b))), cond),
3265   ])
3266
3267late_optimizations.extend([
3268    # fneg_lo / fneg_hi
3269   (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'),
3270   (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'),
3271
3272   # These are duplicated from the main optimizations table.  The late
3273   # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
3274   # new patterns like these.  The patterns that compare with zero are removed
3275   # because they are unlikely to be created by anything in
3276   # late_optimizations.
3277   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
3278   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
3279   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
3280   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
3281
3282   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
3283
3284   (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),
3285
3286   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
3287   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
3288   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
3289   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
3290   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
3291   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
3292   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
3293   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
3294   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
3295   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
3296
3297   (('ior', a, a), a),
3298   (('iand', a, a), a),
3299
3300   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
3301
3302   (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'),
3303   (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'),
3304   (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'),
3305   (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
3306
3307   (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
3308
3309   # Approximate handling of fround_even for DX9 addressing from gallium nine on
3310   # DX9-class hardware with no proper fround support.  This is in
3311   # late_optimizations so that the is_integral() opts in the main pass get a
3312   # chance to eliminate the fround_even first.
3313   (('fround_even', a), ('bcsel',
3314                         ('feq', ('ffract', a), 0.5),
3315                         ('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0),
3316                         ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'),
3317
3318   # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this
3319   # particular operation is common for expanding values stored in a texture
3320   # from [0,1] to [-1,1].
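   # For example, flrp(-1.0, 1.0, a) == -1.0*(1.0 - a) + 1.0*a == 2.0*a - 1.0,
   # which is the first ffma below up to rounding (hence the '~'); the other
   # entries are negation variants of the same identity.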
3321   (('~ffma@32', a,  2.0, -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
3322   (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
3323   (('~ffma@32', a, -2.0,  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
3324   (('~ffma@32', a,  2.0,  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
3325   (('~fadd@32', ('fmul(is_used_once)',  2.0, a), -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
3326   (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
3327   (('~fadd@32', ('fmul(is_used_once)', -2.0, a),  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
3328   (('~fadd@32', ('fmul(is_used_once)',  2.0, a),  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
3329
3330    # flrp(a, b, a)
3331    # a*(1-a) + b*a
3332    # a + -a*a + a*b    (1)
3333    # a + a*(b - a)
3334    # Option 1: ffma(a, (b-a), a)
3335    #
3336    # Alternately, after (1):
3337    # a*(1+b) + -a*a
3338    # a*((1+b) + -a)
3339    #
3340    # Let b=1
3341    #
3342    # Option 2: ffma(a, 2, -(a*a))
3343    # Option 3: ffma(a, 2, (-a)*a)
3344    # Option 4: ffma(a, -a, 2*a)
3345    # Option 5: a * (2 - a)
3346    #
3347    # There are a lot of other possible combinations.
3348   (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
3349   (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
3350   (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
3351   (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
3352   (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))),    ('flrp', a, 1.0, a), '!options->lower_flrp32'),
3353
3354   # we do these late so that we don't get in the way of creating ffmas
3355   (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
3356   (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
3357
3358   # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
3359   # op(b, d)) => op(b, bcsel(a, c, d)) transformations.  I do not know why.
3360   (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
3361    ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),
3362
3363   # Things that look like DPH in the source shader may get expanded to
3364   # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
3365   # to NIR.  After FFMA is generated, this can look like:
3366   #
3367   #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
3368   #
3369   # Reassociate the last addition into the first multiplication.
3370   #
3371   # Some shaders do not use 'invariant' in vertex and (possibly) geometry
3372   # shader stages on some outputs that are intended to be invariant.  For
3373   # various reasons, this optimization may not be fully applied in all
3374   # shaders used for different rendering passes of the same geometry.  This
3375   # can result in Z-fighting artifacts (at best).  For now, disable this
3376   # optimization in these stages.  See bugzilla #111490.  In tessellation
3377   # stages applications seem to use 'precise' when necessary, so allow the
3378   # optimization in those stages.
3379   (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
3380    ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3381   (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
3382    ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3383   (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
3384    ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3385
3386   (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
3387    ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3388   (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
3389    ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3390   (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
3391    ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
3392
3393   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
3394   #
3395   #    If bits is zero, the result will be zero.
3396   #
3397   # These prevent the next two lowerings generating incorrect results when
3398   # count is zero.
3399   (('ubfe', a, b, 0), 0),
3400   (('ibfe', a, b, 0), 0),
3401
3402   # On Intel GPUs, BFE is a 3-source instruction.  Like all 3-source
3403   # instructions on Intel GPUs, it cannot have immediate values as
3404   # sources.  There are also limitations on source register strides.  As a
3405   # result, it is very easy for a 3-source instruction combined with either
3406   # loads of immediate values or copies from weird register strides to be
3407   # more expensive than the primitive instructions it represents.
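   # In the ubfe replacement below, the ushr sees '-c' modulo 32, so
   # ushr(0xffffffff, ineg(c)) == 0xffffffff >> (32 - c), i.e. a mask of the
   # low c bits (for the interesting cases 0 < c < 32).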
3408   (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'),
3409
3410   # b is the lowest order bit to be extracted and c is the number of bits to
3411   # extract.  The inner shift removes the bits above b + c by shifting left
3412   # 32 - (b + c).  ishl only sees the low 5 bits of the shift count, which is
3413   # -(b + c).  The outer shift moves the bit that was at b to bit zero.
3414   # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c.
3415   # This means that it must be shifted right by 32 - c or -c bits.
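   # As a worked example (illustrative only): with b == 8 and c == 4, the
   # inner ishl count is -(8 + 4), i.e. 20 after masking, which moves the
   # field's top bit (bit 11) up to bit 31; the outer ishr count is -4, i.e.
   # 28 after masking, which sign-extends the 4-bit field back down to bits
   # 3..0.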
3416   (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'),
3417
3418   # Clean up no-op shifts that may result from the bfe lowerings.
3419   (('ishl', a, 0), a),
3420   (('ishl', a, -32), a),
3421   (('ishr', a, 0), a),
3422   (('ishr', a, -32), a),
3423   (('ushr', a, 0), a),
3424
3425   (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)),
3426   (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)),
3427   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
3428   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
3429
3430   # open coded bit test
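   # For example, ine(iand(a, 0x10), 0) becomes bitnz(a, 4), since
   # find_lsb(0x10) == 4.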
3431   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'),
3432   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'),
3433   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'),
3434   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'),
3435   (('ine', ('iand', a, ('ishl', 1, b)), 0), ('bitnz', a, b), 'options->has_bit_test'),
3436   (('ieq', ('iand', a, ('ishl', 1, b)), 0), ('bitz', a, b), 'options->has_bit_test'),
3437   (('ine', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitz', a, b), 'options->has_bit_test'),
3438   (('ieq', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitnz', a, b), 'options->has_bit_test'),
3439   (('bitz', ('ushr', a, b), 0), ('bitz', a, b)),
3440   (('bitz', ('ishr', a, b), 0), ('bitz', a, b)),
3441   (('bitnz', ('ushr', a, b), 0), ('bitnz', a, b)),
3442   (('bitnz', ('ishr', a, b), 0), ('bitnz', a, b)),
3443   (('ine', ('ubfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'),
3444   (('ieq', ('ubfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'),
3445   (('ine', ('ubfe', a, b, 1), 1), ('bitz', a, b), 'options->has_bit_test'),
3446   (('ieq', ('ubfe', a, b, 1), 1), ('bitnz', a, b), 'options->has_bit_test'),
3447   (('ine', ('ibfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'),
3448   (('ieq', ('ibfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'),
3449   (('ine', ('ibfe', a, b, 1), -1), ('bitz', a, b), 'options->has_bit_test'),
3450   (('ieq', ('ibfe', a, b, 1), -1), ('bitnz', a, b), 'options->has_bit_test'),
3451   (('inot', ('bitnz', a, b)), ('bitz', a, b)),
3452   (('inot', ('bitz', a, b)), ('bitnz', a, b)),
3453   (('bitnz', ('inot', a), b), ('bitz', a, b)),
3454   (('bitz', ('inot', a), b), ('bitnz', a, b)),
3455])
3456
3457# A few more extract cases we'd rather leave late
3458for N in [16, 32]:
3459    aN = 'a@{0}'.format(N)
3462
3463    for x in ['u', 'i']:
3464        x2xN = '{0}2{0}{1}'.format(x, N)
3465        extract_x8 = 'extract_{0}8'.format(x)
3466        extract_x16 = 'extract_{0}16'.format(x)
3467
3468        late_optimizations.extend([
3469            ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
3470            ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
3471        ])
3472
3473        if N > 16:
3474            late_optimizations.extend([
3475                ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
3476                ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
3477            ])
3478
3479# Byte insertion
3480late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
3481late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
3482late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte'))
3483
3484late_optimizations += [
3485   # Word insertion
3486   (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'),
3487
3488   # Extract and then insert
3489   (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)),
3490   (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)),
3491]
3492
3493# Float sizes
3494for s in [16, 32, 64]:
3495    late_optimizations.extend([
3496       (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)),
3497       (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))),
3498    ])
3499
3500for op in ['fadd']:
3501    late_optimizations += [
3502        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
3503        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
3504    ]
3505
3506for op in ['ffma', 'ffmaz']:
3507    late_optimizations += [
3508        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
3509        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
3510
3511        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
3512        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
3513    ]
3514
3515# mediump: If an opcode is surrounded by conversions, remove the conversions.
3516# The rationale is that type conversions + the low precision opcode are more
3517# expensive than the same arithmetic opcode at higher precision.
3518#
3519# This must be done in late optimizations, because we need normal optimizations to
3520# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).
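# As an illustration, the unary rules below turn f2f32(fabs(f2fmp(a))) back
# into fabs(a).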
3521#
3522# Unary opcodes
3523for op in ['fabs', 'fceil', 'fcos', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',
3524           'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']:
3525    late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))]
3526
3527# Binary opcodes
3528for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']:
3529    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))]
3530
3531# Ternary opcodes
3532for op in ['ffma', 'flrp']:
3533    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))]
3534
3535# Comparison opcodes
3536for op in ['feq', 'fge', 'flt', 'fneu']:
3537    late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))]
3538
3539# Do this last, so that the f2fmp patterns above have effect.
3540late_optimizations += [
3541  # Convert *2*mp instructions to concrete *2*16 instructions. At this point
3542  # any conversions that could have been removed will have been removed in
3543  # nir_opt_algebraic so any remaining ones are required.
3544  (('f2fmp', a), ('f2f16', a), "!options->preserve_mediump"),
3545  (('f2imp', a), ('f2i16', a), "!options->preserve_mediump"),
3546  (('f2ump', a), ('f2u16', a), "!options->preserve_mediump"),
3547  (('i2imp', a), ('i2i16', a), "!options->preserve_mediump"),
3548  (('i2fmp', a), ('i2f16', a), "!options->preserve_mediump"),
3549  (('i2imp', a), ('u2u16', a), "!options->preserve_mediump"),
3550  (('u2fmp', a), ('u2f16', a), "!options->preserve_mediump"),
3551  (('fisfinite', a), ('flt', ('fabs', a), float("inf"))),
3552
3553  (('f2f16', a), ('f2f16_rtz', a), "options->force_f2f16_rtz && !nir_is_rounding_mode_rtne(info->float_controls_execution_mode, 16)"),
3554
3555  (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
3556  (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
3557  (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
3558  (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
3559
3560  (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"),
3561  (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"),
3562  (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"),
3563  (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"),
3564
3565  (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
3566  (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
3567  (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
3568  (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
3569]
3570
3571distribute_src_mods = [
3572   # Try to remove some spurious negations rather than pushing them down.
3573   (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
3574   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
3575   (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),
3576   (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),
3577   (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)),
3578   (('fneg', ('fneg', a)), a),
3579
3580   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
3581   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),
3582
3583   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
3584   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
3585   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
3586
3587   # Note that fmin <-> fmax.  I don't think there is a way to distribute
3588   # fabs() into fmin or fmax.
3589   (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
3590   (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),
3591
3592   (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)),
3593   (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)),
3594   (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)),
3595
3596   # fdph works mostly like fdot, but to get the correct result, the negation
3597   # must be applied to the second source.
3598   (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),
3599
3600   (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
3601   (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
3602]
3603
3604before_lower_int64_optimizations = [
3605    # The i2i64(a) implies that 'a' has at most 32 bits of data.
3606    (('ishl', ('i2i64', a), b),
3607     # Effective shift count of zero, just return 'a'.
3608     ('bcsel', ('ieq', ('iand', b, 63), 0), ('i2i64', a),
3609      ('bcsel', ('ilt', ('iand', b, 63), 32),
3610       # Shifting less than 32 bits, so both 32-bit halves will have
3611       # some data. These shift counts (and those in the else case) apply to
3612       # 32-bit values, so they are implicitly modulo 32.
3613       ('pack_64_2x32_split', ('ishl', ('i2i32', a), b), ('ishr', ('i2i32', a),          ('iadd', ('ineg', b), 32) )),
3614       # Shifting 32 bits or more, so lower 32 bits must be zero.
3615       ('pack_64_2x32_split', 0                        , ('ishl', ('i2i32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
3616     '(options->lower_int64_options & nir_lower_shift64) != 0'),
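    # Illustrative readings of the rule above (not extra rules): for b == 8
    # the halves are (i2i32(a) << 8, i2i32(a) >> 24); for b == 40, b & 63 is
    # at least 32, so the low half is 0 and the high half is
    # i2i32(a) << iabs(32 - 40), i.e. i2i32(a) << 8.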
3617
3618    (('ishl', ('u2u64', a), b),
3619     ('bcsel', ('ieq', ('iand', b, 63), 0), ('u2u64', a),
3620      ('bcsel', ('ilt', ('iand', b, 63), 32),
3621       ('pack_64_2x32_split', ('ishl', ('u2u32', a), b), ('ushr', ('u2u32', a),          ('iadd', ('ineg', b), 32) )),
3622       ('pack_64_2x32_split', 0                        , ('ishl', ('u2u32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
3623     '(options->lower_int64_options & nir_lower_shift64) != 0'),
3624
3625    # If ineg64 is lowered, then the negation is not free. Try to eliminate
3626    # some of the negations.
3627    (('iadd@64', ('ineg', a), ('ineg(is_used_once)', b)), ('isub', ('ineg', a), b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
3628    (('iadd@64', a, ('ineg', b)), ('isub', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
3629    (('isub@64', a, ('ineg', b)), ('iadd', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
3630    (('isub@64', ('ineg', a), ('ineg', b)), ('isub', b, a), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
3631
3632    (('imul@64', ('ineg', a), ('ineg', b)), ('imul', a, b)),
3633    (('idiv@64', ('ineg', a), ('ineg', b)), ('idiv', a, b)),
3634
3635    # If the hardware can do int64, the shift is the same cost as the add. It
3636    # should be fine to do this transformation unconditionally.
3637    (('iadd', ('i2i64', a), ('i2i64', a)), ('ishl', ('i2i64', a), 1)),
3638    (('iadd', ('u2u64', a), ('u2u64', a)), ('ishl', ('u2u64', a), 1)),
3639]
3640
3641parser = argparse.ArgumentParser()
3642parser.add_argument('--out', required=True)
3643args = parser.parse_args()
3644
3645with open(args.out, "w", encoding='utf-8') as f:
3646    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
3647    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
3648                                        before_ffma_optimizations).render())
3649    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_lower_int64",
3650                                        before_lower_int64_optimizations).render())
3651    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
3652                                        late_optimizations).render())
3653    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
3654                                        distribute_src_mods).render())
3655