# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

import argparse
from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi
import math

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'
NAN = math.nan

has_fmulz = '(options->has_fmulz || \
             (options->has_fmulz_no_denorms && \
              !nir_is_denorm_preserve(info->float_controls_execution_mode, 32)))'

ignore_exact = nir_algebraic.ignore_exact

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value.  An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value.  A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact.  Such operations will only get
# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
# "#" indicates that the given variable will only match constants,
# type indicates that the given variable will only match values from ALU
#    instructions with the given output type,
# (cond) specifies an additional condition function (see nir_search_helpers.h),
# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified.  For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size.  In the search half of the expression this indicates that it
# should only match that particular bit-size.  In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
#
# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle.  If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").
#
# Another set of special "conditions" are:
# "nsz": sign of zero is not preserved
# "ninf": infinities are not preserved
# "nnan": nan is not preserved
# These relate to the float controls/fpfastmath and are more descriptions of
# the expression than conditions.  That is, an expression with the "nsz"
# condition means that the replacement expression won't preserve the sign of
# zero of the result, and so it will be skipped if the matching instruction
# has the 'signed_zero_preserve' flag set.
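# A purely illustrative example of the syntax above (it is not one of the
# rules in this file): a tuple such as
#
#    (('~fadd@32', ('fmul', a, '#b'), c), ('ffma', a, b, c), 'condition')
#
# would match only inexact 32-bit fadds whose first source is an fmul with a
# constant second source, and would replace them with an ffma, but only when
# the C expression given as the optional third element (here the placeholder
# 'condition', typically a test of an options-> flag) evaluates to true.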

# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
def lowered_sincos(c):
   x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
   x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
   return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)

def intBitsToFloat(i):
   return struct.unpack('!f', struct.pack('!I', i))[0]

# Takes a pattern as input and returns a list of patterns where each
# pattern has a different permutation of fneg/fabs(value) as the replacement
# for the key operands in replacements.
def add_fabs_fneg(pattern, replacements, commutative = True):
   def to_list(pattern):
      return [to_list(i) if isinstance(i, tuple) else i for i in pattern]

   def to_tuple(pattern):
      return tuple(to_tuple(i) if isinstance(i, list) else i for i in pattern)

   def replace_variable(pattern, search, replace):
      for i in range(len(pattern)):
         if pattern[i] == search:
            pattern[i] = replace
         elif isinstance(pattern[i], list):
            replace_variable(pattern[i], search, replace)

   if commutative:
      perms = itertools.combinations_with_replacement(range(4), len(replacements))
   else:
      perms = itertools.product(range(4), repeat=len(replacements))

   result = []

   for perm in perms:
      curr = to_list(pattern)

      for i, (search, base) in enumerate(replacements.items()):
         if perm[i] == 0:
            replace = ['fneg', ['fabs', base]]
         elif perm[i] == 1:
            replace = ['fabs', base]
         elif perm[i] == 2:
            replace = ['fneg', base]
         elif perm[i] == 3:
            replace = base

         replace_variable(curr, search, replace)

      result.append(to_tuple(curr))
   return result

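# A sketch of what add_fabs_fneg produces, for illustration only: with a
# single key such as {'ma': a}, a call like
#
#    add_fabs_fneg((('fsat', 'ma'), ('fsat', 'ma')), {'ma': a})
#
# returns four (search, replace) pairs in which 'ma' is replaced in turn by
# ('fneg', ('fabs', a)), ('fabs', a), ('fneg', a), and plain a.  With several
# keys, commutative=True keeps only one ordering of each combination, while
# commutative=False enumerates every permutation.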

optimizations = [
   # These will be recreated by late_algebraic if supported.
   # Lowering here means we don't have to duplicate all other optimization patterns.
   (('fgeu', a, b), ('inot', ('flt', a, b))),
   (('fltu', a, b), ('inot', ('fge', a, b))),
   (('fneo', 0.0, a), ('flt', 0.0, ('fabs', a))),
   (('fequ', 0.0, a), ('inot', ('flt', 0.0, ('fabs', a)))),


   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'),
   (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'),
   (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'),
   (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'),
   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),

   (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))),
    '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'),

   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('udiv', a, 1), a),
   (('idiv', a, 1), a),
   (('umod', a, 1), 0),
   (('imod', a, 1), 0),
   (('imod', a, -1), 0),
   (('irem', a, 1), 0),
   (('irem', a, -1), 0),
   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
   # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
   (('irem', a, '#b(is_pos_power_of_two)'),
    ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
    '!options->lower_bitops'),
   (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),

   (('~fmul', ('fsign', a), ('ffloor', ('fadd', ('fabs', a), 0.5))), ('ftrunc', ('fadd', a, ('fmul', ('fsign', a), 0.5))), '!options->lower_ftrunc || options->lower_ffloor'),

   (('~fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('fabs', ('u2f', a)), ('u2f', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
   (('~fadd', a, 0.0), a),
   # a+0.0 is 'a' unless 'a' is denormal or -0.0.  If it's only used by a
   # floating point instruction, they should flush any input denormals and we
   # can replace -0.0 with 0.0 if the float execution mode allows it.
   (('fadd(is_only_used_as_float,nsz)', 'a', 0.0), a),
   (('iadd', a, 0), a),
   (('iadd_sat', a, 0), a),
   (('isub_sat', a, 0), a),
   (('uadd_sat', a, 0), a),
   (('usub_sat', a, 0), a),
   (('usadd_4x8_vc4', a, 0), a),
   (('usadd_4x8_vc4', a, ~0), ~0),
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
   (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   (('iadd', ('ishl', b, a), ('ishl', c, a)), ('ishl', ('iadd', b, c), a)),
   (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
   (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
   (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ine', ('iand', a, b), 0)),
   (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ieq', ('iand', a, b), 0)),
   (('ieq', ('ushr(is_used_once)', a, '#b'), 0), ('ult', a, ('ishl', 1, b))),
   (('ine', ('ushr(is_used_once)', a, '#b'), 0), ('uge', a, ('ishl', 1, b))),
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
   (('~fmul', a, 0.0), 0.0),
   # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
   (('fmul(nsz,nnan)', 'a', 0.0), 0.0),
   (('fmulz', a, 0.0), 0.0),
   (('fmulz(nsz)', a, 'b(is_finite_not_zero)'), ('fmul', a, b)),
   (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
   (('fmulz', a, a), ('fmul', a, a)),
   (('ffmaz(nsz)', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c)),
   (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
   (('ffmaz', a, a, b), ('ffma', a, a, b)),
   (('imul', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, ~0), a),
   (('~fmul', a, 1.0), a),
   (('~fmulz', a, 1.0), a),
   # The only effect a*1.0 can have is flushing denormals.  If it's only used by
   # a floating point instruction, they should flush any input denormals and
   # this multiplication isn't needed.
   (('fmul(is_only_used_as_float)', a, 1.0), a),
   (('imul', a, 1), a),
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
   # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
   (('~ffma', 0.0, a, b), b),
   (('ffma(is_only_used_as_float,nsz,nnan,ninf)', 0.0, a, b), b),
   (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma(nsz)', a, b, 0.0), ('fmul', a, b)),
   (('ffmaz(nsz)', a, b, 0.0), ('fmulz', a, b)),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
   (('ffmaz(nsz)', 1.0, a, b), ('fadd', a, b)),
   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('ffmaz(nsz)', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
   (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
   (('~flrp', a, b, 0.0), a),
   (('~flrp', a, b, 1.0), b),
   (('~flrp', a, a, b), a),
   (('~flrp', 0.0, a, b), ('fmul', a, b)),

   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),

   (('sdot_4x8_iadd', a, 0, b), b),
   (('udot_4x8_uadd', a, 0, b), b),
   (('sdot_4x8_iadd_sat', a, 0, b), b),
   (('udot_4x8_uadd_sat', a, 0, b), b),
   (('sdot_2x16_iadd', a, 0, b), b),
   (('udot_2x16_uadd', a, 0, b), b),
   (('sdot_2x16_iadd_sat', a, 0, b), b),
   (('udot_2x16_uadd_sat', a, 0, b), b),

   # sudot_4x8_iadd is not commutative at all, so the patterns must be
   # duplicated with zeros on each of the first positions.
   (('sudot_4x8_iadd', a, 0, b), b),
   (('sudot_4x8_iadd', 0, a, b), b),
   (('sudot_4x8_iadd_sat', a, 0, b), b),
   (('sudot_4x8_iadd_sat', 0, a, b), b),

   (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
   (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),

   # Try to let constant folding eliminate the dot-product part.  These are
   # safe because the dot product cannot overflow 32 bits.
   (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
   (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
   (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
   (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
   (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
   (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
   (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
   (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
   (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
   (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),

   # Optimize open-coded fmulz.
   # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b)
   *add_fabs_fneg((('fmul@32(nsz)', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, 'ma'), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, 'mb')),
                   ('fmulz', 'ma', 'mb'), has_fmulz), {'ma' : a, 'mb' : b}),
   *add_fabs_fneg((('fmul@32(nsz)', 'ma', ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')),
                   ('fmulz', 'ma', b), has_fmulz), {'ma' : a}),

   # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c)
   *add_fabs_fneg((('ffma@32(nsz)', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, 'ma'), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, 'mb'), c),
                   ('ffmaz', 'ma', 'mb', c), has_fmulz), {'ma' : a, 'mb' : b}),
   *add_fabs_fneg((('ffma@32(nsz)', 'ma', ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c),
                   ('ffmaz', 'ma', b, c), has_fmulz), {'ma' : a}),

   # b == 0.0 ? 1.0 : fexp2(fmul(a, b)) -> fexp2(fmulz(a, b))
   *add_fabs_fneg((('bcsel(nsz,nnan,ninf)', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmul@32', a, 'mb'))),
                   ('fexp2', ('fmulz', a, 'mb')),
                   has_fmulz), {'mb': b}),
   *add_fabs_fneg((('bcsel', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmulz', a, 'mb'))),
                   ('fexp2', ('fmulz', a, 'mb'))), {'mb': b}),
]

# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions.
sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
                                 ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
                        ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
                                 ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
                                 ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
                        ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
                                 ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
                                  ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
                         ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
                                  ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
                         ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
                         ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))

optimizations.extend([
   (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
   (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
   (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
   (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),

   # For the unsigned dot-product, the largest possible value 4*(255*255) =
   # 0x3f804, so we don't have to worry about that intermediate result
   # overflowing.  0x100000000 - 0x3f804 = 0xfffc07fc.  If c is a constant
   # that is less than 0xfffc07fc, then the result cannot overflow ever.
   (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
   (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->has_udot_4x8_sat'),

   # For the signed dot-product, the largest positive value is 4*(-128*-128) =
   # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00.  We
   # don't have to worry about that intermediate result overflowing or
   # underflowing.
   (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->has_sdot_4x8_sat'),

   (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->has_sudot_4x8_sat'),

   (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
])

# Float sizes
for s in [16, 32, 64]:
   optimizations.extend([
      (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

      (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
      (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
      (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),

      (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
      # These are the same as the previous three rules, but it depends on
      # 1-fsat(x) <=> fsat(1-x).  See below.
      (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
      (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),

      (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
      (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

      (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),
      (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),

      # These two aren't flrp lowerings, but do appear in some shaders.
      (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
      (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),

      # 1 - ((1 - a) * (1 - b))
      # 1 - (1 - a - b + a*b)
      # 1 - 1 + a + b - a*b
      # a + b - a*b
      # a + b*(1 - a)
      # b*(1 - a) + 1*a
      # flrp(b, 1, a)
      (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
   ])
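# Informational note on the flrp rules above: NIR defines
# flrp(a, b, c) = a*(1 - c) + b*c, which is algebraically the same as
# a + c*(b - a).  The lowering and recognition patterns in the loop above are
# these two spellings, with fsat(c) substituted for c where the comments note
# that 1 - fsat(x) == fsat(1 - x) makes the substitution valid.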

optimizations.extend([
   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),

   (('ftrunc@16', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ftrunc@32', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ftrunc@64', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))),
    '(options->lower_ftrunc || (options->lower_doubles_options & nir_lower_dtrunc)) && !(options->lower_doubles_options & nir_lower_dfloor)'),

   (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'),
   (('fadd@16', a, ('fadd@16', b, ('fneg', ('ffract', a)))), ('fadd@16', b, ('ffloor', a)), '!options->lower_ffloor'),
   (('fadd@32', a, ('fadd@32', b, ('fneg', ('ffract', a)))), ('fadd@32', b, ('ffloor', a)), '!options->lower_ffloor'),
   (('fadd@64', a, ('fadd@64', b, ('fneg', ('ffract', a)))), ('fadd@64', b, ('ffloor', a)), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('fadd@16', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@32', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@64', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@64', a), ('fsub', a, ('ffloor', a)),
    '(options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)) && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),

   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

   (('fdot4', a, 0.0), 0.0),
   (('fdot3', a, 0.0), 0.0),
   (('fdot2', a, 0.0), 0.0),

   (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),

   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

   # Lower fdot to fsum when it is available
   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
])

# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
# bits of the second source.  These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
for s in [8, 16, 32, 64]:
   mask = s - 1

   ishl = "ishl@{}".format(s)
   ishr = "ishr@{}".format(s)
   ushr = "ushr@{}".format(s)

   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

   optimizations.extend([
      ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
      ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),

      # To get -1 for large shifts of negative values, ishr must instead
      # clamp the shift count to the maximum value.
      ((ishr, (ishr, a, '#b'), '#c'),
       (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
   ])
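# Worked example of the wrap-around case handled above, for illustration: on
# 32-bit values, ishl(ishl(a, 20), 20) must be 0 because a is shifted left by
# 40 bits in total, but a naive ishl(a, 40) would use only the low five bits
# of the count and shift by just 8.  The bcsel on in_bounds (and the imin
# clamp for ishr) keeps the combined shift correct in such cases.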

# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4.  This can be turned into an iand and the
# additions before can be reassociated to CSE the iand instruction.

for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):
   a_sz = 'a@{}'.format(size)

   optimizations.extend([
      # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
      (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
      (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),

      # This does not trivially work with ishr.
      (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
   ])
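# Worked example for the loop above, for illustration: with a 32-bit source
# and b == 2, ishl(ushr(a, 2), 2) clears the two low bits of a, which is the
# same as iand(a, ishl(0xffffffff, 2)) == iand(a, 0xfffffffc), exactly the
# divide-by-4-then-multiply-by-4 addressing pattern described above.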

optimizations.extend([
   (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)),
   (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)),
])

for log2 in range(1, 7): # powers of two from 2 to 64
   v = 1 << log2
   mask = 0xffffffff & ~(v - 1)
   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

   optimizations.extend([
      # Reassociate for improved CSE
      (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
   ])

# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32).  In addition, a couple rules inside the
# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
# resolution.
for i in [1, 2, 16, 24]:
   lo_mask = 0xffffffff >> i
   hi_mask = (0xffffffff << i) & 0xffffffff

   optimizations.extend([
      # This pattern seems to only help in the soft-fp64 code.
      (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
#     (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
#     (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),

      (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
      (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
#     (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
   ])

optimizations.extend([
   # This is common for address calculations.  Reassociating may enable the
   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
   # instruction or a constant offset field for load / store instructions.
   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),

   # (a + #b) * #c => (a * #c) + (#b * #c)
   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),

   # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)
   (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),
   (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),

   # Comparison simplifications
   (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),
   (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),
   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
   (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),
   (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),
   (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),
   (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),
   (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),
   (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),
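   # Informational note on the is_a_number requirements above: for a NaN
   # source, inot(flt(a, b)) is true while fge(a, b) is false, so the flt/fge
   # pair can only be inverted when neither source can be NaN.  The feq/fneu
   # pair and the integer comparisons are exact complements, so they need no
   # such restriction.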

   (('iand', ('feq', a, b), ('fneu', a, b)), False),
   (('iand', ('flt', a, b), ('flt', b, a)), False),
   (('iand', ('ieq', a, b), ('ine', a, b)), False),
   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
   (('iand', ('ult', a, b), ('ult', b, a)), False),

   # This helps some shaders because, after some optimizations, they end up
   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
   # matching would be handled by CSE.
   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', 'a(is_not_const)'), '#b'), ('flt', ('fneg', b), a)),
   (('flt', '#b', ('fneg', 'a(is_not_const)')), ('flt', a, ('fneg', b))),
   (('fge', ('fneg', 'a(is_not_const)'), '#b'), ('fge', ('fneg', b), a)),
   (('fge', '#b', ('fneg', 'a(is_not_const)')), ('fge', a, ('fneg', b))),
   (('fneu', ('fneg', 'a(is_not_const)'), '#b'), ('fneu', ('fneg', b), a)),
   (('feq', '#b', ('fneg', 'a(is_not_const)')), ('feq', a, ('fneg', b))),
   (('flt', a, '#b(is_negative_zero)'), ('flt', a, 0.0)),
   (('flt', '#b(is_negative_zero)', a), ('flt', 0.0, a)),
   (('fge', a, '#b(is_negative_zero)'), ('fge', a, 0.0)),
   (('fge', '#b(is_negative_zero)', a), ('fge', 0.0, a)),
   (('fneu', a, '#b(is_negative_zero)'), ('fneu', 0.0, a)),
   (('feq', '#b(is_negative_zero)', a), ('feq', a, 0.0)),

   (('ieq', ('ineg', a), 0), ('ieq', a, 0)),
   (('ine', ('ineg', a), 0), ('ine', a, 0)),
   (('ieq', ('iabs', a), 0), ('ieq', a, 0)),
   (('ine', ('iabs', a), 0), ('ine', a, 0)),
   (('fneu', ('fabs', a), 0.0), ('fneu', a, 0.0)),
   (('feq', ('fabs', a), 0.0), ('feq', a, 0.0)),
   (('fneu', ('fabs', a), ('fabs', a)), ('fneu', a, a)),
   (('feq', ('fabs', a), ('fabs', a)), ('feq', a, a)),

   # b < fsat(NaN) -> b < 0 -> false, and b < NaN -> false.
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),

   # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),

   # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),

   # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.
   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

   # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.
   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)),
   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0), 0.0), ('iand', a, b)),
   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fneu', ('b2f', 'a@1'), ('b2f', 'b@1')), ('ixor', a, b)),
   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq', ('b2f', 'a@1'), ('b2f', 'b@1')), ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f.  This just
   # breaks that apart.
   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
    ('ior', ('flt', c, 0.0), ('ior', a, b))),

   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
   (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
   (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
   (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
   (('~fneu', ('fadd(is_used_once)', a, '#b'), '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),

   # Cannot remove the addition from ilt or ige due to overflow.
   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
   (('ine', ('iadd', a, b), a), ('ine', b, 0)),
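   # Informational counterexample for the comment above: with 32-bit
   # a = INT32_MAX and b = 1, a + b wraps to INT32_MIN, so ilt(a + b, a) is
   # true even though ilt(b, 0) is false.  Wrap-around does not affect
   # equality, which is why only the ieq/ine forms are safe.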

   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),
   (('fneu', ('b2f', 'a@1'), 0.0), a),
   (('flt', 0.0, ('b2f', 'a@1')), a),
   (('ieq', ('b2i', 'a@1'), 0), ('inot', a)),
   (('ine', ('b2i', 'a@1'), 0), a),
   (('ieq', 'a@1', False), ('inot', a)),
   (('ieq', 'a@1', True), a),
   (('ine', 'a@1', False), a),
   (('ine', 'a@1', True), ('inot', a)),

   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('u2f', a), 0.0), True),
   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)),    # ieq instead?
   (('flt', ('u2f', a), 0.0), False),
   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)),    # ine instead?
   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

   # 0.0 < fabs(a)
   # fabs(a) > 0.0
   # fabs(a) != 0.0 because fabs(a) must be >= 0
   # a != 0.0
   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),

   # -fabs(a) < 0.0
   # fabs(a) > 0.0
   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),

   # 0.0 >= fabs(a)
   # 0.0 == fabs(a) because fabs(a) must be >= 0
   # 0.0 == a
   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

   # -fabs(a) >= 0.0
   # 0.0 >= fabs(a)
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
   #
   # This should be NaN safe.
   #
   # NaN >= 0 && 1 >= NaN -> false && false -> false
   #
   # vs.
   #
   # NaN == fsat(NaN) -> NaN == 0 -> false
   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),

   # Note: fmin(-a, -b) == -fmax(a, b)
   (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))),
   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),
   (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),

   # fmin(b2f(a), b)
   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
   #
   # Since b is a constant, constant folding will eliminate the fmin and the
   # fmax.  If b is > 1.0, the bcsel will be replaced with a b2f.
   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),

   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),

   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, True, b), ('ior', a, b)),
   (('bcsel', a, a, b), ('ior', a, b)),
   (('bcsel', a, b, False), ('iand', a, b)),
   (('bcsel', a, b, a), ('iand', a, b)),
   (('~fmin', a, a), a),
   (('~fmax', a, a), a),
   (('imin', a, a), a),
   (('imax', a, a), a),
   (('umin', a, a), a),
   (('umin', a, 0), 0),
   (('umin', a, -1), a),
   (('umax', a, a), a),
   (('umax', a, 0), a),
   (('umax', a, -1), -1),
   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
   (('umax', ('umax', a, b), b), ('umax', a, b)),
   (('imax', ('imax', a, b), b), ('imax', a, b)),
   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
   (('umin', ('umin', a, b), b), ('umin', a, b)),
   (('imin', ('imin', a, b), b), ('imin', a, b)),
   (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)),
   (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)),
   (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)),
   (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)),
   (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)),
   (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)),
])

for N in [8, 16, 32, 64]:
   b2iN = 'b2i{0}'.format(N)
   optimizations.extend([
      (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)),
      (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)),
   ])

for N in [16, 32, 64]:
   b2fN = 'b2f{0}'.format(N)
   optimizations.extend([
      (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)),
      (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)),
   ])

# Integer sizes
for s in [8, 16, 32, 64]:
   optimizations.extend([
      (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),

      # Simplify logic to detect sign of an integer.
      (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ige', a, 0)),
      (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),
      (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ilt', a, 0)),
      (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),
      (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
      (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
      (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),
      (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),
      (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
      (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
      (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),
      (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),
   ])
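# Informational example of the sign-detection rules above: for s == 32 they
# rewrite tests such as (a & 0x80000000) != 0, (a >> 31) == 1 with a logical
# shift, or (a >> 31) == -1 with an arithmetic shift into the single
# comparison a < 0, and the complementary forms into a >= 0.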

optimizations.extend([
   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
   (('~fmin', a, ('fabs', a)), a),
   (('imin', a, ('iabs', a)), a),
   (('~fmax', a, ('fneg', ('fabs', a))), a),
   (('imax', a, ('ineg', ('iabs', a))), a),
   (('fmax', a, ('fabs', a)), ('fabs', a)),
   (('imax', a, ('iabs', a)), ('iabs', a)),
   (('fmax', a, ('fneg', a)), ('fabs', a)),
   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
   # fsat(a) returns 0.0.
   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
   (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN.  This only matters if
   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
   # representing this in the optimizations other than the usual ~.
   (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0.  Mark
   # the new comparison precise to prevent it being changed to 'a != 0'.
   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
   (('fsat', ('b2f', a)), ('b2f', a)),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat(nsz)', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
   # Both the left and right patterns are "b" when isnan(a), so this is exact.
   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
   (('fmax', ('fsat(is_used_once)', a), ('fsat(is_used_once)', b)), ('fsat', ('fmax', a, b))),
   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),

   # max(-min(b, a), b) -> max(abs(b), -a)
   # min(-max(b, a), b) -> min(-abs(b), -a)
   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),

   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
   # fsat(b-a).
   #
   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
   #
   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
    ('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'),

   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),

   # The ior versions are exact because fmin and fmax will always pick a
   # non-NaN value, if one exists.  Therefore (a < NaN) || (a < c) == a <
   # fmax(NaN, c) == a < c.  Mark the fmin or fmax in the replacement as exact
   # to prevent other optimizations from ruining the "NaN cleansing" property
   # of the fmin or fmax.
   (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),
   (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),
   (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),
   (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),
   (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),
   (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),
   (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),
   (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),
   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
   (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),
   (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),
   (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),
   (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),

   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),

   # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 ||
   # a.y < 0.0 || a.y > 1.0 || ...  These patterns rearrange and replace in a
   # single step.  Doing just the replacement can lead to an infinite loop as
   # the pattern is repeatedly applied to the result of the previous
   # application of the pattern.
   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
   (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
   (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),

   # This is how SpvOpFOrdNotEqual might be implemented.  If both values are
   # numbers, then it can be replaced with fneu.
   (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),

   # Other patterns may optimize the resulting iand tree further.
   (('umin', ('iand', a, '#b(is_pos_power_of_two)'), ('iand', c, b)),
    ('iand', ('iand', a, b), ('iand', c, b))),
])

# Float sizes
for s in [16, 32, 64]:
   if s == 64:
      match_fsign_cond = "!options->lower_fsign & !(options->lower_doubles_options & nir_lower_dsign)"
   else:
      match_fsign_cond = "!options->lower_fsign"
   optimizations.extend([
      # These derive from the previous patterns with the application of b < 0 <=>
      # 0 < -b.  The transformation should be applied if either comparison is
      # used once as this ensures that the number of comparisons will not
      # increase.  The sources to the ior and iand are not symmetric, so the
      # rules have to be duplicated to get this behavior.
      (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
      (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
      (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
      (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
      (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
      (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
      (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
      (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),

      (('ior', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fmin', ('fabs', a), ('fabs', b)), 0.0)),
      (('ior', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fadd', ('fabs', a), ('fabs', b)), 0.0)),
      (('iand', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fadd', ('fabs', a), ('fabs', b)), 0.0)),
      (('iand', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fmin', ('fabs', a), ('fabs', b)), 0.0)),

      # The (i2f32, ...) part is an open-coded fsign.  When that is combined
      # with the bcsel, it's basically copysign(1.0, a).  There are some
      # behavior differences between this pattern and copysign w.r.t. ±0 and
      # NaN.  copysign(x, y) blindly takes the sign bit from y and applies it
      # to x, regardless of whether either or both values are NaN.
copysign(x, y) blindly takes the sign bit from y and applies it 1016 # to x, regardless of whether either or both values are NaN. 1017 # 1018 # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0, 1019 # int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0 1020 # If a == ±0: bcsel(True, 1.0, ...) = 1.0, 1021 # int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1 1022 # 1023 # For all other values of 'a', the original and replacement behave as 1024 # copysign. 1025 # 1026 # Marking the replacement comparisons as precise prevents any future 1027 # optimizations from replacing either of the comparisons with the 1028 # logical-not of the other. 1029 # 1030 # Note: Use b2i32 in the replacement because some platforms that 1031 # support fp16 don't support int16. 1032 (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))), 1033 ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))), 1034 1035 (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))), 1036 1037 # The C spec says, "If the value of the integral part cannot be represented 1038 # by the integer type, the behavior is undefined." "Undefined" can mean 1039 # "the conversion doesn't happen at all." 1040 (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)), 1041 1042 # Ironically, mark these as imprecise because removing the conversions may 1043 # preserve more precision than doing the conversions (e.g., 1044 # uint(float(0x81818181u)) == 0x81818200). 1045 (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a), 1046 (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a), 1047 (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a), 1048 (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a), 1049 1050 (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), match_fsign_cond), 1051 (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), match_fsign_cond), 1052 1053 # float? -> float? -> floatS ==> float? -> floatS 1054 (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)), 1055 1056 # int? -> float? -> floatS ==> int? -> floatS 1057 (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)), 1058 (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)), 1059 1060 # float? -> float? -> intS ==> float? -> intS 1061 (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)), 1062 (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)), 1063 1064 # HLSL's sign function returns an integer 1065 (('i2f{}'.format(s), ('f2i', ('fsign', 'a@{}'.format(s)))), ('fsign', a)), 1066 ]) 1067 1068 for B in [32, 64]: 1069 if s < B: 1070 optimizations.extend([ 1071 # S = smaller, B = bigger 1072 # floatS -> floatB -> floatS ==> identity 1073 (('~f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a), 1074 1075 # floatS -> floatB -> intB ==> floatS -> intB 1076 (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)), 1077 (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)), 1078 1079 # int? -> floatB -> floatS ==> int? 
-> floatS 1080 (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)), 1081 (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)), 1082 ]) 1083 1084for S in [1, 8, 16, 32]: 1085 for B in [8, 16, 32, 64]: 1086 if B <= S: 1087 continue 1088 optimizations.extend([ 1089 # intS -> intB -> intS ==> identity 1090 (('i2i{}'.format(S), ('i2i{}'.format(B), 'a@{}'.format(S))), a), 1091 (('u2u{}'.format(S), ('u2u{}'.format(B), 'a@{}'.format(S))), a), 1092 ]) 1093 1094 if B < 16: 1095 continue 1096 for C in [8, 16, 32, 64]: 1097 if C <= S: 1098 continue 1099 optimizations.extend([ 1100 # intS -> intC -> floatB ==> intS -> floatB 1101 (('u2f{}'.format(B), ('u2u{}'.format(C), 'a@{}'.format(S))), ('u2f{}'.format(B), a)), 1102 (('i2f{}'.format(B), ('i2i{}'.format(C), 'a@{}'.format(S))), ('i2f{}'.format(B), a)), 1103 ]) 1104 1105# mediump variants of the above 1106optimizations.extend([ 1107 # int32 -> float32 -> float16 ==> int32 -> float16 1108 (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)), 1109 (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)), 1110 1111 # float32 -> float16 -> int16 ==> float32 -> int16 1112 (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)), 1113 (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)), 1114 1115 # float32 -> int32 -> int16 ==> float32 -> int16 1116 (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)), 1117 (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)), 1118 1119 # int32 -> int16 -> float16 ==> int32 -> float16 1120 (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)), 1121 (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)), 1122]) 1123 1124# Clean up junk left from 8-bit integer to 16-bit integer lowering. 1125optimizations.extend([ 1126 # The u2u16(u2u8(X)) just masks off the upper 8 bits of X. This can be 1127 # accomplished by masking the upper 8 bits of the immediate operand of the 1128 # iand instruction. Oftentimes, both patterns will end up being applied 1129 # to the same original expression tree.
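   # For example, with b = 0x1234 the first rule below turns
   # iand(u2u16(u2u8(a)), 0x1234) into iand(a, 0x1234 & 0xff) = iand(a, 0x34).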
1130 (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'), ('iand', a, ('iand', b, 0xff))), 1131 (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))), 1132]) 1133 1134for op in ['iand', 'ior', 'ixor']: 1135 optimizations.extend([ 1136 (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))), 1137 (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))), 1138 1139 # Undistribute extract from a logic op 1140 ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)), 1141 ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)), 1142 ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)), 1143 ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)), 1144 1145 # Undistribute shifts from a logic op 1146 ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)), 1147 ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)), 1148 ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)), 1149 ]) 1150 1151# Integer sizes 1152for s in [8, 16, 32, 64]: 1153 last_shift_bit = int(math.log2(s)) - 1 1154 1155 lower_umin = 'options->lower_umin' 1156 lower_umax = 'options->lower_umax' 1157 lower_imin = 'false' 1158 lower_imax = 'false' 1159 lower_ior = 'options->lower_bitops' 1160 if s == 64: 1161 lower_umin = '(options->lower_umin || (options->lower_int64_options & nir_lower_minmax64) != 0)' 1162 lower_umax = '(options->lower_umax || (options->lower_int64_options & nir_lower_minmax64) != 0)' 1163 lower_imin = '((options->lower_int64_options & nir_lower_minmax64) != 0)' 1164 lower_imax = '((options->lower_int64_options & nir_lower_minmax64) != 0)' 1165 lower_ior = '(options->lower_bitops || (options->lower_int64_options & nir_lower_logic64) != 0)' 1166 1167 optimizations.extend([ 1168 (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), lower_umax + ' && !' + lower_ior), 1169 (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), lower_umin + ' && !' + lower_ior), 1170 (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!'+lower_umax), 1171 (('ior', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!'+lower_umin), 1172 (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!'+lower_umin), 1173 (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!'+lower_umax), 1174 1175 (('bcsel', ('ult', 'b@{}'.format(s), a), b, a), ('umin', a, b), '!'+lower_umin), 1176 (('bcsel', ('ult', 'a@{}'.format(s), b), b, a), ('umax', a, b), '!'+lower_umax), 1177 (('bcsel', ('uge', 'a@{}'.format(s), b), b, a), ('umin', a, b), '!'+lower_umin), 1178 (('bcsel', ('uge', 'b@{}'.format(s), a), b, a), ('umax', a, b), '!'+lower_umax), 1179 (('bcsel', ('ilt', 'b@{}'.format(s), a), b, a), ('imin', a, b), '!'+lower_imin), 1180 (('bcsel', ('ilt', 'a@{}'.format(s), b), b, a), ('imax', a, b), '!'+lower_imax), 1181 (('bcsel', ('ige', 'a@{}'.format(s), b), b, a), ('imin', a, b), '!'+lower_imin), 1182 (('bcsel', ('ige', 'b@{}'.format(s), a), b, a), ('imax', a, b), '!'+lower_imax), 1183 1184 # True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True). 
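      # Checking both boolean values: ineg(b2i(~0)) = ineg(1) = -1 = ~0, and
      # ineg(b2i(0)) = ineg(0) = 0, so the result always equals a.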
1185 (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a), 1186 1187 # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits) 1188 (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)), 1189 (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)), 1190 (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)), 1191 (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 1), last_shift_bit)), ('ushr', a, ('ishl', b, last_shift_bit))), 1192 ]) 1193 1194optimizations.extend([ 1195 # Common pattern like 'if (i == 0 || i == 1 || ...)' 1196 (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)), 1197 (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)), 1198 (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)), 1199 (('ior', a, ('ieq', a, False)), True), 1200 1201 (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)), 1202 (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))), 1203 1204 (('ishl', ('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), '#c'), 1205 ('bcsel', ('ige', ('iand', c, 31), ('find_lsb', b)), 1206 ('ishl', ('iand', a, b), ('iadd', ('iand', c, 31), ('ineg', ('find_lsb', b)))), 1207 ('ushr', ('iand', a, b), ('iadd', ('ineg', ('iand', c, 31)), ('find_lsb', b))) 1208 ) 1209 ), 1210 1211 (('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), 1212 ('ushr', ('iand', a, b), ('find_lsb', b)), '!options->lower_bitops'), 1213 1214 (('ior', ('b2i', a), ('iand', b, 1)), ('iand', ('ior', ('b2i', a), b), 1)), 1215 (('iand', ('b2i', a), ('iand', b, 1)), ('iand', ('b2i', a), b)), 1216 1217 # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code. 1218 # The first part of the iand comes from the !__feq64_nonnan. 1219 # 1220 # The second pattern is a reformulation of the first based on the relation 1221 # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation 1222 # happens to be y == 0. 1223 (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)), 1224 ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))), 1225 (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)), 1226 ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))), 1227 1228 # These patterns can result when (a < b || a < c) => (a < min(b, c)) 1229 # transformations occur before constant propagation and loop-unrolling. 1230 # 1231 # The flt versions are exact. If isnan(a), the original pattern is 1232 # trivially false, and the replacements are false too.
If isnan(b): 1233 # 1234 # a < fmax(NaN, a) => a < a => false vs a < NaN => false 1235 (('flt', a, ('fmax', b, a)), ('flt', a, b)), 1236 (('flt', ('fmin', a, b), a), ('flt', b, a)), 1237 (('~fge', a, ('fmin', b, a)), True), 1238 (('~fge', ('fmax', a, b), a), True), 1239 (('flt', a, ('fmin', b, a)), False), 1240 (('flt', ('fmax', a, b), a), False), 1241 (('~fge', a, ('fmax', b, a)), ('fge', a, b)), 1242 (('~fge', ('fmin', a, b), a), ('fge', b, a)), 1243 1244 (('ilt', a, ('imax', b, a)), ('ilt', a, b)), 1245 (('ilt', ('imin', a, b), a), ('ilt', b, a)), 1246 (('ige', a, ('imin', b, a)), True), 1247 (('ige', ('imax', a, b), a), True), 1248 (('ult', a, ('umax', b, a)), ('ult', a, b)), 1249 (('ult', ('umin', a, b), a), ('ult', b, a)), 1250 (('uge', a, ('umin', b, a)), True), 1251 (('uge', ('umax', a, b), a), True), 1252 (('ilt', a, ('imin', b, a)), False), 1253 (('ilt', ('imax', a, b), a), False), 1254 (('ige', a, ('imax', b, a)), ('ige', a, b)), 1255 (('ige', ('imin', a, b), a), ('ige', b, a)), 1256 (('ult', a, ('umin', b, a)), False), 1257 (('ult', ('umax', a, b), a), False), 1258 (('uge', a, ('umax', b, a)), ('uge', a, b)), 1259 (('uge', ('umin', a, b), a), ('uge', b, a)), 1260 (('ult', a, ('iand', b, a)), False), 1261 (('ult', ('ior', a, b), a), False), 1262 (('uge', a, ('iand', b, a)), True), 1263 (('uge', ('ior', a, b), a), True), 1264 1265 (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))), 1266 (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))), 1267 (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))), 1268 (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))), 1269 (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))), 1270 (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))), 1271 (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))), 1272 (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))), 1273 (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))), 1274 (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))), 1275 (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))), 1276 (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))), 1277 (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))), 1278 (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))), 1279 (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))), 1280 (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))), 1281 1282 # Thanks to sign extension, the ishr(a, b) is negative if and only if a is 1283 # negative. 
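   # For example, a = -8, b = 2: ishr(a, b) = -2, so the bcsel below selects
   # ineg(-2) = 2, which is exactly iabs(ishr(a, b)).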
1284 (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)), 1285 ('iabs', ('ishr', a, b))), 1286 (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)), 1287 1288 (('fabs', ('slt', a, b)), ('slt', a, b)), 1289 (('fabs', ('sge', a, b)), ('sge', a, b)), 1290 (('fabs', ('seq', a, b)), ('seq', a, b)), 1291 (('fabs', ('sne', a, b)), ('sne', a, b)), 1292 (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'), 1293 (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), 1294 (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), 1295 (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'), 1296 (('seq', ('seq', a, b), 1.0), ('seq', a, b)), 1297 (('seq', ('sne', a, b), 1.0), ('sne', a, b)), 1298 (('seq', ('slt', a, b), 1.0), ('slt', a, b)), 1299 (('seq', ('sge', a, b), 1.0), ('sge', a, b)), 1300 (('sne', ('seq', a, b), 0.0), ('seq', a, b)), 1301 (('sne', ('sne', a, b), 0.0), ('sne', a, b)), 1302 (('sne', ('slt', a, b), 0.0), ('slt', a, b)), 1303 (('sne', ('sge', a, b), 0.0), ('sge', a, b)), 1304 (('seq', ('seq', a, b), 0.0), ('sne', a, b)), 1305 (('seq', ('sne', a, b), 0.0), ('seq', a, b)), 1306 (('seq', ('slt', a, b), 0.0), ('sge', a, b)), 1307 (('seq', ('sge', a, b), 0.0), ('slt', a, b)), 1308 (('sne', ('seq', a, b), 1.0), ('sne', a, b)), 1309 (('sne', ('sne', a, b), 1.0), ('seq', a, b)), 1310 (('sne', ('slt', a, b), 1.0), ('sge', a, b)), 1311 (('sne', ('sge', a, b), 1.0), ('slt', a, b)), 1312 (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1313 (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'), 1314 (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'), 1315 (('fall_equal8', a, b), ('seq', ('fany_nequal8', a, b), 0.0), 'options->lower_vector_cmp'), 1316 (('fall_equal16', a, b), ('seq', ('fany_nequal16', a, b), 0.0), 'options->lower_vector_cmp'), 1317 (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1318 (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1319 (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1320 (('fany_nequal8', a, b), ('fsat', ('fdot8', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1321 (('fany_nequal16', a, b), ('fsat', ('fdot16', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1322]) 1323 1324def vector_cmp(reduce_op, cmp_op, comps): 1325 if len(comps) == 1: 1326 return (cmp_op, 'a.' + comps[0], 'b.' 
+ comps[0]) 1327 else: 1328 mid = len(comps) // 2 1329 return (reduce_op, vector_cmp(reduce_op, cmp_op, comps[:mid]), 1330 vector_cmp(reduce_op, cmp_op, comps[mid:])) 1331 1332for op in [ 1333 ('ball_iequal', 'ieq', 'iand'), 1334 ('ball_fequal', 'feq', 'iand'), 1335 ('bany_inequal', 'ine', 'ior'), 1336 ('bany_fnequal', 'fneu', 'ior'), 1337]: 1338 optimizations.extend([ 1339 ((op[0] + '2', a, b), vector_cmp(op[2], op[1], 'xy'), 'options->lower_vector_cmp'), 1340 ((op[0] + '3', a, b), vector_cmp(op[2], op[1], 'xyz'), 'options->lower_vector_cmp'), 1341 ((op[0] + '4', a, b), vector_cmp(op[2], op[1], 'xyzw'), 'options->lower_vector_cmp'), 1342 ((op[0] + '8', a, b), vector_cmp(op[2], op[1], 'abcdefgh'), 'options->lower_vector_cmp'), 1343 ((op[0] + '16', a, b), vector_cmp(op[2], op[1], 'abcdefghijklmnop'), 'options->lower_vector_cmp'), 1344 ]) 1345 1346optimizations.extend([ 1347 (('feq', ('seq', a, b), 1.0), ('feq', a, b)), 1348 (('feq', ('sne', a, b), 1.0), ('fneu', a, b)), 1349 (('feq', ('slt', a, b), 1.0), ('flt', a, b)), 1350 (('feq', ('sge', a, b), 1.0), ('fge', a, b)), 1351 (('fneu', ('seq', a, b), 0.0), ('feq', a, b)), 1352 (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)), 1353 (('fneu', ('slt', a, b), 0.0), ('flt', a, b)), 1354 (('fneu', ('sge', a, b), 0.0), ('fge', a, b)), 1355 (('feq', ('seq', a, b), 0.0), ('fneu', a, b)), 1356 (('feq', ('sne', a, b), 0.0), ('feq', a, b)), 1357 (('feq', ('slt', a, b), 0.0), ('fge', a, b)), 1358 (('feq', ('sge', a, b), 0.0), ('flt', a, b)), 1359 (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)), 1360 (('fneu', ('sne', a, b), 1.0), ('feq', a, b)), 1361 (('fneu', ('slt', a, b), 1.0), ('fge', a, b)), 1362 (('fneu', ('sge', a, b), 1.0), ('flt', a, b)), 1363 1364 (('fneu', ('fneg', a), a), ('fneu', a, 0.0)), 1365 (('feq', ('fneg', a), a), ('feq', a, 0.0)), 1366 # Emulating booleans 1367 (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1368 (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1369 (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1370 (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 1371 (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))), 1372 (('iand', 'a@bool16', 1.0), ('b2f', a)), 1373 (('iand', 'a@bool32', 1.0), ('b2f', a)), 1374 (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. 1375 # Comparison with the same args. Note that these are only done for the 1376 # float versions when the source must be a number. Generally, NaN cmp NaN 1377 # produces the opposite result of X cmp X. flt is the outlier. NaN < NaN 1378 # is false, and, for any number X, X < X is also false. 
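   # e.g. feq(NaN, NaN) is false while feq(x, x) is true for any number x, so
   # feq(a, a) -> true needs the is_a_number qualifier.  flt(NaN, NaN) and
   # flt(x, x) are both false, so flt(a, a) -> false needs no qualifier.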
1379 (('ilt', a, a), False), 1380 (('ige', a, a), True), 1381 (('ieq', a, a), True), 1382 (('ine', a, a), False), 1383 (('ult', a, a), False), 1384 (('uge', a, a), True), 1385 (('flt', a, a), False), 1386 (('fge', 'a(is_a_number)', a), True), 1387 (('feq', 'a(is_a_number)', a), True), 1388 (('fneu', 'a(is_a_number)', a), False), 1389 # Logical and bit operations 1390 (('iand', a, a), a), 1391 (('iand', a, 0), 0), 1392 (('iand', a, -1), a), 1393 (('iand', a, ('inot', a)), 0), 1394 (('ior', a, a), a), 1395 (('ior', a, 0), a), 1396 (('ior', a, -1), -1), 1397 (('ior', a, ('inot', a)), -1), 1398 (('ixor', a, a), 0), 1399 (('ixor', a, 0), a), 1400 (('ixor', a, ('ixor', a, b)), b), 1401 (('ixor', a, -1), ('inot', a)), 1402 (('inot', ('inot', a)), a), 1403 (('ior', ('iand', a, b), b), b), 1404 (('ior', ('ior', a, b), b), ('ior', a, b)), 1405 (('iand', ('ior', a, b), b), b), 1406 (('iand', ('iand', a, b), b), ('iand', a, b)), 1407 1408 # It is common for sequences of (x & 1) to occur in large trees. Replacing 1409 # an expression like ((a & 1) & (b & 1)) with ((a & b) & 1) allows the "& 1410 # 1" to eventually bubble up to the top of the tree. 1411 (('iand', ('iand(is_used_once)', a, b), ('iand(is_used_once)', a, c)), 1412 ('iand', a, ('iand', b, c))), 1413 1414 (('iand@64', a, '#b(is_lower_half_zero)'), 1415 ('pack_64_2x32_split', 0, 1416 ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1417 '!options->lower_pack_64_2x32_split'), 1418 (('iand@64', a, '#b(is_upper_half_zero)'), 1419 ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1420 0), 1421 '!options->lower_pack_64_2x32_split'), 1422 (('iand@64', a, '#b(is_lower_half_negative_one)'), 1423 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1424 ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1425 '!options->lower_pack_64_2x32_split'), 1426 (('iand@64', a, '#b(is_upper_half_negative_one)'), 1427 ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1428 ('unpack_64_2x32_split_y', a)), 1429 '!options->lower_pack_64_2x32_split'), 1430 1431 (('ior@64', a, '#b(is_lower_half_zero)'), 1432 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1433 ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1434 '!options->lower_pack_64_2x32_split'), 1435 (('ior@64', a, '#b(is_upper_half_zero)'), 1436 ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1437 ('unpack_64_2x32_split_y', a)), 1438 '!options->lower_pack_64_2x32_split'), 1439 (('ior@64', a, '#b(is_lower_half_negative_one)'), 1440 ('pack_64_2x32_split', -1, 1441 ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1442 '!options->lower_pack_64_2x32_split'), 1443 (('ior@64', a, '#b(is_upper_half_negative_one)'), 1444 ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1445 -1), 1446 '!options->lower_pack_64_2x32_split'), 1447 1448 (('ixor@64', a, '#b(is_lower_half_zero)'), 1449 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1450 ('ixor', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1451 '!options->lower_pack_64_2x32_split'), 1452 (('ixor@64', a, '#b(is_upper_half_zero)'), 1453 ('pack_64_2x32_split', ('ixor', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1454 ('unpack_64_2x32_split_y', a)), 1455 '!options->lower_pack_64_2x32_split'), 1456 1457 # DeMorgan's Laws 1458 (('iand', ('inot', a), ('inot', b)), 
('inot', ('ior', a, b))), 1459 (('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))), 1460 # Shift optimizations 1461 (('ishl', 0, a), 0), 1462 (('ishl', a, 0), a), 1463 (('ishr', 0, a), 0), 1464 (('ishr', -1, a), -1), 1465 (('ishr', a, 0), a), 1466 (('ushr', 0, a), 0), 1467 (('ushr', a, 0), a), 1468 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), 'options->has_rotate16'), 1469 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), 'options->has_rotate16'), 1470 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), 'options->has_rotate32'), 1471 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), 'options->has_rotate32'), 1472 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), 'options->has_rotate16'), 1473 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), 'options->has_rotate16'), 1474 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), 'options->has_rotate32'), 1475 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), 'options->has_rotate32'), 1476 (('urol@8', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 8, b))), '!options->has_rotate8'), 1477 (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), '!options->has_rotate16'), 1478 (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), '!options->has_rotate32'), 1479 (('urol@64', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 64, b)))), 1480 (('uror@8', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 8, b))), '!options->has_rotate8'), 1481 (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), '!options->has_rotate16'), 1482 (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), '!options->has_rotate32'), 1483 (('uror@64', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 64, b)))), 1484 1485 (('bitfield_select', 0xff000000, ('ishl', 'b@32', 24), ('ushr', a, 8)), ('shfr', b, a, 8), 'options->has_shfr32'), 1486 (('bitfield_select', 0xffff0000, ('ishl', 'b@32', 16), ('extract_u16', a, 1)), ('shfr', b, a, 16), 'options->has_shfr32'), 1487 (('bitfield_select', 0xffffff00, ('ishl', 'b@32', 8), ('extract_u8', a, 3)), ('shfr', b, a, 24), 'options->has_shfr32'), 1488 (('ior', ('ishl', 'b@32', 24), ('ushr', a, 8)), ('shfr', b, a, 8), 'options->has_shfr32'), 1489 (('ior', ('ishl', 'b@32', 16), ('extract_u16', a, 1)), ('shfr', b, a, 16), 'options->has_shfr32'), 1490 (('ior', ('ishl', 'b@32', 8), ('extract_u8', a, 3)), ('shfr', b, a, 24), 'options->has_shfr32'), 1491 (('ior', ('ishl', 'b@32', ('iadd', 32, ('ineg', c))), ('ushr@32', a, c)), ('shfr', b, a, c), 'options->has_shfr32'), 1492 1493 # bfi(X, a, b) = (b & ~X) | (a & X) 1494 # If X = ~0: (b & 0) | (a & 0xffffffff) = a 1495 # If X = 0: (b & 0xffffffff) | (a & 0) = b 1496 (('bfi', 0xffffffff, a, b), a), 1497 (('bfi', 0x00000000, a, b), b), 1498 1499 # The result of -int(some_bool) is 0 or 0xffffffff, so the result of the 1500 # bfi is either b or c. 
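   # Concretely: a = true gives a mask of 0xffffffff, and bfi(0xffffffff, b, c)
   # is b; a = false gives a mask of 0, and bfi(0, b, c) is c.  That is exactly
   # bcsel(a, b, c) (see the bfi identities above).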
1501 (('bfi', ('ineg', ('b2i', 'a@1')), b, c), ('bcsel', a, b, c)), 1502 1503 # bfi(a, a, b) = ((a << find_lsb(a)) & a) | (b & ~a) 1504 # = (a & a) | (b & ~a) If a is odd, find_lsb(a) == 0 1505 # = a | (b & ~a) 1506 # = a | b 1507 (('bfi', '#a(is_odd)', a, b), ('ior', a, b)), 1508 1509 # bfi(a, b, 0) = ((b << find_lsb(a)) & a) | (0 & ~a) 1510 # = ((b << find_lsb(a)) & a) 1511 # = (b & a) If a is odd, find_lsb(a) == 0 1512 (('bfi', '#a(is_odd)', b, 0), ('iand', a, b)), 1513 1514 # Because 'a' is a positive power of two, the result of the bfi is either 0 1515 # or 'a' depending on whether or not 'b' is odd. Use 'b&1' for the zero 1516 # value to help platforms that can't have two constants in a bcsel. 1517 (('u2f32', ('bfi', '#a(is_pos_power_of_two)', b, 0)), 1518 ('bcsel', ('ieq', ('iand', b, 1), 0), ('iand', b, 1), ('u2f', a))), 1519 (('u2f', ('bfi', '#a(is_pos_power_of_two)', b, 0)), 1520 ('bcsel', ('ieq', ('iand', b, 1), 0), 0, ('u2f', a))), 1521 1522 # Exponential/logarithmic identities 1523 (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a 1524 (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a 1525 # 32-bit fpow should use fmulz to fix https://gitlab.freedesktop.org/mesa/mesa/-/issues/11464 (includes apitrace) 1526 (('fpow@32', a, b), ('fexp2', ('fmulz', ('flog2', a), b)), 'options->lower_fpow && ' + has_fmulz), # a^b = 2^(lg2(a)*b) 1527 (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b) 1528 (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b 1529 (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), 1530 ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d 1531 (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)), 1532 (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)), 1533 (('~fexp2', ('fmul', ('flog2', a), 3.0)), ('fmul', ('fmul', a, a), a)), 1534 (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1535 (('~fexp2', ('fmul', ('flog2', a), 5.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), a)), 1536 (('~fexp2', ('fmul', ('flog2', a), 6.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', a, a))), 1537 (('~fexp2', ('fmul', ('flog2', a), 8.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', ('fmul', a, a), ('fmul', a, a)))), 1538 (('~fpow', a, 1.0), a), 1539 (('~fpow', a, 2.0), ('fmul', a, a)), 1540 (('~fpow', a, 3.0), ('fmul', ('fmul', a, a), a)), 1541 (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1542 (('~fpow', 2.0, a), ('fexp2', a)), 1543 (('~fpow', ('fpow', a, 2.2), 0.454545), a), 1544 (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), 1545 (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), 1546 (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), 1547 (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), 1548 (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), 1549 (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))), 1550 (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), 1551 (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), 1552 (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))), 1553 (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))), 1554 (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)), 1555 (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)), 1556 # Division and reciprocal 1557 (('~fdiv', 1.0, a), ('frcp', 
a)), 1558 (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), 1559 (('~frcp', ('frcp', a)), a), 1560 (('~frcp', ('fsqrt', a)), ('frsq', a)), 1561 (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), 1562 (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), 1563 # Trig 1564 (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'), 1565 (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'), 1566 # Boolean simplifications 1567 (('ieq', a, True), a), 1568 (('ine(is_not_used_by_if)', a, True), ('inot', a)), 1569 (('ine', a, False), a), 1570 (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')), 1571 (('bcsel', a, True, False), a), 1572 (('bcsel', a, False, True), ('inot', a)), 1573 (('bcsel', True, b, c), b), 1574 (('bcsel', False, b, c), c), 1575 1576 (('bcsel@16', a, 1.0, 0.0), ('b2f', a)), 1577 (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))), 1578 (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1579 (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1580 (('bcsel@32', a, 1.0, 0.0), ('b2f', a)), 1581 (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))), 1582 (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1583 (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1584 (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1585 (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1586 (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1587 (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1588 1589 (('bcsel', a, b, b), b), 1590 (('~fcsel', a, b, b), b), 1591 1592 # D3D Boolean emulation 1593 (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))), 1594 (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))), 1595 (('bcsel', a, 1, 0), ('b2i', 'a@1')), 1596 (('bcsel', a, 0, 1), ('b2i', ('inot', a))), 1597 (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1598 ('ineg', ('b2i', ('iand', a, b)))), 1599 (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))), 1600 ('ineg', ('b2i', ('ior', a, b)))), 1601 (('ieq', ('ineg', ('b2i', 'a@1')), -1), a), 1602 (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), 1603 (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), 1604 (('ilt', ('ineg', ('b2i', 'a@1')), 0), a), 1605 (('ult', 0, ('ineg', ('b2i', 'a@1'))), a), 1606 (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), 1607 (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)), 1608 1609 # With D3D booleans, imax is AND and umax is OR 1610 (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1611 ('ineg', ('b2i', ('iand', a, b)))), 1612 (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1613 ('ineg', ('b2i', ('ior', a, b)))), 1614 (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1615 ('ineg', ('b2i', ('ior', a, b)))), 1616 (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1617 ('ineg', ('b2i', ('iand', a, b)))), 1618 (('umax', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1619 (('umin', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1620 1621 # Clean up LLVM booleans. b2i output is 0/1 so iand is a no-op. 
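   # In the umin/umax rules a little further below, ineg(b2i(a)) is 0 or ~0,
   # so umin(~0, b) = b and umin(0, b) = 0: the umin is nonzero exactly when
   # a is true and b != 0, and the umax is nonzero when either holds.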
1622 (('iand', ('b2i', a), 1), ('b2i', a)), 1623 1624 (('ine', ('umin', ('ineg', ('b2i', 'a@1')), b), 0), ('iand', a, ('ine', b, 0))), 1625 (('ine', ('umax', ('ineg', ('b2i', 'a@1')), b), 0), ('ior' , a, ('ine', b, 0))), 1626 1627 # Conversions 1628 (('f2i', ('ftrunc', a)), ('f2i', a)), 1629 (('f2u', ('ftrunc', a)), ('f2u', a)), 1630 1631 # Conversions from 16 bits to 32 bits and back can always be removed 1632 (('f2fmp', ('f2f32', 'a@16')), a), 1633 (('i2imp', ('i2i32', 'a@16')), a), 1634 (('i2imp', ('u2u32', 'a@16')), a), 1635 1636 (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)), 1637 (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)), 1638 (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)), 1639 (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)), 1640 1641 (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)), 1642 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1643 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1644 1645 (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)), 1646 (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)), 1647 (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1648 (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1649 1650 # Conversions to 16 bits would be lossy so they should only be removed if 1651 # the instruction was generated by the precision lowering pass. 1652 (('f2f32', ('f2fmp', 'a@32')), a), 1653 (('i2i32', ('i2imp', 'a@32')), a), 1654 (('u2u32', ('i2imp', 'a@32')), a), 1655 1656 # typeA@32 -> typeB@16 -> typeB@32 ==> typeA@32 -> typeB@32 1657 (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)), 1658 (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)), 1659 (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)), 1660 (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)), 1661 1662 # typeA@32 -> typeA@16 -> typeB@32 ==> typeA@32 -> typeB@32 1663 (('f2i32', ('f2fmp', 'a@32')), ('f2i32', a)), 1664 (('f2u32', ('f2fmp', 'a@32')), ('f2u32', a)), 1665 (('i2f32', ('i2imp', 'a@32')), ('i2f32', a)), 1666 1667 (('ffloor', 'a(is_integral)'), a), 1668 (('fceil', 'a(is_integral)'), a), 1669 (('ftrunc', 'a(is_integral)'), a), 1670 (('fround_even', 'a(is_integral)'), a), 1671 1672 # fract(x) = x - floor(x), so fract(NaN) = NaN 1673 (('~ffract', 'a(is_integral)'), 0.0), 1674 (('fabs', 'a(is_not_negative)'), a), 1675 (('iabs', 'a(is_not_negative)'), a), 1676 (('fsat', 'a(is_not_positive)'), 0.0), 1677 1678 (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'), 1679 1680 # The result of the multiply must be in [-1, 0], so the result of the ffma 1681 # must be in [0, 1]. 1682 (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False), 1683 (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False), 1684 (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)), 1685 (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)), 1686 1687 (('fneu', 'a(is_not_zero)', 0.0), True), 1688 (('feq', 'a(is_not_zero)', 0.0), False), 1689 1690 # In this chart, + means value > 0 and - means value < 0. 1691 # 1692 # + >= + -> unknown 0 >= + -> false - >= + -> false 1693 # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false 1694 # + >= - -> true 0 >= - -> true - >= - -> unknown 1695 # 1696 # Using grouping conceptually similar to a Karnaugh map... 
1697 # 1698 # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true 1699 # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false 1700 # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false 1701 # 1702 # The flt / ilt cases just invert the expected result. 1703 # 1704 # The results expecting true, must be marked imprecise. The results 1705 # expecting false are fine because NaN compared >= or < anything is false. 1706 1707 (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True), 1708 (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1709 (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1710 1711 (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1712 (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'), True), 1713 (('flt', 'a(is_a_number_lt_zero)', 'b(is_a_number_not_negative)'), True), 1714 1715 (('ine', 'a(is_not_zero)', 0), True), 1716 (('ieq', 'a(is_not_zero)', 0), False), 1717 1718 (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True), 1719 (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1720 (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1721 1722 (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1723 (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), 1724 (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), 1725 1726 (('ult', 0, 'a(is_gt_zero)'), True), 1727 (('ult', a, 0), False), 1728 1729 # Packing and then unpacking does nothing 1730 (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a), 1731 (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b), 1732 (('unpack_64_2x32_split_x', ('pack_64_2x32', a)), 'a.x'), 1733 (('unpack_64_2x32_split_y', ('pack_64_2x32', a)), 'a.y'), 1734 (('unpack_64_2x32_split_x', ('u2u64', 'a@32')), a), 1735 (('unpack_64_2x32_split_y', ('u2u64', a)), 0), 1736 (('unpack_64_2x32_split_x', ('i2i64', 'a@32')), a), 1737 (('unpack_64_2x32_split_y', ('i2i64(is_used_once)', 'a@32')), ('ishr', a, 31)), 1738 (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)), 1739 (('unpack_64_2x32', ('pack_64_2x32', a)), a), 1740 (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a), 1741 (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1742 ('unpack_64_2x32_split_y', a)), a), 1743 (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a), 1744 ('unpack_64_2x32_split_y', a))), a), 1745 (('pack_64_2x32', ('unpack_64_2x32', a)), a), 1746 (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a), 1747 1748 (('unpack_64_4x16', ('pack_64_4x16', a)), a), 1749 (('unpack_64_4x16', ('pack_64_2x32', ('vec2', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d)))), ('vec4', a, b, c, d)), 1750 (('unpack_64_4x16', ('pack_64_2x32_split', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d))), ('vec4', a, b, c, d)), 1751 1752 # Comparing two halves of an unpack separately. While this optimization 1753 # should be correct for non-constant values, it's less obvious that it's 1754 # useful in that case. For constant values, the pack will fold and we're 1755 # guaranteed to reduce the whole tree to one instruction. 
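   # For example, comparing the low half of a against 0x1234 and the high half
   # against 0xabcd becomes ieq(a, pack_32_2x16_split(0x1234, 0xabcd)), and the
   # pack of two constants folds to a single ieq(a, 0xabcd1234).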
1756 (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'), 1757 ('ieq', ('unpack_32_2x16_split_y', a), '#c')), 1758 ('ieq', a, ('pack_32_2x16_split', b, c))), 1759 1760 # Byte extraction 1761 (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1762 (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1763 (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'), 1764 (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1765 (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1766 (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'), 1767 (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1768 (('ishr', ('iand', a, 0x0000ff00), 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1769 (('ishr', ('iand', a, 0x00ff0000), 16), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1770 1771 # Common pattern in many Vulkan CTS tests that read 8-bit integers from a 1772 # storage buffer. 1773 (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'), 1774 (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'), 1775 1776 # Common pattern after lowering 8-bit integers to 16-bit. 1777 (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))), 1778 (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))), 1779 1780 (('ubfe', a, 0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1781 (('ubfe', a, 8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1782 (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1783 (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1784 (('ibfe', a, 0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'), 1785 (('ibfe', a, 8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1786 (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'), 1787 (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1788 1789 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 1790 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 1791 1792 # The extract_X8(a & 0xff) patterns aren't included because the iand will 1793 # already be converted to extract_u8. 
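   # The rules below are safe because each extract reads only the byte that the
   # mask preserves, e.g. extract_i8(a & 0x0000ff00, 1) reads only bits 8-15,
   # which the mask leaves untouched.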
1794 (('extract_i8', ('iand', a, 0x0000ff00), 1), ('extract_i8', a, 1)), 1795 (('extract_i8', ('iand', a, 0x00ff0000), 2), ('extract_i8', a, 2)), 1796 (('extract_i8', ('iand', a, 0xff000000), 3), ('extract_i8', a, 3)), 1797 1798 (('extract_u8', ('iand', a, 0x0000ff00), 1), ('extract_u8', a, 1)), 1799 (('extract_u8', ('iand', a, 0x00ff0000), 2), ('extract_u8', a, 2)), 1800 (('extract_u8', ('iand', a, 0xff000000), 3), ('extract_u8', a, 3)), 1801 1802 (('iand', ('extract_u8', a, 0), '#b'), ('iand', a, ('iand', b, 0x00ff))), 1803 (('iand', ('extract_u16', a, 0), '#b'), ('iand', a, ('iand', b, 0xffff))), 1804 1805 (('ieq', ('iand', ('extract_u8', a, '#b'), '#c'), 0), ('ieq', ('iand', a, ('ishl', ('iand', c, 0x00ff), ('imul', ('i2i32', b), 8))), 0)), 1806 (('ine', ('iand', ('extract_u8', a, '#b'), '#c'), 0), ('ine', ('iand', a, ('ishl', ('iand', c, 0x00ff), ('imul', ('i2i32', b), 8))), 0)), 1807 (('ieq', ('iand', ('extract_u16(is_used_once)', a, '#b'), '#c'), 0), ('ieq', ('iand', a, ('ishl', ('iand', c, 0xffff), ('imul', ('i2i32', b), 16))), 0)), 1808 (('ine', ('iand', ('extract_u16(is_used_once)', a, '#b'), '#c'), 0), ('ine', ('iand', a, ('ishl', ('iand', c, 0xffff), ('imul', ('i2i32', b), 16))), 0)), 1809 1810 # Word extraction 1811 (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1812 (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1813 (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1814 (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1815 (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), 1816 1817 (('ubfe', a, 0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1818 (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1819 (('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1820 (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1821 1822 # Collapse nop packing. 1823 (('unpack_32_4x8', ('pack_32_4x8', a)), a), 1824 (('unpack_32_2x16', ('pack_32_2x16', a)), a), 1825 (('unpack_64_4x16', ('pack_64_4x16', a)), a), 1826 (('unpack_64_2x32', ('pack_64_2x32', a)), a), 1827 (('pack_32_4x8', ('unpack_32_4x8', a)), a), 1828 (('pack_32_2x16', ('unpack_32_2x16', a)), a), 1829 (('pack_64_4x16', ('unpack_64_4x16', a)), a), 1830 (('pack_64_2x32', ('unpack_64_2x32', a)), a), 1831 1832 # Packing a u8vec4 to write to an SSBO. 1833 (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))), 1834 ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'), 1835 1836 (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)), 1837 (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)), 1838 1839 # The extract_X16(a & 0xff) patterns aren't included because the iand will 1840 # already be converted to extract_u8. 
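   # In the next two rules the masked value has bits 24-31 clear, so the upper
   # 16-bit half of (a & 0x00ff0000) is 0x00YY.  Its sign bit is zero, sign
   # extension is a no-op, and the result equals extract_u8(a, 2) even for
   # extract_i16.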
1841 (('extract_i16', ('iand', a, 0x00ff0000), 1), ('extract_u8', a, 2), '!options->lower_extract_byte'), # extract_u8 is correct 1842 (('extract_u16', ('iand', a, 0x00ff0000), 1), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1843 1844 # Lower pack/unpack 1845 (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'), 1846 (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split || options->lower_pack_split'), 1847 (('pack_half_2x16_split', a, b), ('pack_half_2x16_rtz_split', a, b), 'options->has_pack_half_2x16_rtz'), 1848 (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'), 1849 (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'), 1850 (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'), 1851 (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'), 1852 1853 (('unpack_64_2x32_split_x', ('ushr', a, 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'), 1854 (('u2u32', ('ushr', 'a@64', 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'), 1855 1856 # Useless masking before unpacking 1857 (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)), 1858 (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)), 1859 (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)), 1860 (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)), 1861 (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)), 1862 (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)), 1863 1864 (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)), 1865 (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)), 1866 (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)), 1867 (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)), 1868 (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)), 1869 1870 # Optimize half packing 1871 (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))), 1872 (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))), 1873 1874 (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1875 ('pack_half_2x16', ('vec2', a, b))), 1876 (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1877 ('pack_half_2x16', ('vec2', a, b))), 1878 1879 (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)), 1880 (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)), 1881 (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)), 1882 1883 (('ishl', ('pack_half_2x16_rtz_split', a, 0), 16), ('pack_half_2x16_rtz_split', 0, a)), 1884 (('ushr', ('pack_half_2x16_rtz_split', 0, a), 16), ('pack_half_2x16_rtz_split', a, 0)), 1885 (('extract_u16', ('pack_half_2x16_rtz_split', 0, a), 1), ('pack_half_2x16_rtz_split', a, 0)), 1886 1887 (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, 
b)), 1888 (('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 1889 1890 (('iadd', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)), 1891 (('ior', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)), 1892 1893 (('pack_uint_2x16', ('vec2', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', b, 0))), ('pack_half_2x16_rtz_split', a, b)), 1894 1895 (('bfi', 0xffff0000, ('pack_half_2x16_split', a, b), ('pack_half_2x16_split', c, d)), 1896 ('pack_half_2x16_split', c, a)), 1897 1898 # The important part here is that ~0xf & 0xfffffffc = ~0xf. 1899 (('iand', ('bfi', 0x0000000f, '#a', b), 0xfffffffc), 1900 ('bfi', 0x0000000f, ('iand', a, 0xfffffffc), b)), 1901 (('iand', ('bfi', 0x00000007, '#a', b), 0xfffffffc), 1902 ('bfi', 0x00000007, ('iand', a, 0xfffffffc), b)), 1903 1904 # 0x0f << 3 == 0x78, so that's already the maximum possible value. 1905 (('umin', ('ishl', ('iand', a, 0xf), 3), 0x78), ('ishl', ('iand', a, 0xf), 3)), 1906 1907 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)), 1908 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)), 1909 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)), 1910 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)), 1911 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)), 1912 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)), 1913 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)), 1914 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)), 1915 1916 # Reduce intermediate precision with int64. 1917 (('u2u32', ('iadd(is_used_once)', 'a@64', b)), 1918 ('iadd', ('u2u32', a), ('u2u32', b))), 1919 1920 # Lowered pack followed by lowered unpack, for the high bits 1921 (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', b)), 32)), ('u2u32', a)), 1922 (('u2u16', ('ushr', ('ior', ('ishl', a, 16), ('u2u32', b)), 16)), ('u2u16', a)), 1923]) 1924 1925# After the ('extract_u8', a, 0) pattern, above, triggers, there will be 1926# patterns like those below. 1927for op in ('ushr', 'ishr'): 1928 optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))]) 1929 optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)]) 1930 optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)]) 1931 1932optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))]) 1933 1934# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be 1935# patterns like those below. 
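# For example, with op = 'extract_u8' and i = 2 the loop below emits
# (('extract_u8', ('ishl', 'a@32', 8), 3), ('extract_u8', a, 2)),
# i.e. byte 3 of (a << 8) is byte 2 of a.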
1936for op in ('extract_u8', 'extract_i8'): 1937 optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))]) 1938 optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)]) 1939 optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)]) 1940 1941for op, repl in [('ieq', 'ieq'), ('ine', 'ine'), 1942 ('ult', 'ult'), ('ilt', 'ult'), 1943 ('uge', 'uge'), ('ige', 'uge')]: 1944 optimizations.extend([ 1945 ((op, ('pack_64_2x32_split', a, 0), ('pack_64_2x32_split', b, 0)), (repl, a, b)), 1946 ((op, ('pack_64_2x32_split', a, 0), '#b(is_upper_half_zero)'), (repl, a, ('unpack_64_2x32_split_x', b))), 1947 ((op, '#a(is_upper_half_zero)', ('pack_64_2x32_split', b, 0)), (repl, ('unpack_64_2x32_split_x', a), b)), 1948 1949 ((op, ('pack_64_2x32_split', 0, a), ('pack_64_2x32_split', 0, b)), (op, a, b)), 1950 ((op, ('pack_64_2x32_split', 0, a), '#b(is_lower_half_zero)'), (op, a, ('unpack_64_2x32_split_y', b))), 1951 ((op, '#a(is_lower_half_zero)', ('pack_64_2x32_split', 0, b)), (op, ('unpack_64_2x32_split_y', a), b)), 1952 ]) 1953 1954optimizations.extend([ 1955 # Subtracts 1956 (('ussub_4x8_vc4', a, 0), a), 1957 (('ussub_4x8_vc4', a, ~0), 0), 1958 # Lower all Subtractions first - they can get recombined later 1959 (('fsub', a, b), ('fadd', a, ('fneg', b))), 1960 (('isub', a, b), ('iadd', a, ('ineg', b))), 1961 (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 1962 # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow. 1963 (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 1964 (('bitz', a, b), ('inot', ('bitnz', a, b))), 1965 1966 # Propagate negation up multiplication chains 1967 (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))), 1968 (('fmulz(is_used_by_non_fsat,nsz)', ('fneg', a), b), ('fneg', ('fmulz', a, b))), 1969 (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 1970 (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)), 1971 (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))), 1972 1973 # Propagate constants up multiplication chains 1974 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)), 1975 (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)), 1976 (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)), 1977 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)), 1978 (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)), 1979 (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)), 1980 (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)), 1981 # Prefer moving out a multiplication for more MAD/FMA-friendly code 1982 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)), 1983 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)), 1984 (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 
'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)), 1985 (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)), 1986 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)), 1987 1988 # Reassociate constants in add/mul chains so they can be folded together. 1989 # For now, we mostly only handle cases where the constants are separated by 1990 # a single non-constant. We could do better eventually. 1991 (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)), 1992 (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)), 1993 (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)), 1994 (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)), 1995 (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)), 1996 (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)), 1997 (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)), 1998 (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), 1999 (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), 2000 (('~fadd', '#a', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffma', b, c, ('fadd', a, d))), 2001 (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 2002 (('~fadd', '#a', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffmaz', b, c, ('fadd', a, d))), 2003 (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 2004 (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), 2005 (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)), 2006 (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)), 2007 (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)), 2008 (('ior', ('iand', a, '#c'), ('ior', b, ('iand', a, '#d'))), ('ior', b, ('iand', a, ('ior', c, d)))), 2009 2010 # Reassociate add chains for more MAD/FMA-friendly code 2011 (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)), 2012 2013 # Drop mul-div by the same value when there's no wrapping. 2014 (('idiv', ('imul(no_signed_wrap)', a, b), b), a), 2015 2016 # By definition... 
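   # find_lsb, ifind_msb, ufind_msb and their _rev variants return -1 exactly
   # when nothing is found (e.g. find_lsb(0) = -1, ifind_msb(-1) = -1), so in
   # the first few rules below the bcsel is redundant: the -1 arm is chosen
   # only when the find opcode would have returned -1 anyway.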
2017 (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), 2018 (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 2019 (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 2020 (('bcsel', ('ige', ('ifind_msb_rev', a), 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 2021 (('bcsel', ('ige', ('ufind_msb_rev', a), 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)), 2022 2023 (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)), 2024 (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 2025 (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 2026 (('bcsel', ('ine', a, 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 2027 (('bcsel', ('ine', a, 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)), 2028 2029 (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)), 2030 (('bcsel', ('ine', a, -1), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 2031 2032 (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), -1), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2033 (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2034 (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2035 (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2036 (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), ('ifind_msb', a)), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2037 (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2038 (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), ('ifind_msb', a), ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2039 (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2040 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2041 (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2042 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2043 (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2044 2045 (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), -1), ('ifind_msb', a), '!options->lower_ifind_msb'), 2046 (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'), 2047 (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'), 2048 (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2049 (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', 
('ifind_msb_rev', a))), ('ifind_msb_rev', a)), ('ifind_msb', a), '!options->lower_ifind_msb'), 2050 (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'), 2051 (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), ('ifind_msb_rev', a), ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'), 2052 (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2053 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'), 2054 (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2055 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'), 2056 (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2057 2058 # Clear the LSB 2059 (('iand', a, ('inot', ('ishl', 1, ('find_lsb', a)))), ('iand', a, ('inot', ('ineg', a)))), 2060 2061 # This is safe. Both ufind_msb_rev and bitfield_reverse can only have 2062 # 32-bit sources, so the transformation can only generate correct NIR. 2063 (('find_lsb', ('bitfield_reverse', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2064 (('ufind_msb_rev', ('bitfield_reverse', a)), ('find_lsb', a), '!options->lower_find_lsb'), 2065 2066 (('ifind_msb', ('f2i32(is_used_once)', a)), ('ufind_msb', ('f2i32', ('fabs', a)))), 2067 (('ifind_msb', ('extract_u8', a, b)), ('ufind_msb', ('extract_u8', a, b))), 2068 (('ifind_msb', ('extract_u16', a, b)), ('ufind_msb', ('extract_u16', a, b))), 2069 (('ifind_msb', ('imax', a, 1)), ('ufind_msb', ('imax', a, 1))), 2070 2071 (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 2072 (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 2073 (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 2074 (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 2075 (('fabs', ('bcsel(is_used_once)', b, ('fneg', a), a)), ('fabs', a)), 2076 (('fabs', ('bcsel(is_used_once)', b, a, ('fneg', a))), ('fabs', a)), 2077 (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)), 2078 2079 (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)), 2080 (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)), 2081 2082 # Misc. 
lowering 2083 (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'), 2084 (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'), 2085 (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'), 2086 (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'), 2087 2088 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2089 ('bcsel', ('ult', 31, 'bits'), 'insert', 2090 ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')), 2091 'options->lower_bitfield_insert && options->has_bfm && options->has_bfi'), 2092 (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2093 (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2094 (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2095 (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2096 (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2097 (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2098 (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2099 (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2100 2101 (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'), 2102 (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'), 2103 2104 (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 2105 'options->lower_uadd_sat || (options->lower_int64_options & (nir_lower_iadd64 | nir_lower_uadd_sat64)) != 0'), 2106 (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'), 2107 (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'), 2108 (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'), 2109 2110 # int64_t sum = a + b; 2111 # 2112 # if (a < 0 && b < 0 && a < sum) 2113 # sum = INT64_MIN; 2114 # } else if (a >= 0 && b >= 0 && sum < a) 2115 # sum = INT64_MAX; 2116 # } 2117 # 2118 # A couple optimizations are applied. 2119 # 2120 # 1. a < sum => sum >= 0. This replacement works because it is known that 2121 # a < 0 and b < 0, so sum should also be < 0 unless there was 2122 # underflow. 2123 # 2124 # 2. sum < a => sum < 0. This replacement works because it is known that 2125 # a >= 0 and b >= 0, so sum should also be >= 0 unless there was 2126 # overflow. 2127 # 2128 # 3. Invert the second if-condition and swap the order of parameters for 2129 # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >= 2130 # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0) 2131 # 2132 # On Intel Gen11, this saves ~11 instructions. 
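# Putting the three rewrites above together, the replacement below behaves
# roughly like this C sketch (sum is a + b computed once):
#
#    if (a < 0 && b < 0 && sum >= 0)
#       sum = INT64_MIN;         /* negative + negative wrapped around */
#    else if (a < 0 || b < 0 || sum >= 0)
#       /* keep sum */;
#    else
#       sum = INT64_MAX;         /* non-negative + non-negative wrapped */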
2133 (('iadd_sat@64', a, b), ('bcsel', 2134 ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 2135 0x8000000000000000, 2136 ('bcsel', 2137 ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 2138 ('iadd', a, b), 2139 0x7fffffffffffffff)), 2140 '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 2141 2142 # int64_t sum = a - b; 2143 # 2144 # if (a < 0 && b >= 0 && a < sum) 2145 # sum = INT64_MIN; 2146 # } else if (a >= 0 && b < 0 && a >= sum) 2147 # sum = INT64_MAX; 2148 # } 2149 # 2150 # Optimizations similar to the iadd_sat case are applied here. 2151 (('isub_sat@64', a, b), ('bcsel', 2152 ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 2153 0x8000000000000000, 2154 ('bcsel', 2155 ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 2156 ('isub', a, b), 2157 0x7fffffffffffffff)), 2158 '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 2159 2160 # These are done here instead of in the backend because the int64 lowering 2161 # pass will make a mess of the patterns. The first patterns are 2162 # conditioned on nir_lower_minmax64 because it was not clear that it was 2163 # always an improvement on platforms that have real int64 support. No 2164 # shaders in shader-db hit this, so it was hard to say one way or the 2165 # other. 2166 (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2167 (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2168 (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2169 (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2170 (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2171 (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2172 2173 (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2174 (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2175 # 0u < uint(a) <=> uint(a) != 0u 2176 (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2177 2178 # Alternative lowering that doesn't rely on bfi. 2179 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2180 ('bcsel', ('ult', 31, 'bits'), 2181 'insert', 2182 (('ior', 2183 ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))), 2184 ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))), 2185 'options->lower_bitfield_insert && (!options->has_bfm || (!options->has_bfi && !options->has_bitfield_select))'), 2186 2187 # Alternative lowering that uses bitfield_select. 
2188 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2189 ('bcsel', ('ult', 31, 'bits'), 'insert', 2190 ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')), 2191 'options->lower_bitfield_insert && options->has_bfm && options->has_bitfield_select'), 2192 2193 (('ibitfield_extract', 'value', 'offset', 'bits'), 2194 ('bcsel', ('ult', 31, 'bits'), 'value', 2195 ('ibfe', 'value', 'offset', 'bits')), 2196 'options->lower_bitfield_extract && options->has_bfe'), 2197 2198 (('ubitfield_extract', 'value', 'offset', 'bits'), 2199 ('bcsel', ('ult', 31, 'bits'), 'value', 2200 ('ubfe', 'value', 'offset', 'bits')), 2201 'options->lower_bitfield_extract && options->has_bfe'), 2202 2203 # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0. 2204 (('bitfield_select', a, b, 0), ('iand', a, b)), 2205 (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)), 2206 2207 # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits' 2208 (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')), 2209 (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')), 2210 (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')), 2211 (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')), 2212 (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')), 2213 (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')), 2214 2215 # Optimizations for ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f))) and such 2216 (('ult', a, ('umin', ('iand', a, b), c)), False), 2217 (('ult', 31, ('umin', '#bits(is_ult_32)', a)), False), 2218 (('ubfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))), 2219 ('ubfe', 'value', 'offset', 'width')), 2220 (('ibfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))), 2221 ('ibfe', 'value', 'offset', 'width')), 2222 (('bfm', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset')))), 'offset'), 2223 ('bfm', 'width', 'offset')), 2224 2225 # open-coded BFM 2226 (('iadd@32', ('ishl', 1, a), -1), ('bfm', a, 0), 'options->has_bfm'), 2227 (('ishl', ('bfm', a, 0), b), ('bfm', a, b)), 2228 2229 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 2230 # 2231 # If bits is zero, the result will be zero. 2232 # 2233 # These patterns prevent other patterns from generating invalid results 2234 # when count is zero. 
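# For example, the ubfe(a, 0, #b) rewrite a few lines below expands to
# iand(a, ushr(0xffffffff, ineg(b))); with b == 0 that mask would be
# 0xffffffff and the result would be 'a' rather than 0, so the bits == 0 case
# has to be folded away first.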
2235 (('ubfe', a, b, 0), 0), 2236 (('ibfe', a, b, 0), 0), 2237 2238 (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))), 2239 2240 (('b2i32', ('ine', ('ubfe', a, b, 1), 0)), ('ubfe', a, b, 1)), 2241 (('b2i32', ('ine', ('ibfe', a, b, 1), 0)), ('ubfe', a, b, 1)), # ubfe in the replacement is correct 2242 (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2243 (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2244 (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2245 (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2246 2247 (('ibitfield_extract', 'value', 'offset', 'bits'), 2248 ('bcsel', ('ieq', 0, 'bits'), 2249 0, 2250 ('ishr', 2251 ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')), 2252 ('isub', 32, 'bits'))), 2253 'options->lower_bitfield_extract && !options->has_bfe'), 2254 2255 (('ubitfield_extract', 'value', 'offset', 'bits'), 2256 ('iand', 2257 ('ushr', 'value', 'offset'), 2258 ('bcsel', ('ieq', 'bits', 32), 2259 0xffffffff, 2260 ('isub', ('ishl', 1, 'bits'), 1))), 2261 'options->lower_bitfield_extract && !options->has_bfe'), 2262 2263 (('ifind_msb', 'value'), 2264 ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')), 2265 'options->lower_ifind_msb && !options->has_find_msb_rev && !options->has_uclz'), 2266 2267 (('ifind_msb', 'value'), 2268 ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0), 2269 ('isub', 31, ('ifind_msb_rev', 'value')), 2270 ('ifind_msb_rev', 'value')), 2271 'options->lower_ifind_msb && options->has_find_msb_rev'), 2272 2273 # uclz of an absolute value source almost always does the right thing. 2274 # There are a couple problem values: 2275 # 2276 # * 0x80000000. Since abs(0x80000000) == 0x80000000, uclz returns 0. 2277 # However, findMSB(int(0x80000000)) == 30. 2278 # 2279 # * 0xffffffff. Since abs(0xffffffff) == 1, uclz returns 31. Section 8.8 2280 # (Integer Functions) of the GLSL 4.50 spec says: 2281 # 2282 # For a value of zero or negative one, -1 will be returned. 2283 # 2284 # * Negative powers of two. uclz(abs(-(1<<x))) returns x, but 2285 # findMSB(-(1<<x)) should return x-1. 2286 # 2287 # For all negative number cases, including 0x80000000 and 0xffffffff, the 2288 # correct value is obtained from uclz if instead of negating the (already 2289 # negative) value the logical-not is used. A conditional logical-not can 2290 # be achieved by (x ^ (x >> 31)). 
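# For example (illustrative), with the uclz-based replacement below and
# value == -8: value >> 31 == -1, so value ^ (value >> 31) == ~value == 7,
# uclz(7) == 29 and 31 - 29 == 2 == findMSB(-8).  For value == 8 the shift
# contributes 0, the xor is a no-op, and 31 - uclz(8) == 31 - 28 == 3.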
2291 (('ifind_msb', 'value'), 2292 ('isub', 31, ('uclz', ('ixor', 'value', ('ishr', 'value', 31)))), 2293 'options->lower_ifind_msb && options->has_uclz'), 2294 2295 (('ufind_msb', 'value@32'), 2296 ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0), 2297 ('isub', 31, ('ufind_msb_rev', 'value')), 2298 ('ufind_msb_rev', 'value')), 2299 'options->lower_ufind_msb && options->has_find_msb_rev'), 2300 2301 (('ufind_msb', 'value@32'), 2302 ('isub', 31, ('uclz', 'value')), 2303 'options->lower_ufind_msb && options->has_uclz'), 2304 2305 (('uclz', a), ('umin', 32, ('ufind_msb_rev', a)), '!options->has_uclz && options->has_find_msb_rev'), 2306 2307 (('find_lsb', 'value@64'), 2308 ('ufind_msb', ('iand', 'value', ('ineg', 'value'))), 2309 'options->lower_find_lsb'), 2310 2311 (('find_lsb', 'value'), 2312 ('ufind_msb', ('u2u32', ('iand', 'value', ('ineg', 'value')))), 2313 'options->lower_find_lsb'), 2314 2315 (('extract_i8', a, 'b@32'), 2316 ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24), 2317 'options->lower_extract_byte'), 2318 2319 (('extract_u8', a, 'b@32'), 2320 ('iand', ('ushr', a, ('imul', b, 8)), 0xff), 2321 'options->lower_extract_byte'), 2322 2323 (('extract_i16', a, 'b@32'), 2324 ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16), 2325 'options->lower_extract_word'), 2326 2327 (('extract_u16', a, 'b@32'), 2328 ('iand', ('ushr', a, ('imul', b, 16)), 0xffff), 2329 'options->lower_extract_word'), 2330 2331 (('pack_unorm_2x16', 'v'), 2332 ('pack_uvec2_to_uint', 2333 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))), 2334 'options->lower_pack_unorm_2x16'), 2335 2336 (('pack_unorm_4x8', 'v'), 2337 ('pack_uvec4_to_uint', 2338 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), 2339 'options->lower_pack_unorm_4x8 && !options->has_pack_32_4x8'), 2340 2341 (('pack_unorm_4x8', 'v'), 2342 ('pack_32_4x8', 2343 ('f2u8', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), 2344 'options->lower_pack_unorm_4x8 && options->has_pack_32_4x8'), 2345 2346 (('pack_snorm_2x16', 'v'), 2347 ('pack_uvec2_to_uint', 2348 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))), 2349 'options->lower_pack_snorm_2x16'), 2350 2351 (('pack_snorm_4x8', 'v'), 2352 ('pack_uvec4_to_uint', 2353 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), 2354 'options->lower_pack_snorm_4x8 && !options->has_pack_32_4x8'), 2355 2356 (('pack_snorm_4x8', 'v'), 2357 ('pack_32_4x8', 2358 ('f2i8', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), 2359 'options->lower_pack_snorm_4x8 && options->has_pack_32_4x8'), 2360 2361 (('unpack_unorm_2x16', 'v'), 2362 ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0), 2363 ('extract_u16', 'v', 1))), 2364 65535.0), 2365 'options->lower_unpack_unorm_2x16'), 2366 2367 (('unpack_unorm_4x8', 'v'), 2368 ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0), 2369 ('extract_u8', 'v', 1), 2370 ('extract_u8', 'v', 2), 2371 ('extract_u8', 'v', 3))), 2372 255.0), 2373 'options->lower_unpack_unorm_4x8'), 2374 2375 (('unpack_snorm_2x16', 'v'), 2376 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0), 2377 ('extract_i16', 'v', 1))), 2378 32767.0))), 2379 'options->lower_unpack_snorm_2x16'), 2380 2381 (('unpack_snorm_4x8', 'v'), 2382 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0), 2383 ('extract_i8', 'v', 1), 2384 ('extract_i8', 'v', 2), 2385 ('extract_i8', 'v', 3))), 2386 127.0))), 2387 'options->lower_unpack_snorm_4x8'), 2388 2389 (('pack_half_2x16_split', 
'a@32', 'b@32'),
2390 ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
2391 'options->lower_pack_split'),
2392
2393 (('unpack_half_2x16_split_x', 'a@32'),
2394 ('f2f32', ('u2u16', a)),
2395 'options->lower_pack_split && !nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2396
2397 (('unpack_half_2x16_split_x', 'a@32'),
2398 ('f2f32', ('fmul', 1.0, ('u2u16', a))),
2399 'options->lower_pack_split && nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2400
2401 (('unpack_half_2x16_split_y', 'a@32'),
2402 ('f2f32', ('u2u16', ('ushr', a, 16))),
2403 'options->lower_pack_split && !nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2404
2405 (('unpack_half_2x16_split_y', 'a@32'),
2406 ('f2f32', ('fmul', 1.0, ('u2u16', ('ushr', a, 16)))),
2407 'options->lower_pack_split && nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'),
2408
2409 (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
2410 (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),
2411 (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),
2412 # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0
2413 # Mark the new comparisons precise to prevent them being changed to 'a !=
2414 # 0' or 'a == 0'.
2415 (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'),
2416 (('fsign', 'a@64'), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_doubles_options & nir_lower_dsign'),
2417
2418 # Address/offset calculations:
2419 # Drivers supporting imul24 should use the nir_lower_amul() pass; this
2420 # rule converts everyone else to imul:
2421 (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
2422
2423 (('umul24', a, b),
2424 ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
2425 '!options->has_umul24'),
2426 (('umad24', a, b, c),
2427 ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
2428 '!options->has_umad24'),
2429
2430 # Relaxed 24bit ops
2431 (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'),
2432 (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'),
2433 (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'),
2434 (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'),
2435 (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'),
2436 (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'),
2437
2438 (('imad24_ir3', a, b, 0), ('imul24', a, b)),
2439 (('imad24_ir3', a, 0, c), (c)),
2440 (('imad24_ir3', a, 1, c), ('iadd', a, c)),
2441
2442 # if first two srcs are const, crack apart the imad so constant folding
2443 # can clean up the imul:
2444 # TODO ffma should probably get a similar rule:
2445 (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
2446
2447 # These will turn 24b address/offset calc back into 32b shifts, but
2448 # it should be safe to get back some of the bits of precision that we
2449 # already decided were not necessary:
2450 (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
2451 (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
2452 (('imul24', a, 0), (0)),
2453])
2454
2455for bit_size in [8, 16, 32, 64]:
2456 cond = '!options->lower_uadd_sat'
2457 if bit_size == 64:
2458 cond += ' && !(options->lower_int64_options & (nir_lower_iadd64 | nir_lower_uadd_sat64))'
2459 add = 'iadd@' + str(bit_size)
2460
2461 optimizations += [
2462 (('bcsel', ('ult', ('iadd', a, b), a), -1, (add, a, b)), ('uadd_sat', a, b), cond),
2463 (('bcsel', ('uge', ('iadd', a, b), a), (add, a, b), -1), ('uadd_sat', a, b), cond),
2464 (('bcsel', ('ieq', ('uadd_carry', a, b), 0), (add, a, b), -1), ('uadd_sat', a, b), cond),
2465 (('bcsel', ('ine', ('uadd_carry', a, b), 0), -1, (add, a, b)), ('uadd_sat', a, b), cond),
2466 ]
2467
2468for bit_size in [8, 16, 32, 64]:
2469 cond = '!options->lower_usub_sat'
2470 if bit_size == 64:
2471 cond += ' && !(options->lower_int64_options & nir_lower_usub_sat64)'
2472 add = 'iadd@' + str(bit_size)
2473
2474 optimizations += [
2475 (('bcsel', ('ult', a, b), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond),
2476 (('bcsel', ('uge', a, b), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond),
2477 (('bcsel', ('ieq', ('usub_borrow', a, b), 0), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond),
2478 (('bcsel', ('ine', ('usub_borrow', a, b), 0), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond),
2479 ]
2480
2481# bit_size dependent lowerings
2482for bit_size in [8, 16, 32, 64]:
2483 # convenience constants
2484 intmax = (1 << (bit_size - 1)) - 1
2485 intmin = 1 << (bit_size - 1)
2486
2487 optimizations += [
2488 (('iadd_sat@' + str(bit_size), a, b),
2489 ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
2490 ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'),
2491 (('isub_sat@' + str(bit_size), a, b),
2492 ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
2493 ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'),
2494 ]
2495
2496invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])
2497
2498for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
2499 optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
2500 ('iand', (invert[left], a, b), (invert[right], c, d))))
2501 optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
2502 ('ior', (invert[left], a, b), (invert[right], c, d))))
2503
2504# Optimize x2yN(b2x(x)) -> b2y
2505for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
2506 if x != 'f' and y != 'f' and x != y:
2507 continue
2508
2509 b2x = 'b2f' if x == 'f' else 'b2i'
2510 b2y = 'b2f' if y == 'f' else 'b2i'
2511 x2yN = '{}2{}'.format(x, y)
2512 optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
2513
2514# Optimize away x2xN(a@N)
2515for t in ['int', 'uint', 'float', 'bool']:
2516 for N in type_sizes(t):
2517 x2xN = '{0}2{0}{1}'.format(t[0], N)
2518 aN = 'a@{0}'.format(N)
2519 optimizations.append(((x2xN, aN), a))
2520
2521# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
2522# In particular, we can optimize away everything except upcast of downcast and
2523# upcasts where the type differs from the other cast
2524for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
2525 if N < M:
2526 # The outer cast is a down-cast. It doesn't matter what the size of the
2527 # argument of the inner cast is because we'll never be in the upcast
2528 # of downcast case. Regardless of types, we'll always end up with y2yN
2529 # in the end.
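      # For instance (N = 8, M = 16): u2u8(i2i16(a@32)) and i2i8(a@32) both
      # keep only the low 8 bits of a, so the cast pair collapses to a single
      # i2i8 regardless of the signedness of either cast.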
2530 for x, y in itertools.product(['i', 'u'], ['i', 'u']): 2531 x2xN = '{0}2{0}{1}'.format(x, N) 2532 y2yM = '{0}2{0}{1}'.format(y, M) 2533 y2yN = '{0}2{0}{1}'.format(y, N) 2534 optimizations.append(((x2xN, (y2yM, a)), (y2yN, a))) 2535 elif N > M: 2536 # If the outer cast is an up-cast, we have to be more careful about the 2537 # size of the argument of the inner cast and with types. In this case, 2538 # the type is always the type of type up-cast which is given by the 2539 # outer cast. 2540 for P in type_sizes('uint'): 2541 # We can't optimize away up-cast of down-cast. 2542 if M < P: 2543 continue 2544 2545 # Because we're doing down-cast of down-cast, the types always have 2546 # to match between the two casts 2547 for x in ['i', 'u']: 2548 x2xN = '{0}2{0}{1}'.format(x, N) 2549 x2xM = '{0}2{0}{1}'.format(x, M) 2550 aP = 'a@{0}'.format(P) 2551 optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a))) 2552 else: 2553 # The N == M case is handled by other optimizations 2554 pass 2555 2556# Downcast operations should be able to see through pack 2557for t in ['i', 'u']: 2558 for N in [8, 16, 32]: 2559 x2xN = '{0}2{0}{1}'.format(t, N) 2560 optimizations += [ 2561 ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)), 2562 ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)), 2563 ] 2564 2565# Optimize comparisons with up-casts 2566for t in ['int', 'uint', 'float']: 2567 for N, M in itertools.product(type_sizes(t), repeat=2): 2568 if N == 1 or N >= M: 2569 continue 2570 2571 cond = 'true' 2572 if N == 8: 2573 cond = 'options->support_8bit_alu' 2574 elif N == 16: 2575 cond = 'options->support_16bit_alu' 2576 x2xM = '{0}2{0}{1}'.format(t[0], M) 2577 x2xN = '{0}2{0}{1}'.format(t[0], N) 2578 aN = 'a@' + str(N) 2579 bN = 'b@' + str(N) 2580 xeq = 'feq' if t == 'float' else 'ieq' 2581 xne = 'fneu' if t == 'float' else 'ine' 2582 xge = '{0}ge'.format(t[0]) 2583 xlt = '{0}lt'.format(t[0]) 2584 2585 # Up-casts are lossless so for correctly signed comparisons of 2586 # up-casted values we can do the comparison at the largest of the two 2587 # original sizes and drop one or both of the casts. (We have 2588 # optimizations to drop the no-op casts which this may generate.) 2589 for P in type_sizes(t): 2590 if P == 1 or P > N: 2591 continue 2592 2593 bP = 'b@' + str(P) 2594 optimizations += [ 2595 ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond), 2596 ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond), 2597 ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond), 2598 ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond), 2599 ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond), 2600 ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond), 2601 ] 2602 2603 # The next bit doesn't work on floats because the range checks would 2604 # get way too complicated. 2605 if t in ['int', 'uint']: 2606 if t == 'int': 2607 xN_min = -(1 << (N - 1)) 2608 xN_max = (1 << (N - 1)) - 1 2609 elif t == 'uint': 2610 xN_min = 0 2611 xN_max = (1 << N) - 1 2612 else: 2613 assert False 2614 2615 # If we're up-casting and comparing to a constant, we can unfold 2616 # the comparison into a comparison with the shrunk down constant 2617 # and a check that the constant fits in the smaller bit size. 
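# As an illustration (uint, N = 8, M = 32), the equality entry below turns
#
#    ieq(u2u32(a@8), #b)
#
# into iand(ieq(a, u2u8(b)), ieq(u2u32(u2u8(b)), b)); the second operand is
# the "b fits in 8 bits" check and folds to a constant once b is known.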
2618 optimizations += [
2619 ((xeq, (x2xM, aN), '#b'),
2620 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
2621 ((xne, (x2xM, aN), '#b'),
2622 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
2623 ((xlt, (x2xM, aN), '#b'),
2624 ('iand', (xlt, xN_min, b),
2625 ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
2626 ((xlt, '#a', (x2xM, bN)),
2627 ('iand', (xlt, a, xN_max),
2628 ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
2629 ((xge, (x2xM, aN), '#b'),
2630 ('iand', (xge, xN_max, b),
2631 ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
2632 ((xge, '#a', (x2xM, bN)),
2633 ('iand', (xge, a, xN_min),
2634 ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
2635 ]
2636
2637# Convert masking followed by signed downcast to just unsigned downcast
2638optimizations += [
2639 (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),
2640 (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),
2641 (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
2642 (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),
2643 (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
2644 (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
2645]
2646
2647# Some operations such as iadd have the property that the bottom N bits of the
2648# output only depend on the bottom N bits of each of the inputs so we can
2649# remove casts
2650for N in [16, 32]:
2651 for M in [8, 16]:
2652 if M >= N:
2653 continue
2654
2655 aN = 'a@' + str(N)
2656 u2uM = 'u2u{0}'.format(M)
2657 i2iM = 'i2i{0}'.format(M)
2658
2659 for x in ['u', 'i']:
2660 x2xN = '{0}2{0}{1}'.format(x, N)
2661 extract_xM = 'extract_{0}{1}'.format(x, M)
2662
2663 x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M)
2664 extract_xM_M_bits = \
2665 '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M)
2666 optimizations += [
2667 ((x2xN_M_bits, (u2uM, aN)), a),
2668 ((extract_xM_M_bits, aN, 0), a),
2669 ]
2670
2671 bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M)
2672 optimizations += [
2673 ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)),
2674 ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)),
2675 ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)),
2676 ]
2677
2678 for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:
2679 op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M)
2680 optimizations += [
2681 ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)),
2682 ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)),
2683 ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)),
2684 ]
2685
2686def fexp2i(exp, bits):
2687 # Generate an expression which constructs value 2.0^exp or 0.0.
2688 #
2689 # We assume that exp is already in a valid range:
2690 #
2691 # * [-15, 15] for 16-bit float
2692 # * [-127, 127] for 32-bit float
2693 # * [-1023, 1023] for 64-bit float
2694 #
2695 # If exp is the lowest value in the valid range, a value of 0.0 is
2696 # constructed. Otherwise, the value 2.0^exp is constructed.
2697 if bits == 16:
2698 return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
2699 elif bits == 32:
2700 return ('ishl', ('iadd', exp, 127), 23)
2701 elif bits == 64:
2702 return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
2703 else:
2704 assert False
2705
2706def ldexp(f, exp, bits):
2707 # The maximum possible range for a normal exponent is [-126, 127] and,
2708 # throwing in denormals, you get a maximum range of [-149, 127]. This
2709 # means that we can potentially have a swing of +-276.
If you start with 2710 # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush 2711 # all the way to zero. The GLSL spec only requires that we handle a subset 2712 # of this range. From version 4.60 of the spec: 2713 # 2714 # "If exp is greater than +128 (single-precision) or +1024 2715 # (double-precision), the value returned is undefined. If exp is less 2716 # than -126 (single-precision) or -1022 (double-precision), the value 2717 # returned may be flushed to zero. Additionally, splitting the value 2718 # into a significand and exponent using frexp() and then reconstructing 2719 # a floating-point value using ldexp() should yield the original input 2720 # for zero and all finite non-denormalized values." 2721 # 2722 # The SPIR-V spec has similar language. 2723 # 2724 # In order to handle the maximum value +128 using the fexp2i() helper 2725 # above, we have to split the exponent in half and do two multiply 2726 # operations. 2727 # 2728 # First, we clamp exp to a reasonable range. Specifically, we clamp to 2729 # twice the full range that is valid for the fexp2i() function above. If 2730 # exp/2 is the bottom value of that range, the fexp2i() expression will 2731 # yield 0.0f which, when multiplied by f, will flush it to zero which is 2732 # allowed by the GLSL and SPIR-V specs for low exponent values. If the 2733 # value is clamped from above, then it must have been above the supported 2734 # range of the GLSL built-in and therefore any return value is acceptable. 2735 if bits == 16: 2736 exp = ('imin', ('imax', exp, -30), 30) 2737 elif bits == 32: 2738 exp = ('imin', ('imax', exp, -254), 254) 2739 elif bits == 64: 2740 exp = ('imin', ('imax', exp, -2046), 2046) 2741 else: 2742 assert False 2743 2744 # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2. 2745 # (We use ishr which isn't the same for -1, but the -1 case still works 2746 # since we use exp-exp/2 as the second exponent.) While the spec 2747 # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't 2748 # work with denormals and doesn't allow for the full swing in exponents 2749 # that you can get with normalized values. Instead, we create two powers 2750 # of two and multiply by them each in turn. That way the effective range 2751 # of our exponent is doubled. 
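   # For example (illustrative), exp == 128 at 32 bits splits into
   # 2.0^64 * 2.0^64: each factor is representable even though a single
   # 2.0^128 constant is not, so e.g. ldexp(0.5, 128) still yields 2.0^127.
   # Likewise exp == -149 splits into 2.0^-75 * 2.0^-74, both of which
   # fexp2i() can build.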
2752 pow2_1 = fexp2i(('ishr', exp, 1), bits) 2753 pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits) 2754 return ('fmul', ('fmul', f, pow2_1), pow2_2) 2755 2756optimizations += [ 2757 (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'), 2758 (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'), 2759 (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'), 2760] 2761 2762# XCOM 2 (OpenGL) open-codes bitfieldReverse() 2763def bitfield_reverse_xcom2(u): 2764 step1 = ('iadd', ('ishl', u, 16), ('ushr', u, 16)) 2765 step2 = ('iadd', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555)) 2766 step3 = ('iadd', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333)) 2767 step4 = ('iadd', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f)) 2768 step5 = ('iadd(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff)) 2769 2770 return step5 2771 2772# Unreal Engine 4 demo applications open-codes bitfieldReverse() 2773def bitfield_reverse_ue4(u): 2774 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2775 step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8)) 2776 step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4)) 2777 step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2)) 2778 step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1)) 2779 2780 return step5 2781 2782# Cyberpunk 2077 open-codes bitfieldReverse() 2783def bitfield_reverse_cp2077(u): 2784 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2785 step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555)) 2786 step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333)) 2787 step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f)) 2788 step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff)) 2789 2790 return step5 2791 2792optimizations += [(bitfield_reverse_xcom2('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2793optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2794optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2795 2796# VKD3D-Proton DXBC f32 to f16 conversion implements a float conversion using PackHalf2x16. 2797# Because the spec does not specify a rounding mode or behaviour regarding infinity, 2798# it emits a sequence to ensure D3D-like behaviour for infinity. 2799# When we know the current backend already behaves like we need, we can eliminate the extra sequence. 2800# 2801# Input is f32, output is u32 that has the f16 packed into its low bits. 
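# The open-coded fixup recognized below is, roughly (abs_a is |a|, compared
# as raw f32 bits):
#
#    h = pack_half_2x16_rtz_split(a, 0)
#    if (abs_a != 0x7f800000 && (h & 0x7fff) == 0x7c00)
#       h = h - 1;   /* finite f32 overflowed to f16 inf: step back to the
#                       largest-magnitude finite half (0x7bff / 0xfbff) */
#
# i.e. a finite f32 that would land on f16 infinity is clamped back into the
# finite f16 range.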
2802def vkd3d_proton_packed_f2f16_rtz_lo(a, abs_a): 2803 packed_half = ('pack_half_2x16_rtz_split', a, 0) 2804 packed_half_minus1 = ('iadd', packed_half, 0xffffffff) 2805 f32_was_not_inf = ('ine', abs_a, 0x7f800000) 2806 f16_is_now_inf = ('ieq', ('iand', packed_half, 0x7fff), 0x7c00) 2807 return ('bcsel', ('iand', f32_was_not_inf, f16_is_now_inf), packed_half_minus1, packed_half) 2808 2809optimizations += [ 2810 (vkd3d_proton_packed_f2f16_rtz_lo('x', ('fabs', 'x')), ('pack_half_2x16_rtz_split', 'x', 0)), 2811 (vkd3d_proton_packed_f2f16_rtz_lo('x(is_not_negative)', 'x'), ('pack_half_2x16_rtz_split', 'x', 0)), 2812 (vkd3d_proton_packed_f2f16_rtz_lo(('fneg', 'x'), ('fabs', 'x')), ('pack_half_2x16_rtz_split', ('fneg', 'x'), 0)), 2813] 2814 2815def vkd3d_proton_msad(): 2816 pattern = None 2817 for i in range(4): 2818 ref = ('extract_u8', 'a@32', i) 2819 src = ('extract_u8', 'b@32', i) 2820 sad = ('iabs', ('iadd', ref, ('ineg', src))) 2821 msad = ('bcsel', ('ieq', ref, 0), 0, sad) 2822 if pattern == None: 2823 pattern = msad 2824 else: 2825 pattern = ('iadd', pattern, msad) 2826 pattern = (pattern[0] + '(many-comm-expr)', *pattern[1:]) 2827 return pattern 2828 2829optimizations += [ 2830 (vkd3d_proton_msad(), ('msad_4x8', a, b, 0), 'options->has_msad'), 2831 (('iadd', ('msad_4x8', a, b, 0), c), ('msad_4x8', a, b, c)), 2832] 2833 2834 2835# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)" 2836# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)" 2837for ncomp in [2, 3, 4, 8, 16]: 2838 optimizations += [ 2839 (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)), 2840 (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)), 2841 (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)), 2842 (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)), 2843 ] 2844 2845# For any float comparison operation, "cmp", if you have "a == a && a cmp b" 2846# then the "a == a" is redundant because it's equivalent to "a is not NaN" 2847# and, if a is a NaN then the second comparison will fail anyway. 2848for op in ['flt', 'fge', 'feq']: 2849 optimizations += [ 2850 (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)), 2851 (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)), 2852 ] 2853 2854# Add optimizations to handle the case where the result of a ternary is 2855# compared to a constant. This way we can take things like 2856# 2857# (a ? 0 : 1) > 0 2858# 2859# and turn it into 2860# 2861# a ? (0 > 0) : (1 > 0) 2862# 2863# which constant folding will eat for lunch. The resulting ternary will 2864# further get cleaned up by the boolean reductions above and we will be 2865# left with just the original variable "a". 2866for op in ['feq', 'fneu', 'ieq', 'ine']: 2867 optimizations += [ 2868 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 2869 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 2870 ] 2871 2872for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']: 2873 optimizations += [ 2874 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 2875 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 2876 ((op, '#d', ('bcsel', a, '#b', '#c')), 2877 ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))), 2878 ] 2879 2880 2881# For example, this converts things like 2882# 2883# 1 + mix(0, a - 1, condition) 2884# 2885# into 2886# 2887# mix(1, (a-1)+1, condition) 2888# 2889# Other optimizations will rearrange the constants. 
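# For iadd, for instance, the loop below generates
#
#    (('iadd', ('bcsel(is_used_once)', a, '#b', c), '#d'),
#     ('bcsel', a, ('iadd', b, d), ('iadd', c, d)))
#
# where iadd(b, d) is a constant expression that folds away, so only the
# non-constant arm keeps a real add.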
2890for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']: 2891 optimizations += [ 2892 ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d))) 2893 ] 2894 2895# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives 2896# states: 2897# 2898# If neither layout qualifier is specified, derivatives in compute shaders 2899# return zero, which is consistent with the handling of built-in texture 2900# functions like texture() in GLSL 4.50 compute shaders. 2901for op in ['fddx', 'fddx_fine', 'fddx_coarse', 2902 'fddy', 'fddy_fine', 'fddy_coarse']: 2903 optimizations += [ 2904 ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->derivative_group == DERIVATIVE_GROUP_NONE') 2905] 2906 2907# Some optimizations for ir3-specific instructions. 2908optimizations += [ 2909 # 'al * bl': If either 'al' or 'bl' is zero, return zero. 2910 (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)), 2911 # '(al * bh) << 16 + c': If either 'al' or 'bh' is zero, return 'c'. 2912 (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')), 2913 (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')), 2914] 2915 2916# These kinds of sequences can occur after nir_opt_peephole_select. 2917# 2918# NOTE: fadd is not handled here because that gets in the way of ffma 2919# generation in the i965 driver. Instead, fadd and ffma are handled in 2920# late_optimizations. 2921 2922for op in ['flrp']: 2923 optimizations += [ 2924 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2925 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2926 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2927 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2928 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)), 2929 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)), 2930 ] 2931 2932for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']: 2933 optimizations += [ 2934 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 2935 (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2936 (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2937 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 2938 ] 2939 2940for op in ['fpow']: 2941 optimizations += [ 2942 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2943 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2944 (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)), 2945 (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)), 2946 ] 2947 2948for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fsin_mdg', 'fcos_mdg', 'fsin_agx', 'fneg', 'fabs', 'fsign']: 2949 optimizations += [ 2950 (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))), 2951 ] 2952 2953for op in ['ineg', 'iabs', 'inot', 'isign']: 2954 optimizations += [ 2955 ((op, ('bcsel', c, '#a', '#b')), 
('bcsel', c, (op, a), (op, b))), 2956 ] 2957 2958optimizations.extend([ 2959 (('fisnormal', 'a@16'), ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)), 'options->lower_fisnormal'), 2960 (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal'), 2961 (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal') 2962 ]) 2963 2964 2965""" 2966 if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16) 2967 return (val & SIGN_BIT) /* +0.0 or -0.0 as appropriate */; 2968 else 2969 return f2f32(f2f16(val)); 2970""" 2971optimizations.extend([ 2972 (('fquantize2f16', 'a@32'), 2973 ('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)), 2974 ('iand', a, 1 << 31), 2975 ('!f2f32', ('!f2f16_rtne', a))), 2976 'options->lower_fquantize2f16') 2977 ]) 2978 2979for s in range(0, 31): 2980 mask = 0xffffffff << s 2981 2982 # bfi is ((mask & ...) | (~mask & ...)). Since the two sources of the ior 2983 # will never both have the same bits set, replacing the ior with an iadd 2984 # is safe (i.e., a carry out of a bit can never be generated). The iadd is 2985 # more likely to participate in other optimization patterns (e.g., iadd of 2986 # constant reassociation) 2987 optimizations.extend([ 2988 (('bfi', mask, a, '#b'), ('iadd', ('ishl', a, s), ('iand', b, ~mask)), 2989 'options->avoid_ternary_with_two_constants'), 2990 ]) 2991 2992# NaN propagation: Binary opcodes. If any operand is NaN, replace it with NaN. 2993# (unary opcodes with NaN are evaluated by nir_opt_constant_folding, not here) 2994for op in ['fadd', 'fdiv', 'fmod', 'fmul', 'fpow', 'frem', 'fsub']: 2995 optimizations += [((op, '#a(is_nan)', b), NAN)] 2996 optimizations += [((op, a, '#b(is_nan)'), NAN)] # some opcodes are not commutative 2997 2998# NaN propagation: Trinary opcodes. If any operand is NaN, replace it with NaN. 2999for op in ['ffma', 'flrp']: 3000 optimizations += [((op, '#a(is_nan)', b, c), NAN)] 3001 optimizations += [((op, a, '#b(is_nan)', c), NAN)] # some opcodes are not commutative 3002 optimizations += [((op, a, b, '#c(is_nan)'), NAN)] 3003 3004# NaN propagation: FP min/max. Pick the non-NaN operand. 3005for op in ['fmin', 'fmax']: 3006 optimizations += [((op, '#a(is_nan)', b), b)] # commutative 3007 3008# NaN propagation: ldexp is NaN if the first operand is NaN. 3009optimizations += [(('ldexp', '#a(is_nan)', b), NAN)] 3010 3011# NaN propagation: Dot opcodes. If any component is NaN, replace it with NaN. 3012for op in ['fdot2', 'fdot3', 'fdot4', 'fdot5', 'fdot8', 'fdot16']: 3013 optimizations += [((op, '#a(is_any_comp_nan)', b), NAN)] # commutative 3014 3015# NaN propagation: FP comparison opcodes except !=. Replace it with false. 3016for op in ['feq', 'fge', 'flt']: 3017 optimizations += [((op, '#a(is_nan)', b), False)] 3018 optimizations += [((op, a, '#b(is_nan)'), False)] # some opcodes are not commutative 3019 3020# NaN propagation: FP comparison opcodes using !=. Replace it with true. 3021# Operator != is the only opcode where a comparison with NaN returns true. 3022for op in ['fneu']: 3023 optimizations += [((op, '#a(is_nan)', b), True)] # commutative 3024 3025# NaN propagation: FP comparison opcodes except != returning FP 0 or 1. 3026for op in ['seq', 'sge', 'slt']: 3027 optimizations += [((op, '#a(is_nan)', b), 0.0)] 3028 optimizations += [((op, a, '#b(is_nan)'), 0.0)] # some opcodes are not commutative 3029 3030# NaN propagation: FP comparison opcodes using != returning FP 0 or 1. 
3031# Operator != is the only opcode where a comparison with NaN returns true. 3032optimizations += [(('sne', '#a(is_nan)', b), 1.0)] # commutative 3033 3034# This section contains optimizations to propagate downsizing conversions of 3035# constructed vectors into vectors of downsized components. Whether this is 3036# useful depends on the SIMD semantics of the backend. On a true SIMD machine, 3037# this reduces the register pressure of the vector itself and often enables the 3038# conversions to be eliminated via other algebraic rules or constant folding. 3039# In the worst case on a SIMD architecture, the propagated conversions may be 3040# revectorized via nir_opt_vectorize so instruction count is minimally 3041# impacted. 3042# 3043# On a machine with SIMD-within-a-register only, this actually 3044# counterintuitively hurts instruction count. These machines are the same that 3045# require vectorize_vec2_16bit, so we predicate the optimizations on that flag 3046# not being set. 3047# 3048# Finally for scalar architectures, there should be no difference in generated 3049# code since it all ends up scalarized at the end, but it might minimally help 3050# compile-times. 3051 3052for i in range(2, 4 + 1): 3053 for T in ('f', 'u', 'i'): 3054 vec_inst = ('vec' + str(i),) 3055 3056 indices = ['a', 'b', 'c', 'd'] 3057 suffix_in = tuple((indices[j] + '@32') for j in range(i)) 3058 3059 to_16 = '{}2{}16'.format(T, T) 3060 to_mp = '{}2{}mp'.format(T, T) 3061 3062 out_16 = tuple((to_16, indices[j]) for j in range(i)) 3063 out_mp = tuple((to_mp, indices[j]) for j in range(i)) 3064 3065 optimizations += [ 3066 ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'), 3067 ] 3068 # u2ump doesn't exist, because it's equal to i2imp 3069 if T in ['f', 'i']: 3070 optimizations += [ 3071 ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit') 3072 ] 3073 3074# This section contains "late" optimizations that should be run before 3075# creating ffmas and calling regular optimizations for the final time. 3076# Optimizations should go here if they help code generation and conflict 3077# with the regular optimizations. 
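# For example, the first few entries below push constants down
# multiplication/addition chains, which is roughly the inverse of the
# "propagate constants up ... chains" rules in the main list above; keeping
# them in a separate pass set avoids the two rewrites undoing each other.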
3078before_ffma_optimizations = [ 3079 # Propagate constants down multiplication chains 3080 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)), 3081 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)), 3082 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)), 3083 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)), 3084 3085 (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), 3086 (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), 3087 (('~fadd', ('fneg', a), a), 0.0), 3088 (('iadd', ('ineg', a), a), 0), 3089 (('iadd', ('ineg', a), ('iadd', a, b)), b), 3090 (('iadd', a, ('iadd', ('ineg', a), b)), b), 3091 (('~fadd', ('fneg', a), ('fadd', a, b)), b), 3092 (('~fadd', a, ('fadd', ('fneg', a), b)), b), 3093 3094 (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)), 3095 (('~flrp', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)), 3096 (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))), 3097] 3098 3099# This section contains "late" optimizations that should be run after the 3100# regular optimizations have finished. Optimizations should go here if 3101# they help code generation but do not necessarily produce code that is 3102# more easily optimizable. 3103late_optimizations = [ 3104 # The rearrangements are fine w.r.t. NaN. However, they produce incorrect 3105 # results if one operand is +Inf and the other is -Inf. 3106 # 3107 # 1. Inf + -Inf = NaN 3108 # 2. ∀x: x + NaN = NaN and x - NaN = NaN 3109 # 3. ∀x: x != NaN = true 3110 # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false 3111 # 3112 # a=Inf, b=-Inf a=-Inf, b=Inf a=NaN b=NaN 3113 # (a+b) < 0 false false false false 3114 # a < -b false false false false 3115 # -(a+b) < 0 false false false false 3116 # -a < b false false false false 3117 # (a+b) >= 0 false false false false 3118 # a >= -b true true false false 3119 # -(a+b) >= 0 false false false false 3120 # -a >= b true true false false 3121 # (a+b) == 0 false false false false 3122 # a == -b true true false false 3123 # (a+b) != 0 true true true true 3124 # a != -b false false true true 3125 (('flt', ('fadd(is_used_once)', a, b), 0.0), ('flt', a, ('fneg', b))), 3126 (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a), b)), 3127 (('flt', 0.0, ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a), b)), 3128 (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt', a, ('fneg', b))), 3129 (('~fge', ('fadd(is_used_once)', a, b), 0.0), ('fge', a, ('fneg', b))), 3130 (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a), b)), 3131 (('~fge', 0.0, ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a), b)), 3132 (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge', a, ('fneg', b))), 3133 (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))), 3134 (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))), 3135 3136 # If either source must be finite, then the original (a+b) cannot produce 3137 # NaN due to Inf-Inf. 
The patterns and the replacements produce the same 3138 # result if b is NaN. Therefore, the replacements are exact. 3139 (('fge', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fge', a, ('fneg', b))), 3140 (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a), b)), 3141 (('fge', 0.0, ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a), b)), 3142 (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge', a, ('fneg', b))), 3143 (('feq', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq', a, ('fneg', b))), 3144 (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))), 3145 3146 # This is how SpvOpFOrdNotEqual might be implemented. Replace it with 3147 # SpvOpLessOrGreater. 3148 *add_fabs_fneg((('iand', ('fneu', 'ma', 'mb'), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', 'ma', 'mb'), ('!flt', 'mb', 'ma'))), {'ma' : a, 'mb' : b}), 3149 (('iand', ('fneu', a, 0.0), ('feq', a, a)), ('!flt', 0.0, ('fabs', a))), 3150 3151 # This is how SpvOpFUnordEqual might be implemented. Replace it with 3152 # !SpvOpLessOrGreater. 3153 *add_fabs_fneg((('ior', ('feq', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', 'ma', 'mb'), ('!flt', 'mb', 'ma')))), {'ma' : a, 'mb' : b}), 3154 (('ior', ('feq', a, 0.0), ('fneu', a, a)), ('inot', ('!flt', 0.0, ('fabs', a)))), 3155 3156 *add_fabs_fneg((('ior', ('flt', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('fge', 'ma', 'mb'))), {'ma' : a, 'mb' : b}, False), 3157 *add_fabs_fneg((('ior', ('fge', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('flt', 'ma', 'mb'))), {'ma' : a, 'mb' : b}, False), 3158 *add_fabs_fneg((('ior', ('flt', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('inot', ('fge', 'ma', b))), {'ma' : a}), 3159 *add_fabs_fneg((('ior', ('fge', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('inot', ('flt', 'ma', b))), {'ma' : a}), 3160 *add_fabs_fneg((('ior', ('flt', 'a(is_a_number)', 'mb'), ('fneu', b, b)), ('inot', ('fge', a, 'mb'))), {'mb' : b}), 3161 *add_fabs_fneg((('ior', ('fge', 'a(is_a_number)', 'mb'), ('fneu', b, b)), ('inot', ('flt', a, 'mb'))), {'mb' : b}), 3162 *add_fabs_fneg((('iand', ('fneu', 'ma', 'b(is_a_number)'), ('feq', a, a)), ('fneo', 'ma', b), 'options->has_fneo_fcmpu'), {'ma' : a}), 3163 *add_fabs_fneg((('ior', ('feq', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('fequ', 'ma', b), 'options->has_fneo_fcmpu'), {'ma' : a}), 3164 3165 (('ior', ('flt', a, b), ('flt', b, a)), ('fneo', a, b), 'options->has_fneo_fcmpu'), 3166 (('flt', 0.0, ('fabs', a)), ('fneo', 0.0, a), 'options->has_fneo_fcmpu'), 3167 3168 3169 # These don't interfere with the previous optimizations which include this 3170 # in the search expression, because nir_algebraic_impl visits instructions 3171 # in reverse order. 
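   # ford(a, b) is the "ordered" comparison (true when neither operand is NaN)
   # and funord(a, b) the "unordered" one (true when at least one operand is
   # NaN), so e.g. ('ior', ('fneu', a, a), ('fneu', b, b)) is exactly
   # funord(a, b): fneu(x, x) is true only when x is NaN.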
3172 (('ior', ('fneu', 'a@16', a), ('fneu', 'b@16', b)), ('funord', a, b), 'options->has_ford_funord'), 3173 (('iand', ('feq', 'a@16', a), ('feq', 'b@16', b)), ('ford', a, b), 'options->has_ford_funord'), 3174 (('ior', ('fneu', 'a@32', a), ('fneu', 'b@32', b)), ('funord', a, b), 'options->has_ford_funord'), 3175 (('iand', ('feq', 'a@32', a), ('feq', 'b@32', b)), ('ford', a, b), 'options->has_ford_funord'), 3176 (('ior', ('fneu', 'a@64', a), ('fneu', 'b@64', b)), ('funord', a, b), 'options->has_ford_funord'), 3177 (('iand', ('feq', 'a@64', a), ('feq', 'b@64', b)), ('ford', a, b), 'options->has_ford_funord'), 3178 3179 (('inot', ('ford(is_used_once)', a, b)), ('funord', a, b)), 3180 (('inot', ('funord(is_used_once)', a, b)), ('ford', a, b)), 3181 (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)), 3182 (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)), 3183 (('inot', ('fequ(is_used_once)', a, b)), ('fneo', a, b)), 3184 (('inot', ('fneo(is_used_once)', a, b)), ('fequ', a, b)), 3185 (('inot', ('flt(is_used_once)', a, b)), ('fgeu', a, b), 'options->has_fneo_fcmpu'), 3186 (('inot', ('fgeu(is_used_once)', a, b)), ('flt', a, b)), 3187 (('inot', ('fge(is_used_once)', a, b)), ('fltu', a, b), 'options->has_fneo_fcmpu'), 3188 (('inot', ('fltu(is_used_once)', a, b)), ('fge', a, b)), 3189 3190 # nir_lower_to_source_mods will collapse this, but its existence during the 3191 # optimization loop can prevent other optimizations. 3192 (('fneg', ('fneg', a)), a), 3193 3194 # combine imul and iadd to imad 3195 (('iadd@32', ('imul(is_only_used_by_iadd)', a, b), c), ('imad', a, b, c), 'options->has_imad32'), 3196] 3197 3198# re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c 3199# gets combined to fma(a, b, -c). 3200for sz, mulz in itertools.product([16, 32, 64], [False, True]): 3201 # fmulz/ffmaz only for fp32 3202 if mulz and sz != 32: 3203 continue 3204 3205 # Fuse the correct fmul. Only consider fmuls where the only users are fadd 3206 # (or fneg/fabs which are assumed to be propagated away), as a heuristic to 3207 # avoid fusing in cases where it's harmful. 
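   # For sz == 32 and mulz == False, for example, the extend() below emits
   #    (('~fadd@32', ('fmul(is_only_used_by_fadd)', a, b), c),
   #     ('ffma', a, b, c), 'options->fuse_ffma32')
   # plus the fneg/fabs variants.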
3208 fmul = ('fmulz' if mulz else 'fmul') + '(is_only_used_by_fadd)' 3209 ffma = 'ffmaz' if mulz else 'ffma' 3210 3211 fadd = '~fadd@{}'.format(sz) 3212 option = 'options->fuse_ffma{}'.format(sz) 3213 3214 late_optimizations.extend([ 3215 ((fadd, (fmul, a, b), c), (ffma, a, b, c), option), 3216 3217 ((fadd, ('fneg(is_only_used_by_fadd)', (fmul, a, b)), c), 3218 (ffma, ('fneg', a), b, c), option), 3219 3220 ((fadd, ('fabs(is_only_used_by_fadd)', (fmul, a, b)), c), 3221 (ffma, ('fabs', a), ('fabs', b), c), option), 3222 3223 ((fadd, ('fneg(is_only_used_by_fadd)', ('fabs', (fmul, a, b))), c), 3224 (ffma, ('fneg', ('fabs', a)), ('fabs', b), c), option), 3225 ]) 3226 3227late_optimizations.extend([ 3228 # Subtractions get lowered during optimization, so we need to recombine them 3229 (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3230 (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3231 (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3232 (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'), 3233 3234 (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'), 3235 (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'), 3236 (('ineg', a), ('isub', 0, a), 'options->lower_ineg'), 3237 (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'), 3238]) 3239 3240for s in [8, 16, 32, 64]: 3241 cond = 'options->has_iadd3' 3242 if s == 64: 3243 cond += ' && !(options->lower_int64_options & nir_lower_iadd3_64)' 3244 3245 iadd = "iadd@{}".format(s) 3246 3247 # On Intel GPUs, the constant field for an ADD3 instruction must be either 3248 # int16_t or uint16_t. 3249 late_optimizations.extend([ 3250 ((iadd, ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), cond), 3251 ((iadd, ('iadd(is_used_once)', '#a(is_16_bits)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), cond), 3252 ((iadd, ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_16_bits)'), ('iadd3', a, b, c), cond), 3253 ((iadd, ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond), 3254 ((iadd, ('ineg', ('iadd(is_used_once)', '#a(is_16_bits)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond), 3255 ((iadd, ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), '#c(is_16_bits)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond), 3256 3257 ((iadd, ('ishl', a, 1), 'b(is_not_const)'), ('iadd3', a, a, b), cond), 3258 ((iadd, ('ishl', a, 1), '#b(is_16_bits)' ), ('iadd3', a, a, b), cond), 3259 ((iadd, ('ineg', ('ishl', a, 1)), 'b(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', a), b), cond), 3260 ((iadd, ('ineg', ('ishl', a, 1)), '#b(is_16_bits)' ), ('iadd3', ('ineg', a), ('ineg', a), b), cond), 3261 3262 # Use special checks to ensure (b+b) or -(b+b) fit in 16 bits. 
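      # For example, 0x9000 fits in a uint16_t, but 0x9000 + 0x9000 = 0x12000
      # fits in neither int16_t nor uint16_t, so the checks are applied to the
      # doubled (or negated doubled) constant rather than to b itself.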
3263 (('ishl@{}'.format(s), ('iadd', a, '#b(is_2x_16_bits)'), 1), ('iadd3', a, a, ('iadd', b, b)), cond), 3264 (('ishl@{}'.format(s), ('ineg', ('iadd', a, '#b(is_neg2x_16_bits)')), 1), ('iadd3', ('ineg', a), ('ineg', a), ('ineg', ('iadd', b, b))), cond), 3265 ]) 3266 3267late_optimizations.extend([ 3268 # fneg_lo / fneg_hi 3269 (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'), 3270 (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'), 3271 3272 # These are duplicated from the main optimizations table. The late 3273 # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create 3274 # new patterns like these. The patterns that compare with zero are removed 3275 # because they are unlikely to be created in by anything in 3276 # late_optimizations. 3277 (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)), 3278 (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)), 3279 (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)), 3280 (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)), 3281 3282 (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)), 3283 3284 (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))), 3285 3286 (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)), 3287 (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)), 3288 (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)), 3289 (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)), 3290 (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)), 3291 (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)), 3292 (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)), 3293 (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)), 3294 (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)), 3295 (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)), 3296 3297 (('ior', a, a), a), 3298 (('iand', a, a), a), 3299 3300 (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))), 3301 3302 (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'), 3303 (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'), 3304 (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'), 3305 (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'), 3306 3307 (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)), 3308 3309 # Approximate handling of fround_even for DX9 addressing from gallium nine on 3310 # DX9-class hardware with no proper fround support. This is in 3311 # late_optimizations so that the is_integral() opts in the main pass get a 3312 # chance to eliminate the fround_even first. 3313 (('fround_even', a), ('bcsel', 3314 ('feq', ('ffract', a), 0.5), 3315 ('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0), 3316 ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'), 3317 3318 # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this 3319 # particular operation is common for expanding values stored in a texture 3320 # from [0,1] to [-1,1]. 
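   # With a in [0,1]: ffma(a, 2.0, -1.0) = 2a - 1 maps 0 -> -1 and 1 -> 1,
   # and flrp(-1.0, 1.0, a) = -1*(1 - a) + 1*a = 2a - 1 computes the same
   # value, which is why the two forms below are interchangeable (the patterns
   # stay inexact, '~', since the two forms may round differently).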
3321 (('~ffma@32', a, 2.0, -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 3322 (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 3323 (('~ffma@32', a, -2.0, 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 3324 (('~ffma@32', a, 2.0, 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 3325 (('~fadd@32', ('fmul(is_used_once)', 2.0, a), -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 3326 (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 3327 (('~fadd@32', ('fmul(is_used_once)', -2.0, a), 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 3328 (('~fadd@32', ('fmul(is_used_once)', 2.0, a), 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 3329 3330 # flrp(a, b, a) 3331 # a*(1-a) + b*a 3332 # a + -a*a + a*b (1) 3333 # a + a*(b - a) 3334 # Option 1: ffma(a, (b-a), a) 3335 # 3336 # Alternately, after (1): 3337 # a*(1+b) + -a*a 3338 # a*((1+b) + -a) 3339 # 3340 # Let b=1 3341 # 3342 # Option 2: ffma(a, 2, -(a*a)) 3343 # Option 3: ffma(a, 2, (-a)*a) 3344 # Option 4: ffma(a, -a, (2*a) 3345 # Option 5: a * (2 - a) 3346 # 3347 # There are a lot of other possible combinations. 3348 (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'), 3349 (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3350 (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3351 (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3352 (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3353 3354 # we do these late so that we don't get in the way of creating ffmas 3355 (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))), 3356 (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))), 3357 3358 # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c), 3359 # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why. 3360 (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)), 3361 ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))), 3362 3363 # Things that look like DPH in the source shader may get expanded to 3364 # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets 3365 # to NIR. After FFMA is generated, this can look like: 3366 # 3367 # fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w) 3368 # 3369 # Reassociate the last addition into the first multiplication. 3370 # 3371 # Some shaders do not use 'invariant' in vertex and (possibly) geometry 3372 # shader stages on some outputs that are intended to be invariant. For 3373 # various reasons, this optimization may not be fully applied in all 3374 # shaders used for different rendering passes of the same geometry. This 3375 # can result in Z-fighting artifacts (at best). For now, disable this 3376 # optimization in these stages. See bugzilla #111490. In tessellation 3377 # stages applications seem to use 'precise' when necessary, so allow the 3378 # optimization in those stages. 
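   # I.e. the first pattern below turns
   #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
   # into
   #    ffma(v1.z, v2.z, ffma(v1.y, v2.y, ffma(v1.x, v2.x, v1.w)))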
3379 (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 3380 ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3381 (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 3382 ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3383 (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 3384 ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3385 3386 (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 3387 ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3388 (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 3389 ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3390 (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 3391 ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3392 3393 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 3394 # 3395 # If bits is zero, the result will be zero. 3396 # 3397 # These prevent the next two lowerings generating incorrect results when 3398 # count is zero. 3399 (('ubfe', a, b, 0), 0), 3400 (('ibfe', a, b, 0), 0), 3401 3402 # On Intel GPUs, BFE is a 3-source instruction. Like all 3-source 3403 # instructions on Intel GPUs, it cannot have an immediate values as 3404 # sources. There are also limitations on source register strides. As a 3405 # result, it is very easy for 3-source instruction combined with either 3406 # loads of immediate values or copies from weird register strides to be 3407 # more expensive than the primitive instructions it represents. 3408 (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'), 3409 3410 # b is the lowest order bit to be extracted and c is the number of bits to 3411 # extract. The inner shift removes the bits above b + c by shifting left 3412 # 32 - (b + c). ishl only sees the low 5 bits of the shift count, which is 3413 # -(b + c). The outer shift moves the bit that was at b to bit zero. 3414 # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c. 3415 # This means that it must be shifted right by 32 - c or -c bits. 
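   # For example, with b = 8 and c = 4 on a 32-bit source:
   #    inner shift: ishl by -(8 + 4) = -12, i.e. by 20 (mod 32), moving bit 11
   #    (the top bit of the field) up to bit 31;
   #    outer shift: ishr by -4, i.e. by 28 (mod 32), sign-extending from bit 31
   #    and leaving bit 8 of the source in bit 0.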
3416 (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'), 3417 3418 # Clean up no-op shifts that may result from the bfe lowerings. 3419 (('ishl', a, 0), a), 3420 (('ishl', a, -32), a), 3421 (('ishr', a, 0), a), 3422 (('ishr', a, -32), a), 3423 (('ushr', a, 0), a), 3424 3425 (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)), 3426 (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)), 3427 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 3428 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 3429 3430 # open coded bit test 3431 (('ine', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'), 3432 (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'), 3433 (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'), 3434 (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'), 3435 (('ine', ('iand', a, ('ishl', 1, b)), 0), ('bitnz', a, b), 'options->has_bit_test'), 3436 (('ieq', ('iand', a, ('ishl', 1, b)), 0), ('bitz', a, b), 'options->has_bit_test'), 3437 (('ine', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitz', a, b), 'options->has_bit_test'), 3438 (('ieq', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitnz', a, b), 'options->has_bit_test'), 3439 (('bitz', ('ushr', a, b), 0), ('bitz', a, b)), 3440 (('bitz', ('ishr', a, b), 0), ('bitz', a, b)), 3441 (('bitnz', ('ushr', a, b), 0), ('bitnz', a, b)), 3442 (('bitnz', ('ishr', a, b), 0), ('bitnz', a, b)), 3443 (('ine', ('ubfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'), 3444 (('ieq', ('ubfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'), 3445 (('ine', ('ubfe', a, b, 1), 1), ('bitz', a, b), 'options->has_bit_test'), 3446 (('ieq', ('ubfe', a, b, 1), 1), ('bitnz', a, b), 'options->has_bit_test'), 3447 (('ine', ('ibfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'), 3448 (('ieq', ('ibfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'), 3449 (('ine', ('ibfe', a, b, 1), -1), ('bitz', a, b), 'options->has_bit_test'), 3450 (('ieq', ('ibfe', a, b, 1), -1), ('bitnz', a, b), 'options->has_bit_test'), 3451 (('inot', ('bitnz', a, b)), ('bitz', a, b)), 3452 (('inot', ('bitz', a, b)), ('bitnz', a, b)), 3453 (('bitnz', ('inot', a), b), ('bitz', a, b)), 3454 (('bitz', ('inot', a), b), ('bitnz', a, b)), 3455]) 3456 3457# A few more extract cases we'd rather leave late 3458for N in [16, 32]: 3459 aN = 'a@{0}'.format(N) 3460 u2uM = 'u2u{0}'.format(M) 3461 i2iM = 'i2i{0}'.format(M) 3462 3463 for x in ['u', 'i']: 3464 x2xN = '{0}2{0}{1}'.format(x, N) 3465 extract_x8 = 'extract_{0}8'.format(x) 3466 extract_x16 = 'extract_{0}16'.format(x) 3467 3468 late_optimizations.extend([ 3469 ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 3470 ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 3471 ]) 3472 3473 if N > 16: 3474 late_optimizations.extend([ 3475 ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 3476 ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 3477 ]) 3478 3479# Byte insertion 3480late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 3481late_optimizations.extend([(('iand', ('ishl', 
'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte'))

late_optimizations += [
   # Word insertion
   (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'),

   # Extract and then insert
   (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)),
   (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)),
]

# Float sizes
for s in [16, 32, 64]:
   late_optimizations.extend([
      (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)),
      (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))),
   ])

for op in ['fadd']:
   late_optimizations += [
      (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
      (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
   ]

for op in ['ffma', 'ffmaz']:
   late_optimizations += [
      (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),

      (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
   ]

# mediump: If an opcode is surrounded by conversions, remove the conversions.
# The rationale is that type conversions + the low precision opcode are more
# expensive than the same arithmetic opcode at higher precision.
#
# This must be done in late optimizations, because we need normal optimizations to
# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).
#
# Unary opcodes
for op in ['fabs', 'fceil', 'fcos', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',
           'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']:
   late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))]

# Binary opcodes
for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']:
   late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))]

# Ternary opcodes
for op in ['ffma', 'flrp']:
   late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))]

# Comparison opcodes
for op in ['feq', 'fge', 'flt', 'fneu']:
   late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))]

# Do this last, so that the f2fmp patterns above have effect.
late_optimizations += [
   # Convert *2*mp instructions to concrete *2*16 instructions. At this point
   # any conversions that could have been removed will have been removed in
   # nir_opt_algebraic so any remaining ones are required.
   (('f2fmp', a), ('f2f16', a), "!options->preserve_mediump"),
   (('f2imp', a), ('f2i16', a), "!options->preserve_mediump"),
   (('f2ump', a), ('f2u16', a), "!options->preserve_mediump"),
   (('i2imp', a), ('i2i16', a), "!options->preserve_mediump"),
   (('i2fmp', a), ('i2f16', a), "!options->preserve_mediump"),
   (('i2imp', a), ('u2u16', a), "!options->preserve_mediump"),
   (('u2fmp', a), ('u2f16', a), "!options->preserve_mediump"),
   (('fisfinite', a), ('flt', ('fabs', a), float("inf"))),

   (('f2f16', a), ('f2f16_rtz', a), "options->force_f2f16_rtz && !nir_is_rounding_mode_rtne(info->float_controls_execution_mode, 16)"),

   (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
   (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
   (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
   (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),

   (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"),
   (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"),
   (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"),
   (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"),

   (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
]

distribute_src_mods = [
   # Try to remove some spurious negations rather than pushing them down.
   (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
   (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),
   (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),
   (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)),
   (('fneg', ('fneg', a)), a),

   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),

   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),

   # Note that fmin <-> fmax. I don't think there is a way to distribute
   # fabs() into fmin or fmax.
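   # e.g. fneg(fmin(-3.0, 2.0)) = 3.0 = fmax(3.0, -2.0), but there is no
   # analogous rewrite for fabs.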
   (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
   (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),

   (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)),
   (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)),
   (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)),

   # fdph works mostly like fdot, but to get the correct result, the negation
   # must be applied to the second source.
   (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),

   (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
   (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
]

before_lower_int64_optimizations = [
    # The i2i64(a) implies that 'a' has at most 32 bits of data.
    (('ishl', ('i2i64', a), b),
     # Effective shift count of zero, just return 'a'.
     ('bcsel', ('ieq', ('iand', b, 63), 0), ('i2i64', a),
      ('bcsel', ('ilt', ('iand', b, 63), 32),
       # Shifting less than 32 bits, so both 32-bit halves will have
       # some data. These (and the else case) shift counts are of 32-bit
       # values, so the shift counts are implicitly modulo 32.
       ('pack_64_2x32_split', ('ishl', ('i2i32', a), b), ('ishr', ('i2i32', a), ('iadd', ('ineg', b), 32) )),
       # Shifting 32 bits or more, so lower 32 bits must be zero.
       ('pack_64_2x32_split', 0 , ('ishl', ('i2i32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
     '(options->lower_int64_options & nir_lower_shift64) != 0'),

    (('ishl', ('u2u64', a), b),
     ('bcsel', ('ieq', ('iand', b, 63), 0), ('u2u64', a),
      ('bcsel', ('ilt', ('iand', b, 63), 32),
       ('pack_64_2x32_split', ('ishl', ('u2u32', a), b), ('ushr', ('u2u32', a), ('iadd', ('ineg', b), 32) )),
       ('pack_64_2x32_split', 0 , ('ishl', ('u2u32', a), ('iabs', ('iadd', ('ineg', b), 32)))))),
     '(options->lower_int64_options & nir_lower_shift64) != 0'),

    # If ineg64 is lowered, then the negation is not free. Try to eliminate
    # some of the negations.
    (('iadd@64', ('ineg', a), ('ineg(is_used_once)', b)), ('isub', ('ineg', a), b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
    (('iadd@64', a, ('ineg', b)), ('isub', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
    (('isub@64', a, ('ineg', b)), ('iadd', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'),
    (('isub@64', ('ineg', a), ('ineg', b)), ('isub', b, a), '(options->lower_int64_options & nir_lower_ineg64) != 0'),

    (('imul@64', ('ineg', a), ('ineg', b)), ('imul', a, b)),
    (('idiv@64', ('ineg', a), ('ineg', b)), ('idiv', a, b)),

    # If the hardware can do int64, the shift is the same cost as the add. It
    # should be fine to do this transformation unconditionally.
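    # (x + x and x << 1 always compute the same value, even when the doubled
    # value wraps, so this rewrite is exact.)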
    (('iadd', ('i2i64', a), ('i2i64', a)), ('ishl', ('i2i64', a), 1)),
    (('iadd', ('u2u64', a), ('u2u64', a)), ('ishl', ('u2u64', a), 1)),
]

parser = argparse.ArgumentParser()
parser.add_argument('--out', required=True)
args = parser.parse_args()

with open(args.out, "w", encoding='utf-8') as f:
    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
                                        before_ffma_optimizations).render())
    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_lower_int64",
                                        before_lower_int64_optimizations).render())
    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
                                        late_optimizations).render())
    f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
                                        distribute_src_mods).render())
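# The generated C file is normally produced by the build system with an
# invocation along the lines of:
#
#    python nir_opt_algebraic.py --out nir_opt_algebraic.c
#
# and provides the nir_opt_algebraic*() passes rendered above.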