xref: /aosp_15_r20/external/mesa3d/src/gallium/auxiliary/nir/nir_to_tgsi.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2014-2015 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "compiler/nir/nir.h"
25 #include "compiler/nir/nir_deref.h"
26 #include "compiler/nir/nir_legacy.h"
27 #include "compiler/nir/nir_worklist.h"
28 #include "nir/nir_to_tgsi.h"
29 #include "pipe/p_screen.h"
30 #include "pipe/p_state.h"
31 #include "tgsi/tgsi_dump.h"
32 #include "tgsi/tgsi_from_mesa.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_ureg.h"
36 #include "tgsi/tgsi_util.h"
37 #include "util/u_debug.h"
38 #include "util/u_math.h"
39 #include "util/u_memory.h"
40 #include "util/u_dynarray.h"
41 
42 struct ntt_insn {
43    enum tgsi_opcode opcode;
44    struct ureg_dst dst[2];
45    struct ureg_src src[4];
46    enum tgsi_texture_type tex_target;
47    enum tgsi_return_type tex_return_type;
48    struct tgsi_texture_offset tex_offset[4];
49 
50    unsigned mem_qualifier;
51    enum pipe_format mem_format;
52 
53    bool is_tex : 1;
54    bool is_mem : 1;
55    bool precise : 1;
56 };
57 
58 struct ntt_block {
59    /* Array of struct ntt_insn */
60    struct util_dynarray insns;
61    int start_ip;
62    int end_ip;
63 };
64 
65 struct ntt_reg_interval {
66    uint32_t start, end;
67 };
68 
69 struct ntt_compile {
70    nir_shader *s;
71    nir_function_impl *impl;
72    const struct nir_to_tgsi_options *options;
73    struct pipe_screen *screen;
74    struct ureg_program *ureg;
75 
76    bool needs_texcoord_semantic;
77    bool native_integers;
78    bool has_txf_lz;
79 
80    bool addr_declared[3];
81    struct ureg_dst addr_reg[3];
82 
83    /* if condition set up at the end of a block, for ntt_emit_if(). */
84    struct ureg_src if_cond;
85 
86    /* TGSI temps for our NIR SSA and register values. */
87    struct ureg_dst *reg_temp;
88    struct ureg_src *ssa_temp;
89 
90    struct ntt_reg_interval *liveness;
91 
92    /* Map from nir_block to ntt_block */
93    struct hash_table *blocks;
94    struct ntt_block *cur_block;
95    unsigned current_if_else;
96    unsigned cf_label;
97 
98    /* Whether we're currently emitting instructions for a precise NIR instruction. */
99    bool precise;
100 
101    unsigned num_temps;
102    unsigned first_non_array_temp;
103 
104    /* Mappings from driver_location to TGSI input/output number.
105     *
106     * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
107     * their numbers assigned incrementally, unlike inputs or constants.
108     */
109    struct ureg_src *input_index_map;
110    uint64_t centroid_inputs;
111 
112    uint32_t first_ubo;
113    uint32_t first_ssbo;
114 
115    struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
116 };
117 
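/* Allocates a new temporary in the compiler's virtual temp space; these
 * indices are later remapped onto real ureg temporaries by register
 * allocation.
 */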
118 static struct ureg_dst
119 ntt_temp(struct ntt_compile *c)
120 {
121    return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
122 }
123 
124 static struct ntt_block *
125 ntt_block_from_nir(struct ntt_compile *c, struct nir_block *block)
126 {
127    struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
128    return entry->data;
129 }
130 
131 static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
132 static void ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list);
133 
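/* Appends an instruction to the current ntt_block and returns a pointer to it
 * so callers can fill in the texture/memory-specific fields.
 */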
134 static struct ntt_insn *
135 ntt_insn(struct ntt_compile *c, enum tgsi_opcode opcode,
136          struct ureg_dst dst,
137          struct ureg_src src0, struct ureg_src src1,
138          struct ureg_src src2, struct ureg_src src3)
139 {
140    struct ntt_insn insn = {
141       .opcode = opcode,
142       .dst = { dst, ureg_dst_undef() },
143       .src = { src0, src1, src2, src3 },
144       .precise = c->precise,
145    };
146    util_dynarray_append(&c->cur_block->insns, struct ntt_insn, insn);
147    return util_dynarray_top_ptr(&c->cur_block->insns, struct ntt_insn);
148 }
149 
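/* The OPxy() macros define an ntt_OP() emitter taking x destinations and y
 * sources, padding the remaining ntt_insn operands with undefs.
 */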
150 #define OP00( op )                                                                     \
151 static inline void ntt_##op(struct ntt_compile *c)                                     \
152 {                                                                                      \
153    ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
154 }
155 
156 #define OP01( op )                                                                     \
157 static inline void ntt_##op(struct ntt_compile *c,                                     \
158                      struct ureg_src src0)                                             \
159 {                                                                                      \
160    ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
161 }
162 
163 
164 #define OP10( op )                                                                     \
165 static inline void ntt_##op(struct ntt_compile *c,                                     \
166                      struct ureg_dst dst)                                              \
167 {                                                                                      \
168    ntt_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
169 }
170 
171 #define OP11( op )                                                                     \
172 static inline void ntt_##op(struct ntt_compile *c,                                     \
173                      struct ureg_dst dst,                                              \
174                      struct ureg_src src0)                                             \
175 {                                                                                      \
176    ntt_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
177 }
178 
179 #define OP12( op )                                                                     \
180 static inline void ntt_##op(struct ntt_compile *c,                                     \
181                      struct ureg_dst dst,                                              \
182                      struct ureg_src src0,                                             \
183                      struct ureg_src src1)                                             \
184 {                                                                                      \
185    ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
186 }
187 
188 #define OP13( op )                                                                     \
189 static inline void ntt_##op(struct ntt_compile *c,                                     \
190                      struct ureg_dst dst,                                              \
191                      struct ureg_src src0,                                             \
192                      struct ureg_src src1,                                             \
193                      struct ureg_src src2)                                             \
194 {                                                                                      \
195    ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef());             \
196 }
197 
198 #define OP14( op )                                                                     \
199 static inline void ntt_##op(struct ntt_compile *c,                                     \
200                      struct ureg_dst dst,                                              \
201                      struct ureg_src src0,                                             \
202                      struct ureg_src src1,                                             \
203                      struct ureg_src src2,                                             \
204                      struct ureg_src src3)                                             \
205 {                                                                                      \
206    ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3);                         \
207 }
208 
209 /* We hand-craft our tex instructions */
210 #define OP12_TEX(op)
211 #define OP14_TEX(op)
212 
213 /* Use a template include to generate a correctly-typed ntt_OP()
214  * function for each TGSI opcode:
215  */
216 #include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
217 
218 /**
219  * Interprets a nir_load_const used as a NIR src as a uint.
220  *
221  * For non-native-integer drivers, nir_load_const_instrs used by an integer ALU
222  * instruction (or in a phi-web used by an integer ALU instruction) were
223  * converted to floats and the ALU instruction swapped to the float equivalent.
224  * However, this means that integer load_consts used by intrinsics (which don't
225  * normally get that conversion) may have been reformatted to be floats.  Given
226  * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
227  * we can just look and see if they look like floats and convert them back to
228  * ints.
229  */
230 static uint32_t
231 ntt_src_as_uint(struct ntt_compile *c, nir_src src)
232 {
233    uint32_t val = nir_src_as_uint(src);
234    if (!c->native_integers && val >= fui(1.0))
235       val = (uint32_t)uif(val);
236    return val;
237 }
238 
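/* Expands a two-channel 64-bit write mask to the pairs of 32-bit channels
 * each double occupies (x -> xy, y -> zw).
 */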
239 static unsigned
240 ntt_64bit_write_mask(unsigned write_mask)
241 {
242    return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
243 }
244 
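/* Immediate holding 1.0 as a packed double in both the .xy and .zw channel
 * pairs.
 */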
245 static struct ureg_src
246 ntt_64bit_1f(struct ntt_compile *c)
247 {
248    return ureg_imm4u(c->ureg,
249                      0x00000000, 0x3ff00000,
250                      0x00000000, 0x3ff00000);
251 }
252 
253 /* Per-channel masks of def/use within the block, and the per-channel
254  * livein/liveout for the block as a whole.
255  */
256 struct ntt_live_reg_block_state {
257    uint8_t *def, *use, *livein, *liveout, *defin, *defout;
258 };
259 
260 struct ntt_live_reg_state {
261    unsigned bitset_words;
262 
263    struct ntt_reg_interval *regs;
264 
265    /* Used in propagate_across_edge() */
266    BITSET_WORD *tmp_live;
267 
268    struct ntt_live_reg_block_state *blocks;
269 
270    nir_block_worklist worklist;
271 };
272 
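/* Marks channels of a temp as used in this block (unless screened off by a
 * preceding def) and extends the temp's live interval to cover this ip.
 */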
273 static void
274 ntt_live_reg_mark_use(struct ntt_compile *c, struct ntt_live_reg_block_state *bs,
275                       int ip, unsigned index, unsigned used_mask)
276 {
277    bs->use[index] |= used_mask & ~bs->def[index];
278 
279    c->liveness[index].start = MIN2(c->liveness[index].start, ip);
280    c->liveness[index].end = MAX2(c->liveness[index].end, ip);
281 
282 }
283 static void
284 ntt_live_reg_setup_def_use(struct ntt_compile *c, nir_function_impl *impl, struct ntt_live_reg_state *state)
285 {
286    for (int i = 0; i < impl->num_blocks; i++) {
287       state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
288       state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
289       state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
290       state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
291       state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
292       state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
293    }
294 
295    int ip = 0;
296    nir_foreach_block(block, impl) {
297       struct ntt_live_reg_block_state *bs = &state->blocks[block->index];
298       struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
299 
300       ntt_block->start_ip = ip;
301 
302       util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
303          const struct tgsi_opcode_info *opcode_info =
304             tgsi_get_opcode_info(insn->opcode);
305 
306          /* Set up use[] for the srcs.
307           *
308           * Uses are the channels of the reg read in the block that don't have a
309           * preceding def to screen them off.  Note that we don't do per-element
310           * tracking of array regs, so they're never screened off.
311           */
312          for (int i = 0; i < opcode_info->num_src; i++) {
313             if (insn->src[i].File != TGSI_FILE_TEMPORARY)
314                continue;
315             int index = insn->src[i].Index;
316 
317             uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
318                                                               insn->dst->WriteMask,
319                                                               insn->src[i].SwizzleX,
320                                                               insn->src[i].SwizzleY,
321                                                               insn->src[i].SwizzleZ,
322                                                               insn->src[i].SwizzleW,
323                                                               insn->tex_target,
324                                                               insn->tex_target);
325 
326             assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
327             ntt_live_reg_mark_use(c, bs, ip, index, used_mask);
328          }
329 
330          if (insn->is_tex) {
331             for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
332                if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY)
333                   ntt_live_reg_mark_use(c, bs, ip, insn->tex_offset[i].Index, 0xf);
334             }
335          }
336 
337          /* Set up def[] for the dsts.
338           *
339           * Defs are the unconditionally-written (not R/M/W) channels of the reg in
340           * the block that don't have a preceding use.
341           */
342          for (int i = 0; i < opcode_info->num_dst; i++) {
343             if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
344                continue;
345             int index = insn->dst[i].Index;
346             uint32_t writemask = insn->dst[i].WriteMask;
347 
348             bs->def[index] |= writemask & ~bs->use[index];
349             bs->defout[index] |= writemask;
350 
351             assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
352             c->liveness[index].start = MIN2(c->liveness[index].start, ip);
353             c->liveness[index].end = MAX2(c->liveness[index].end, ip);
354          }
355          ip++;
356       }
357 
358       ntt_block->end_ip = ip;
359    }
360 }
361 
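/* Computes a live interval for each temp: builds per-block def/use sets, then
 * propagates defs forward and liveness backward across the CFG, recording the
 * resulting start/end ips in c->liveness[].
 */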
362 static void
363 ntt_live_regs(struct ntt_compile *c, nir_function_impl *impl)
364 {
365    nir_metadata_require(impl, nir_metadata_block_index);
366 
367    c->liveness = rzalloc_array(c, struct ntt_reg_interval, c->num_temps);
368 
369    struct ntt_live_reg_state state = {
370        .blocks = rzalloc_array(impl, struct ntt_live_reg_block_state, impl->num_blocks),
371    };
372 
373    /* The intervals start out with start > end (indicating unused) */
374    for (int i = 0; i < c->num_temps; i++)
375       c->liveness[i].start = ~0;
376 
377    ntt_live_reg_setup_def_use(c, impl, &state);
378 
379    /* Make a forward-order worklist of all the blocks. */
380    nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
381    nir_foreach_block(block, impl) {
382       nir_block_worklist_push_tail(&state.worklist, block);
383    }
384 
385    /* Propagate defin/defout down the CFG to calculate the live variables
386     * potentially defined along any possible control flow path.  We'll use this
387     * to keep things like conditional defs of the reg (or array regs where we
388     * don't track defs!) from making the reg's live range extend back to the
389     * start of the program.
390     */
391    while (!nir_block_worklist_is_empty(&state.worklist)) {
392       nir_block *block = nir_block_worklist_pop_head(&state.worklist);
393       for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
394          nir_block *succ = block->successors[j];
395          if (!succ || succ->index == impl->num_blocks)
396             continue;
397 
398          for (int i = 0; i < c->num_temps; i++) {
399             uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
400 
401             if (new_def) {
402                state.blocks[succ->index].defin[i] |= new_def;
403                state.blocks[succ->index].defout[i] |= new_def;
404                nir_block_worklist_push_tail(&state.worklist, succ);
405             }
406          }
407       }
408    }
409 
410    /* Make a reverse-order worklist of all the blocks. */
411    nir_foreach_block(block, impl) {
412       nir_block_worklist_push_head(&state.worklist, block);
413    }
414 
415    /* We're now ready to work through the worklist and update the liveness sets
416     * of each of the blocks.  As long as we keep the worklist up-to-date as we
417     * go, everything will get covered.
418     */
419    while (!nir_block_worklist_is_empty(&state.worklist)) {
420       /* We pop them off in the reverse order we pushed them on.  This way
421        * the first walk of the instructions is backwards so we only walk
422        * once in the case of no control flow.
423        */
424       nir_block *block = nir_block_worklist_pop_head(&state.worklist);
425       struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
426       struct ntt_live_reg_block_state *bs = &state.blocks[block->index];
427 
428       for (int i = 0; i < c->num_temps; i++) {
429          /* Collect livein from our successors to include in our liveout. */
430          for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
431             nir_block *succ = block->successors[j];
432             if (!succ || succ->index == impl->num_blocks)
433                continue;
434             struct ntt_live_reg_block_state *sbs = &state.blocks[succ->index];
435 
436             uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
437             if (new_liveout) {
438                if (state.blocks[block->index].defout[i])
439                   c->liveness[i].end = MAX2(c->liveness[i].end, ntt_block->end_ip);
440                bs->liveout[i] |= sbs->livein[i];
441             }
442          }
443 
444          /* Propagate use requests from either our block's uses or our
445           * non-screened-off liveout up to our predecessors.
446           */
447          uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
448                                ~bs->livein[i]);
449          if (new_livein) {
450             bs->livein[i] |= new_livein;
451             set_foreach(block->predecessors, entry) {
452                nir_block *pred = (void *)entry->key;
453                nir_block_worklist_push_tail(&state.worklist, pred);
454             }
455 
456             if (new_livein & state.blocks[block->index].defin[i])
457                c->liveness[i].start = MIN2(c->liveness[i].start, ntt_block->start_ip);
458          }
459       }
460    }
461 
462    ralloc_free(state.blocks);
463    nir_block_worklist_fini(&state.worklist);
464 }
465 
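/* At instruction ip, declares a real ureg temporary when a temp's live
 * interval starts and releases it when the interval ends.  Array temps
 * (below first_non_array_temp) are left untouched.
 */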
466 static void
467 ntt_ra_check(struct ntt_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
468 {
469    if (index < c->first_non_array_temp)
470       return;
471 
472    if (c->liveness[index].start == ip && ra_map[index] == ~0)
473       ra_map[index] = ureg_DECL_temporary(c->ureg).Index;
474 
475    if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
476       ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
477       BITSET_SET(released, index);
478    }
479 }
480 
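/* Maps the virtual temps onto a compact set of ureg temporaries using the
 * live intervals, rewriting the Index of every src, dst, and texture offset.
 */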
481 static void
482 ntt_allocate_regs(struct ntt_compile *c, nir_function_impl *impl)
483 {
484    ntt_live_regs(c, impl);
485 
486    unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
487    unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));
488 
489    /* No RA on NIR array regs */
490    for (int i = 0; i < c->first_non_array_temp; i++)
491       ra_map[i] = i;
492 
493    for (int i = c->first_non_array_temp; i < c->num_temps; i++)
494       ra_map[i] = ~0;
495 
496    int ip = 0;
497    nir_foreach_block(block, impl) {
498       struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
499 
500       for (int i = 0; i < c->num_temps; i++)
501          ntt_ra_check(c, ra_map, released, ip, i);
502 
503       util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
504          const struct tgsi_opcode_info *opcode_info =
505             tgsi_get_opcode_info(insn->opcode);
506 
507          for (int i = 0; i < opcode_info->num_src; i++) {
508             if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
509                ntt_ra_check(c, ra_map, released, ip, insn->src[i].Index);
510                insn->src[i].Index = ra_map[insn->src[i].Index];
511             }
512          }
513 
514          if (insn->is_tex) {
515             for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
516                if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY) {
517                   ntt_ra_check(c, ra_map, released, ip, insn->tex_offset[i].Index);
518                   insn->tex_offset[i].Index = ra_map[insn->tex_offset[i].Index];
519                }
520             }
521          }
522 
523          for (int i = 0; i < opcode_info->num_dst; i++) {
524             if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
525                ntt_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
526                insn->dst[i].Index = ra_map[insn->dst[i].Index];
527             }
528          }
529          ip++;
530       }
531 
532       for (int i = 0; i < c->num_temps; i++)
533          ntt_ra_check(c, ra_map, released, ip, i);
534    }
535 }
536 
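/* Fallback when liveness-based allocation is skipped: declares one ureg
 * temporary per virtual temp so the indices map 1:1.
 */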
537 static void
538 ntt_allocate_regs_unoptimized(struct ntt_compile *c, nir_function_impl *impl)
539 {
540    for (int i = c->first_non_array_temp; i < c->num_temps; i++)
541       ureg_DECL_temporary(c->ureg);
542 }
543 
544 
545 /**
546  * Try to find an iadd of a constant value with a non-constant value in the
547  * nir_src's first component, returning the constant offset and replacing *src
548  * with the non-constant component.
549  */
550 static uint32_t
551 ntt_extract_const_src_offset(nir_src *src)
552 {
553    nir_scalar s = nir_get_scalar(src->ssa, 0);
554 
555    while (nir_scalar_is_alu(s)) {
556       nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr);
557 
558       if (alu->op == nir_op_iadd) {
559          for (int i = 0; i < 2; i++) {
560             nir_const_value *v = nir_src_as_const_value(alu->src[i].src);
561             if (v != NULL) {
562                *src = alu->src[1 - i].src;
563                return v[alu->src[i].swizzle[s.comp]].u32;
564             }
565          }
566 
567          return 0;
568       }
569 
570       /* We'd like to reuse nir_scalar_chase_movs(), but it assumes SSA and that
571        * seems reasonable for something used in inner loops of the compiler.
572        */
573       if (alu->op == nir_op_mov) {
574          s.def = alu->src[0].src.ssa;
575          s.comp = alu->src[0].swizzle[s.comp];
576       } else if (nir_op_is_vec(alu->op)) {
577          s.def = alu->src[s.comp].src.ssa;
578          s.comp = alu->src[s.comp].swizzle[0];
579       } else {
580          return 0;
581       }
582    }
583 
584    return 0;
585 }
586 
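/* For GS/tessellation stages, strips the outer per-vertex array so the input
 * is declared with its per-vertex type.
 */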
587 static const struct glsl_type *
588 ntt_shader_input_type(struct ntt_compile *c,
589                       struct nir_variable *var)
590 {
591    switch (c->s->info.stage) {
592    case MESA_SHADER_GEOMETRY:
593    case MESA_SHADER_TESS_EVAL:
594    case MESA_SHADER_TESS_CTRL:
595       if (glsl_type_is_array(var->type))
596          return glsl_get_array_element(var->type);
597       else
598          return var->type;
599    default:
600       return var->type;
601    }
602 }
603 
604 static void
605 ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
606                             unsigned *semantic_name, unsigned *semantic_index)
607 {
608    /* We want to use most of tgsi_get_gl_varying_semantic(), but the
609     * !texcoord shifting has already been applied, so avoid that.
610     */
611    if (!c->needs_texcoord_semantic &&
612        (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
613       *semantic_name = TGSI_SEMANTIC_GENERIC;
614       *semantic_index = location - VARYING_SLOT_VAR0;
615       return;
616    }
617 
618    tgsi_get_gl_varying_semantic(location, true,
619                                 semantic_name, semantic_index);
620 }
621 
622 /* TGSI varying declarations have a component usage mask associated (used by
623  * r600 and svga).
624  */
625 static uint32_t
626 ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
627                     bool is_64)
628 {
629    uint32_t usage_mask =
630       u_bit_consecutive(start_component, num_components);
631 
632    if (is_64) {
633       if (start_component >= 2)
634          usage_mask >>= 2;
635 
636       uint32_t tgsi_usage_mask = 0;
637 
638       if (usage_mask & TGSI_WRITEMASK_X)
639          tgsi_usage_mask |= TGSI_WRITEMASK_XY;
640       if (usage_mask & TGSI_WRITEMASK_Y)
641          tgsi_usage_mask |= TGSI_WRITEMASK_ZW;
642 
643       return tgsi_usage_mask;
644    } else {
645       return usage_mask;
646    }
647 }
648 
649 /* TGSI varying declarations have a component usage mask associated (used by
650  * r600 and svga).
651  */
652 static uint32_t
653 ntt_tgsi_var_usage_mask(const struct nir_variable *var)
654 {
655    const struct glsl_type *type_without_array =
656       glsl_without_array(var->type);
657    unsigned num_components = glsl_get_vector_elements(type_without_array);
658    if (num_components == 0) /* structs */
659       num_components = 4;
660 
661    return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
662                               glsl_type_is_64bit(type_without_array));
663 }
664 
665 static struct ureg_dst
666 ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
667 {
668    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
669    int base = nir_intrinsic_base(instr);
670    *frac = nir_intrinsic_component(instr);
671    bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
672 
673    struct ureg_dst out;
674    if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
675       unsigned semantic_name, semantic_index;
676       tgsi_get_gl_frag_result_semantic(semantics.location,
677                                        &semantic_name, &semantic_index);
678       semantic_index += semantics.dual_source_blend_index;
679 
680       switch (semantics.location) {
681       case FRAG_RESULT_DEPTH:
682          *frac = 2; /* z writes go to the .z channel in TGSI */
683          break;
684       case FRAG_RESULT_STENCIL:
685          *frac = 1;
686          break;
687       default:
688          break;
689       }
690 
691       out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
692    } else {
693       unsigned semantic_name, semantic_index;
694 
695       ntt_get_gl_varying_semantic(c, semantics.location,
696                                   &semantic_name, &semantic_index);
697 
698       uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
699                                                 instr->num_components,
700                                                 is_64);
701       uint32_t gs_streams = semantics.gs_streams;
702       for (int i = 0; i < 4; i++) {
703          if (!(usage_mask & (1 << i)))
704             gs_streams &= ~(0x3 << 2 * i);
705       }
706 
707       /* No driver appears to use array_id of outputs. */
708       unsigned array_id = 0;
709 
710       /* This bit is lost in the i/o semantics, but it's unused in in-tree
711        * drivers.
712        */
713       bool invariant = semantics.invariant;
714 
715       unsigned num_slots = semantics.num_slots;
716       if (semantics.location == VARYING_SLOT_TESS_LEVEL_INNER ||
717           semantics.location == VARYING_SLOT_TESS_LEVEL_OUTER) {
718          /* Compact vars get a num_slots in NIR as number of components, but we
719           * want the number of vec4 slots here.
720           */
721          num_slots = 1;
722       }
723 
724       out = ureg_DECL_output_layout(c->ureg,
725                                     semantic_name, semantic_index,
726                                     gs_streams,
727                                     base,
728                                     usage_mask,
729                                     array_id,
730                                     num_slots,
731                                     invariant);
732    }
733 
734    unsigned write_mask;
735    if (nir_intrinsic_has_write_mask(instr))
736       write_mask = nir_intrinsic_write_mask(instr);
737    else
738       write_mask = ((1 << instr->num_components) - 1) << *frac;
739 
740    if (is_64) {
741       write_mask = ntt_64bit_write_mask(write_mask);
742       if (*frac >= 2)
743          write_mask = write_mask << 2;
744    } else {
745       write_mask = write_mask << *frac;
746    }
747    return ureg_writemask(out, write_mask);
748 }
749 
750 static bool
751 ntt_try_store_in_tgsi_output_with_use(struct ntt_compile *c,
752                                       struct ureg_dst *dst,
753                                       nir_src *src)
754 {
755    *dst = ureg_dst_undef();
756 
757    switch (c->s->info.stage) {
758    case MESA_SHADER_FRAGMENT:
759    case MESA_SHADER_VERTEX:
760       break;
761    default:
762       /* tgsi_exec (at least) requires that output stores happen per vertex
763        * emitted, you don't get to reuse a previous output value for the next
764        * vertex.
765        */
766       return false;
767    }
768 
769    if (nir_src_is_if(src))
770       return false;
771 
772    if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
773       return false;
774 
775    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(nir_src_parent_instr(src));
776    if (intr->intrinsic != nir_intrinsic_store_output ||
777        !nir_src_is_const(intr->src[1])) {
778       return false;
779    }
780 
781    uint32_t frac;
782    *dst = ntt_output_decl(c, intr, &frac);
783    dst->Index += ntt_src_as_uint(c, intr->src[1]);
784 
785    return frac == 0;
786 }
787 
788 /* If this reg is used only for storing an output, then in the simple
789  * cases we can write directly to the TGSI output instead of having
790  * store_output emit its own MOV.
791  */
792 static bool
793 ntt_try_store_reg_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
794                                  nir_intrinsic_instr *reg_decl)
795 {
796    assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);
797 
798    *dst = ureg_dst_undef();
799 
800    /* Look for a single use for try_store_in_tgsi_output */
801    nir_src *use = NULL;
802    nir_foreach_reg_load(src, reg_decl) {
803       nir_intrinsic_instr *load = nir_instr_as_intrinsic(nir_src_parent_instr(src));
804       nir_foreach_use_including_if(load_use, &load->def) {
805          /* We can only have one use */
806          if (use != NULL)
807             return false;
808 
809          use = load_use;
810       }
811    }
812 
813    if (use == NULL)
814       return false;
815 
816    return ntt_try_store_in_tgsi_output_with_use(c, dst, use);
817 }
818 
819 /* If this SSA def is used only for storing an output, then in the simple
820  * cases we can write directly to the TGSI output instead of having
821  * store_output emit its own MOV.
822  */
823 static bool
824 ntt_try_store_ssa_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
825                                  nir_def *def)
826 {
827    *dst = ureg_dst_undef();
828 
829    if (!list_is_singular(&def->uses))
830       return false;
831 
832    nir_foreach_use_including_if(use, def) {
833       return ntt_try_store_in_tgsi_output_with_use(c, dst, use);
834    }
835    unreachable("We have one use");
836 }
837 
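/* Declares TGSI fragment shader inputs from the NIR shader_in variables,
 * including interpolation mode, centroid/sample location, and a fixup that
 * converts the TGSI_SEMANTIC_FACE input to NIR's front-face convention.
 */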
838 static void
839 ntt_setup_inputs(struct ntt_compile *c)
840 {
841    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
842       return;
843 
844    unsigned num_inputs = 0;
845    int num_input_arrays = 0;
846 
847    nir_foreach_shader_in_variable(var, c->s) {
848       const struct glsl_type *type = ntt_shader_input_type(c, var);
849       unsigned array_len =
850          glsl_count_attribute_slots(type, false);
851 
852       num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
853    }
854 
855    c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
856 
857    nir_foreach_shader_in_variable(var, c->s) {
858       const struct glsl_type *type = ntt_shader_input_type(c, var);
859       unsigned array_len =
860          glsl_count_attribute_slots(type, false);
861 
862       unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
863       unsigned sample_loc;
864       struct ureg_src decl;
865 
866       if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
867          interpolation =
868             tgsi_get_interp_mode(var->data.interpolation,
869                                  var->data.location == VARYING_SLOT_COL0 ||
870                                  var->data.location == VARYING_SLOT_COL1);
871 
872          if (var->data.location == VARYING_SLOT_POS)
873             interpolation = TGSI_INTERPOLATE_LINEAR;
874       }
875 
876       unsigned semantic_name, semantic_index;
877       ntt_get_gl_varying_semantic(c, var->data.location,
878                                   &semantic_name, &semantic_index);
879 
880       if (var->data.sample) {
881          sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
882       } else if (var->data.centroid) {
883          sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
884          c->centroid_inputs |= (BITSET_MASK(array_len) <<
885                                 var->data.driver_location);
886       } else {
887          sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
888       }
889 
890       unsigned array_id = 0;
891       if (glsl_type_is_array(type))
892          array_id = ++num_input_arrays;
893 
894       uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
895 
896       decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
897                                                 semantic_name,
898                                                 semantic_index,
899                                                 interpolation,
900                                                 sample_loc,
901                                                 var->data.driver_location,
902                                                 usage_mask,
903                                                 array_id, array_len);
904 
905       if (semantic_name == TGSI_SEMANTIC_FACE) {
906          struct ureg_dst temp = ntt_temp(c);
907          if (c->native_integers) {
908             /* NIR is ~0 front and 0 back, while TGSI is +1 front */
909             ntt_SGE(c, temp, decl, ureg_imm1f(c->ureg, 0));
910          } else {
911             /* tgsi docs say that floating point FACE will be positive for
912              * frontface and negative for backface, but realistically
913              * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
914              * Copy that behavior, since some drivers (r300) have been doing a
915              * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
916              * front face).
917              */
918             temp.Saturate = true;
919             ntt_MOV(c, temp, decl);
920 
921          }
922          decl = ureg_src(temp);
923       }
924 
925       for (unsigned i = 0; i < array_len; i++) {
926          c->input_index_map[var->data.driver_location + i] = decl;
927          c->input_index_map[var->data.driver_location + i].Index += i;
928       }
929    }
930 }
931 
932 static int
933 ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
934 {
935    return a->data.location - b->data.location;
936 }
937 
938 /**
939  * Workaround for virglrenderer requiring that TGSI FS output color variables
940  * are declared in order.  Besides, it's a lot nicer to read the TGSI this way.
941  */
942 static void
943 ntt_setup_outputs(struct ntt_compile *c)
944 {
945    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
946       return;
947 
948    nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);
949 
950    nir_foreach_shader_out_variable(var, c->s) {
951       if (var->data.location == FRAG_RESULT_COLOR)
952          ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
953 
954       unsigned semantic_name, semantic_index;
955       tgsi_get_gl_frag_result_semantic(var->data.location,
956                                        &semantic_name, &semantic_index);
957 
958       (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
959    }
960 }
961 
962 static enum tgsi_texture_type
963 tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
964 {
965    switch (dim) {
966    case GLSL_SAMPLER_DIM_1D:
967       if (is_shadow)
968          return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
969       else
970          return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
971    case GLSL_SAMPLER_DIM_2D:
972    case GLSL_SAMPLER_DIM_EXTERNAL:
973       if (is_shadow)
974          return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
975       else
976          return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
977    case GLSL_SAMPLER_DIM_3D:
978       return TGSI_TEXTURE_3D;
979    case GLSL_SAMPLER_DIM_CUBE:
980       if (is_shadow)
981          return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
982       else
983          return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
984    case GLSL_SAMPLER_DIM_RECT:
985       if (is_shadow)
986          return TGSI_TEXTURE_SHADOWRECT;
987       else
988          return TGSI_TEXTURE_RECT;
989    case GLSL_SAMPLER_DIM_MS:
990       return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
991    case GLSL_SAMPLER_DIM_BUF:
992       return TGSI_TEXTURE_BUFFER;
993    default:
994       unreachable("unknown sampler dim");
995    }
996 }
997 
998 static enum tgsi_return_type
999 tgsi_return_type_from_base_type(enum glsl_base_type type)
1000 {
1001    switch (type) {
1002    case GLSL_TYPE_INT:
1003       return TGSI_RETURN_TYPE_SINT;
1004    case GLSL_TYPE_UINT:
1005       return TGSI_RETURN_TYPE_UINT;
1006    case GLSL_TYPE_FLOAT:
1007      return TGSI_RETURN_TYPE_FLOAT;
1008    default:
1009       unreachable("unexpected texture type");
1010    }
1011 }
1012 
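/* Declares TGSI sampler views/samplers, images, HW atomic counters, constant
 * buffers (UBOs), and SSBOs from the NIR shader's variables.
 */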
1013 static void
1014 ntt_setup_uniforms(struct ntt_compile *c)
1015 {
1016    nir_foreach_uniform_variable(var, c->s) {
1017       if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
1018           glsl_type_is_texture(glsl_without_array(var->type))) {
1019          /* Don't use this size for the check for samplers -- arrays of structs
1020           * containing samplers should be ignored, and just the separate lowered
1021           * sampler uniform decl used.
1022           */
1023          int size = glsl_type_get_sampler_count(var->type) +
1024                     glsl_type_get_texture_count(var->type);
1025 
1026          const struct glsl_type *stype = glsl_without_array(var->type);
1027          enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
1028                                                                             glsl_sampler_type_is_array(stype),
1029                                                                             glsl_sampler_type_is_shadow(stype));
1030          enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
1031          for (int i = 0; i < size; i++) {
1032             ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
1033                target, ret_type, ret_type, ret_type, ret_type);
1034             ureg_DECL_sampler(c->ureg, var->data.binding + i);
1035          }
1036       } else if (glsl_contains_atomic(var->type)) {
1037          uint32_t offset = var->data.offset / 4;
1038          uint32_t size = glsl_atomic_size(var->type) / 4;
1039          ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
1040       }
1041 
1042       /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
1043        * size declaration happens with other UBOs below.
1044        */
1045    }
1046 
1047    nir_foreach_image_variable(var, c->s) {
1048       int image_count = glsl_type_get_image_count(var->type);
1049       const struct glsl_type *itype = glsl_without_array(var->type);
1050       enum tgsi_texture_type tex_type =
1051             tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
1052                                                glsl_sampler_type_is_array(itype), false);
1053 
1054       for (int i = 0; i < image_count; i++) {
1055          c->images[var->data.binding] = ureg_DECL_image(c->ureg,
1056                                                         var->data.binding + i,
1057                                                         tex_type,
1058                                                         var->data.image.format,
1059                                                         !(var->data.access & ACCESS_NON_WRITEABLE),
1060                                                         false);
1061       }
1062    }
1063 
1064    c->first_ubo = ~0;
1065 
1066    unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
1067    nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
1068       int ubo = var->data.driver_location;
1069       if (ubo == -1)
1070          continue;
1071 
1072       if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
1073          c->first_ubo = MIN2(c->first_ubo, ubo);
1074 
1075       unsigned size = glsl_get_explicit_size(var->interface_type, false);
1076 
1077       int array_size = 1;
1078       if (glsl_type_is_interface(glsl_without_array(var->type)))
1079          array_size = MAX2(1, glsl_get_aoa_size(var->type));
1080 
1081       for (int i = 0; i < array_size; i++) {
1082          /* Even if multiple NIR variables are in the same uniform block, their
1083           * explicit size is the size of the block.
1084           */
1085          if (ubo_sizes[ubo + i])
1086             assert(ubo_sizes[ubo + i] == size);
1087 
1088          ubo_sizes[ubo + i] = size;
1089       }
1090    }
1091 
1092    for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
1093       if (ubo_sizes[i])
1094          ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
1095    }
1096 
1097    if (c->options->lower_ssbo_bindings) {
1098       c->first_ssbo = 255;
1099       nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ssbo) {
1100          if (c->first_ssbo > var->data.binding)
1101             c->first_ssbo = var->data.binding;
1102       }
1103    } else
1104       c->first_ssbo = 0;
1105 
1106    /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
1107     * counters
1108     */
1109    bool atomic = false;
1110    for (int i = 0; i < c->s->info.num_ssbos; ++i)
1111       ureg_DECL_buffer(c->ureg, c->first_ssbo + i, atomic);
1112 
1113 }
1114 
1115 static void
1116 ntt_setup_registers(struct ntt_compile *c)
1117 {
1118    assert(c->num_temps == 0);
1119 
1120    nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
1121       /* Permanently allocate all the array regs at the start. */
1122       unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
1123       unsigned index = nir_reg->def.index;
1124 
1125       if (num_array_elems != 0) {
1126          struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, num_array_elems, true);
1127          c->reg_temp[index] = decl;
1128          assert(c->num_temps == decl.Index);
1129          c->num_temps += num_array_elems;
1130       }
1131    }
1132    c->first_non_array_temp = c->num_temps;
1133 
1134    /* After that, allocate non-array regs in our virtual space that we'll
1135     * register-allocate before ureg emit.
1136     */
1137    nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
1138       unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
1139       unsigned num_components = nir_intrinsic_num_components(nir_reg);
1140       unsigned bit_size = nir_intrinsic_bit_size(nir_reg);
1141       unsigned index = nir_reg->def.index;
1142 
1143       /* We already handled arrays */
1144       if (num_array_elems == 0) {
1145          struct ureg_dst decl;
1146          uint32_t write_mask = BITFIELD_MASK(num_components);
1147 
1148          if (!ntt_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) {
1149             if (bit_size == 64) {
1150                if (num_components > 2) {
1151                   fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
1152                           num_components, index);
1153                }
1154 
1155                write_mask = ntt_64bit_write_mask(write_mask);
1156             }
1157 
1158             decl = ureg_writemask(ntt_temp(c), write_mask);
1159          }
1160          c->reg_temp[index] = decl;
1161       }
1162    }
1163 }
1164 
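/* Turns a nir_load_const into a TGSI immediate: float-typed for
 * non-native-integer drivers, otherwise uint, with 64-bit values split into
 * lo/hi 32-bit pairs.
 */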
1165 static struct ureg_src
1166 ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
1167 {
1168    int num_components = instr->def.num_components;
1169 
1170    if (!c->native_integers) {
1171       float values[4];
1172       assert(instr->def.bit_size == 32);
1173       for (int i = 0; i < num_components; i++)
1174          values[i] = uif(instr->value[i].u32);
1175 
1176       return ureg_DECL_immediate(c->ureg, values, num_components);
1177    } else {
1178       uint32_t values[4];
1179 
1180       if (instr->def.bit_size == 32) {
1181          for (int i = 0; i < num_components; i++)
1182             values[i] = instr->value[i].u32;
1183       } else {
1184          assert(num_components <= 2);
1185          for (int i = 0; i < num_components; i++) {
1186             values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
1187             values[i * 2 + 1] = instr->value[i].u64 >> 32;
1188          }
1189          num_components *= 2;
1190       }
1191 
1192       return ureg_DECL_immediate_uint(c->ureg, values, num_components);
1193    }
1194 }
1195 
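/* Loads an index into one of the TGSI address registers (declared on first
 * use) and returns its .x for use in relative addressing.
 */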
1196 static struct ureg_src
1197 ntt_reladdr(struct ntt_compile *c, struct ureg_src addr, int addr_index)
1198 {
1199    assert(addr_index < ARRAY_SIZE(c->addr_reg));
1200 
1201    for (int i = 0; i <= addr_index; i++) {
1202       if (!c->addr_declared[i]) {
1203          c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
1204                                              TGSI_WRITEMASK_X);
1205          c->addr_declared[i] = true;
1206       }
1207    }
1208 
1209    if (c->native_integers)
1210       ntt_UARL(c, c->addr_reg[addr_index], addr);
1211    else
1212       ntt_ARL(c, c->addr_reg[addr_index], addr);
1213    return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
1214 }
1215 
1216 /* Forward declare for recursion with indirects */
1217 static struct ureg_src
1218 ntt_get_src(struct ntt_compile *c, nir_src src);
1219 
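/* Resolves a chased legacy src to a ureg_src: load_consts become immediates,
 * other SSA defs come from ssa_temp[], and registers come from reg_temp[]
 * with their base offset and optional indirect applied.
 */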
1220 static struct ureg_src
1221 ntt_get_chased_src(struct ntt_compile *c, nir_legacy_src *src)
1222 {
1223    if (src->is_ssa) {
1224       if (src->ssa->parent_instr->type == nir_instr_type_load_const)
1225          return ntt_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr));
1226 
1227       return c->ssa_temp[src->ssa->index];
1228    } else {
1229       struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
1230       reg_temp.Index += src->reg.base_offset;
1231 
1232       if (src->reg.indirect) {
1233          struct ureg_src offset = ntt_get_src(c, nir_src_for_ssa(src->reg.indirect));
1234          return ureg_src_indirect(ureg_src(reg_temp),
1235                                   ntt_reladdr(c, offset, 0));
1236       } else {
1237          return ureg_src(reg_temp);
1238       }
1239    }
1240 }
1241 
1242 static struct ureg_src
1243 ntt_get_src(struct ntt_compile *c, nir_src src)
1244 {
1245    nir_legacy_src chased = nir_legacy_chase_src(&src);
1246    return ntt_get_chased_src(c, &chased);
1247 }
1248 
1249 static struct ureg_src
1250 ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
1251 {
1252    /* We only support 32-bit float modifiers.  The only other modifier type
1253     * officially supported by TGSI is 32-bit integer negates, but even those are
1254     * broken on virglrenderer, so skip lowering all integer and f64 float mods.
1255     *
1256     * The options->lower_fabs requests that we not have native source modifiers
1257     * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
1258     */
1259    nir_legacy_alu_src src =
1260       nir_legacy_chase_alu_src(&instr->src[i], !c->options->lower_fabs);
1261    struct ureg_src usrc = ntt_get_chased_src(c, &src.src);
1262 
1263    /* Expand double/dvec2 src references to TGSI swizzles using a pair of 32-bit
1264     * channels.  We skip this for undefs, as those don't get split to vec2s (but
1265     * the specific swizzles from an undef don't matter)
1266     */
1267    if (nir_src_bit_size(instr->src[i].src) == 64 &&
1268       !(src.src.is_ssa && src.src.ssa->parent_instr->type == nir_instr_type_undef)) {
1269       int chan1 = 1;
1270       if (nir_op_infos[instr->op].input_sizes[i] == 0) {
1271          chan1 = instr->def.num_components > 1 ? 1 : 0;
1272       }
1273       usrc = ureg_swizzle(usrc,
1274                           src.swizzle[0] * 2,
1275                           src.swizzle[0] * 2 + 1,
1276                           src.swizzle[chan1] * 2,
1277                           src.swizzle[chan1] * 2 + 1);
1278    } else {
1279       usrc = ureg_swizzle(usrc,
1280                           src.swizzle[0],
1281                           src.swizzle[1],
1282                           src.swizzle[2],
1283                           src.swizzle[3]);
1284    }
1285 
1286    if (src.fabs)
1287       usrc = ureg_abs(usrc);
1288    if (src.fneg)
1289       usrc = ureg_negate(usrc);
1290 
1291    return usrc;
1292 }
1293 
1294 /* Reswizzles a source so that the unset channels in the write mask still refer
1295  * to one of the channels present in the write mask.
1296  */
1297 static struct ureg_src
1298 ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
1299 {
1300    assert(write_mask);
1301    int first_chan = ffs(write_mask) - 1;
1302    return ureg_swizzle(src,
1303                        (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
1304                        (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
1305                        (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
1306                        (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
1307 }
1308 
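/* Picks the destination for an SSA def (a TGSI output when possible,
 * otherwise a fresh temp), records the readable swizzled ureg_src in
 * ssa_temp[], and returns the dst with the def's writemask applied.
 */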
1309 static struct ureg_dst
1310 ntt_get_ssa_def_decl(struct ntt_compile *c, nir_def *ssa)
1311 {
1312    uint32_t writemask = BITSET_MASK(ssa->num_components);
1313    if (ssa->bit_size == 64)
1314       writemask = ntt_64bit_write_mask(writemask);
1315 
1316    struct ureg_dst dst;
1317    if (!ntt_try_store_ssa_in_tgsi_output(c, &dst, ssa))
1318       dst = ntt_temp(c);
1319 
1320    c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);
1321 
1322    return ureg_writemask(dst, writemask);
1323 }
1324 
1325 static struct ureg_dst
1326 ntt_get_chased_dest_decl(struct ntt_compile *c, nir_legacy_dest *dest)
1327 {
1328    if (dest->is_ssa)
1329       return ntt_get_ssa_def_decl(c, dest->ssa);
1330    else
1331       return c->reg_temp[dest->reg.handle->index];
1332 }
1333 
1334 static struct ureg_dst
1335 ntt_get_chased_dest(struct ntt_compile *c, nir_legacy_dest *dest)
1336 {
1337    struct ureg_dst dst = ntt_get_chased_dest_decl(c, dest);
1338 
1339    if (!dest->is_ssa) {
1340       dst.Index += dest->reg.base_offset;
1341 
1342       if (dest->reg.indirect) {
1343          struct ureg_src offset = ntt_get_src(c, nir_src_for_ssa(dest->reg.indirect));
1344          dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset, 0));
1345       }
1346    }
1347 
1348    return dst;
1349 }
1350 
1351 static struct ureg_dst
1352 ntt_get_dest(struct ntt_compile *c, nir_def *def)
1353 {
1354    nir_legacy_dest chased = nir_legacy_chase_dest(def);
1355    return ntt_get_chased_dest(c, &chased);
1356 }
1357 
1358 static struct ureg_dst
1359 ntt_get_alu_dest(struct ntt_compile *c, nir_def *def)
1360 {
1361    nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def);
1362    struct ureg_dst dst = ntt_get_chased_dest(c, &chased.dest);
1363 
1364    if (chased.fsat)
1365       dst.Saturate = true;
1366 
1367    /* Only registers get write masks */
1368    if (chased.dest.is_ssa)
1369       return dst;
1370 
1371    int dst_64 = def->bit_size == 64;
1372    unsigned write_mask = chased.write_mask;
1373 
1374    if (dst_64)
1375       return ureg_writemask(dst, ntt_64bit_write_mask(write_mask));
1376    else
1377       return ureg_writemask(dst, write_mask);
1378 }
1379 
1380 /* For an SSA dest being populated by a constant src, replace the storage with
1381  * a copy of the ureg_src.
1382  */
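/* For example (sketch): an SSA def fed by a plain, non-indirect immediate,
 * input, constant-file, or system-value read just records that ureg_src as
 * its storage, so later uses read the file directly and no MOV is emitted.
 */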
1383 static void
1384 ntt_store_def(struct ntt_compile *c, nir_def *def, struct ureg_src src)
1385 {
1386    if (!src.Indirect && !src.DimIndirect) {
1387       switch (src.File) {
1388       case TGSI_FILE_IMMEDIATE:
1389       case TGSI_FILE_INPUT:
1390       case TGSI_FILE_CONSTANT:
1391       case TGSI_FILE_SYSTEM_VALUE:
1392          c->ssa_temp[def->index] = src;
1393          return;
1394       }
1395    }
1396 
1397    ntt_MOV(c, ntt_get_ssa_def_decl(c, def), src);
1398 }
1399 
1400 static void
1401 ntt_store(struct ntt_compile *c, nir_def *def, struct ureg_src src)
1402 {
1403    nir_legacy_dest chased = nir_legacy_chase_dest(def);
1404 
1405    if (chased.is_ssa)
1406       ntt_store_def(c, chased.ssa, src);
1407    else {
1408       struct ureg_dst dst = ntt_get_chased_dest(c, &chased);
1409       ntt_MOV(c, dst, src);
1410    }
1411 }
1412 
1413 static void
1414 ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
1415                 struct ureg_dst dst,
1416                 struct ureg_src src0,
1417                 struct ureg_src src1)
1418 {
1419    unsigned i;
1420 
1421    /* POW is the only 2-operand scalar op. */
1422    if (tgsi_op != TGSI_OPCODE_POW)
1423       src1 = src0;
1424 
1425    for (i = 0; i < 4; i++) {
1426       if (dst.WriteMask & (1 << i)) {
1427          ntt_insn(c, tgsi_op,
1428                   ureg_writemask(dst, 1 << i),
1429                   ureg_scalar(src0, i),
1430                   ureg_scalar(src1, i),
1431                   ureg_src_undef(), ureg_src_undef());
1432       }
1433    }
1434 }
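/* Illustration (sketch): with dst.WriteMask == TGSI_WRITEMASK_XZ and
 * TGSI_OPCODE_RCP, this expands to
 *
 *    RCP dst.x, src0.xxxx
 *    RCP dst.z, src0.zzzz
 *
 * matching TGSI's scalar-op convention of one replicated source channel per
 * written channel.
 */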
1435 
1436 static void
1437 ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
1438 {
1439    struct ureg_src src[4];
1440    struct ureg_dst dst;
1441    unsigned i;
1442    int dst_64 = instr->def.bit_size == 64;
1443    int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
1444    int num_srcs = nir_op_infos[instr->op].num_inputs;
1445 
1446    /* Don't try to translate a folded fsat, since its source won't be valid. */
1447    if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr))
1448       return;
1449 
1450    c->precise = instr->exact;
1451 
1452    assert(num_srcs <= ARRAY_SIZE(src));
1453    for (i = 0; i < num_srcs; i++)
1454       src[i] = ntt_get_alu_src(c, instr, i);
1455    for (; i < ARRAY_SIZE(src); i++)
1456       src[i] = ureg_src_undef();
1457 
1458    dst = ntt_get_alu_dest(c, &instr->def);
1459 
1460    static enum tgsi_opcode op_map[][2] = {
1461       [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
1462 
1463       /* fabs/fneg 32-bit are special-cased below. */
1464       [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
1465       [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },
1466 
1467       [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
1468       [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
1469       [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
1470       [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 },
1471       [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 },
1472       [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 },
1473       [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
1474       [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
1475       [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
1476       [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
1477       [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
1478       [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
1479       [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },
1480 
1481       [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
1482       [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
1483       [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },
1484 
1485       /* The conversions will have one combination of src and dst bitsize. */
1486       [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
1487       [nir_op_f2f64] = { TGSI_OPCODE_F2D },
1488       [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },
1489 
1490       [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
1491       [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
1492       [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
1493       [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
1494       [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
1495       [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
1496       [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
1497       [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },
1498 
1499       [nir_op_slt] = { TGSI_OPCODE_SLT },
1500       [nir_op_sge] = { TGSI_OPCODE_SGE },
1501       [nir_op_seq] = { TGSI_OPCODE_SEQ },
1502       [nir_op_sne] = { TGSI_OPCODE_SNE },
1503 
1504       [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
1505       [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
1506       [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
1507       [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },
1508 
1509       [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
1510       [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
1511       [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
1512       [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },
1513 
1514       [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
1515       [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },
1516 
1517       [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
1518       [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
1519       [nir_op_fsign] = { TGSI_OPCODE_SSG, TGSI_OPCODE_DSSG },
1520       [nir_op_isign] = { TGSI_OPCODE_ISSG, TGSI_OPCODE_I64SSG },
1521       [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
1522       [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
1523       [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
1524       [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
1525       [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
1526       [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
1527       [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
1528       [nir_op_bit_count] = { TGSI_OPCODE_POPC },
1529       [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
1530       [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
1531       [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
1532       [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
1533       [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
1534       [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
1535       [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
1536       [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
1537       [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
1538       [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
1539       [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
1540       [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
1541       [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
1542       [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
1543 
1544       /* These bitwise ops don't care about 32 vs 64 types, so they have the
1545        * same TGSI op.
1546        */
1547       [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
1548       [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
1549       [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
1550       [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
1551 
1552       [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
1553       [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
1554       [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
1555       [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
1556       [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
1557       [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
1558       [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
1559       [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
1560    };
1561 
1562    if (src_64 && !dst_64) {
1563       if (num_srcs == 2 || nir_op_infos[instr->op].output_type == nir_type_bool32) {
1564          /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
1565          * of .xy.
1566          */
1567          assert(!(dst.WriteMask & TGSI_WRITEMASK_YW));
1568       } else {
1569          /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
1570          * channels and will need to get fixed up.
1571          */
1572         assert(!(dst.WriteMask & TGSI_WRITEMASK_ZW));
1573       }
1574    }
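   /* Example of the compare case (sketch): DSLT on a dvec2 leaves its two
    * 32-bit results in .x and .z rather than .xy, hence the YW assertion
    * above.
    */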
1575 
1576    bool table_op64 = src_64;
1577    if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
1578       /* The normal path for NIR to TGSI ALU op translation */
1579       ntt_insn(c, op_map[instr->op][table_op64],
1580                 dst, src[0], src[1], src[2], src[3]);
1581    } else {
1582       /* Special cases for NIR to TGSI ALU op translation. */
1583 
1584       /* TODO: Use something like the ntt_store() path for the MOV calls so we
1585        * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
1586        */
1587 
1588       switch (instr->op) {
1589       case nir_op_u2u64:
1590          ntt_AND(c, dst, ureg_swizzle(src[0],
1591                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1592                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1593                   ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
1594          break;
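         /* The .xxyy swizzle spreads each 32-bit source into a 64-bit
          * (low, high) pair and the {~0, 0, ~0, 0} mask clears the high
          * words, so u2u64 of (a, b) zero-extends to (a, 0, b, 0).
          */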
1595 
1596       case nir_op_i2i32:
1597       case nir_op_u2u32:
1598          assert(src_64);
1599          ntt_MOV(c, dst, ureg_swizzle(src[0],
1600                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1601                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
1602          break;
1603 
1604       case nir_op_fabs:
1605          /* Try to eliminate */
1606          if (!c->options->lower_fabs && nir_legacy_float_mod_folds(instr))
1607             break;
1608 
1609          if (c->options->lower_fabs)
1610             ntt_MAX(c, dst, src[0], ureg_negate(src[0]));
1611          else
1612             ntt_MOV(c, dst, ureg_abs(src[0]));
1613          break;
1614 
1615       case nir_op_fsat:
1616          if (dst_64) {
1617             ntt_MIN(c, dst, src[0], ntt_64bit_1f(c));
1618             ntt_MAX(c, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
1619          } else {
1620             ntt_MOV(c, ureg_saturate(dst), src[0]);
1621          }
1622          break;
1623 
1624       case nir_op_fneg:
1625          /* Try to eliminate */
1626          if (nir_legacy_float_mod_folds(instr))
1627             break;
1628 
1629          ntt_MOV(c, dst, ureg_negate(src[0]));
1630          break;
1631 
1632          /* NOTE: TGSI 32-bit math ops have the old "one source channel
1633           * replicated to all dst channels" behavior, while 64 is normal mapping
1634           * of src channels to dst.
1635           */
1636       case nir_op_frcp:
1637          assert(!dst_64);
1638          ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
1639          break;
1640 
1641       case nir_op_frsq:
1642          assert(!dst_64);
1643          ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
1644          break;
1645 
1646       case nir_op_fsqrt:
1647          assert(!dst_64);
1648          ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], ureg_src_undef());
1649          break;
1650 
1651       case nir_op_fexp2:
1652          assert(!dst_64);
1653          ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
1654          break;
1655 
1656       case nir_op_flog2:
1657          assert(!dst_64);
1658          ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
1659          break;
1660 
1661       case nir_op_b2f32:
1662          ntt_AND(c, dst, src[0], ureg_imm1f(c->ureg, 1.0));
1663          break;
1664 
1665       case nir_op_b2f64:
1666          ntt_AND(c, dst,
1667                   ureg_swizzle(src[0],
1668                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1669                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1670                   ntt_64bit_1f(c));
1671          break;
1672 
1673       case nir_op_b2i32:
1674          ntt_AND(c, dst, src[0], ureg_imm1u(c->ureg, 1));
1675          break;
1676 
1677       case nir_op_b2i64:
1678          ntt_AND(c, dst,
1679                   ureg_swizzle(src[0],
1680                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1681                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1682                   ureg_imm4u(c->ureg, 1, 0, 1, 0));
1683          break;
1684 
1685       case nir_op_fsin:
1686          ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
1687          break;
1688 
1689       case nir_op_fcos:
1690          ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
1691          break;
1692 
1693       case nir_op_fsub:
1694          assert(!dst_64);
1695          ntt_ADD(c, dst, src[0], ureg_negate(src[1]));
1696          break;
1697 
1698       case nir_op_isub:
1699          assert(!dst_64);
1700          ntt_UADD(c, dst, src[0], ureg_negate(src[1]));
1701          break;
1702 
1703       case nir_op_fmod:
1704          unreachable("should be handled by .lower_fmod = true");
1705          break;
1706 
1707       case nir_op_fpow:
1708          ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
1709          break;
1710 
1711       case nir_op_flrp:
1712          ntt_LRP(c, dst, src[2], src[1], src[0]);
1713          break;
1714 
1715       case nir_op_pack_64_2x32_split:
1716          ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
1717                   ureg_swizzle(src[0],
1718                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1719                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1720          ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_YW),
1721                   ureg_swizzle(src[1],
1722                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1723                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1724          break;
1725 
1726       case nir_op_unpack_64_2x32_split_x:
1727          ntt_MOV(c, dst, ureg_swizzle(src[0],
1728                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1729                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
1730          break;
1731 
1732       case nir_op_unpack_64_2x32_split_y:
1733          ntt_MOV(c, dst, ureg_swizzle(src[0],
1734                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
1735                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
1736          break;
1737 
1738       case nir_op_b32csel:
1739          if (nir_src_bit_size(instr->src[1].src) == 64) {
1740             ntt_UCMP(c, dst, ureg_swizzle(src[0],
1741                                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1742                                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1743                       src[1], src[2]);
1744          } else {
1745             ntt_UCMP(c, dst, src[0], src[1], src[2]);
1746          }
1747          break;
1748 
1749       case nir_op_fcsel:
1750          /* If CMP isn't supported, then the flags that enable NIR to generate
1751           * this opcode should also not be set.
1752           */
1753          assert(!c->options->lower_cmp);
1754 
1755          /* Implement this as CMP(-abs(src0), src1, src2). */
1756          ntt_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
1757          break;
1758 
1759       case nir_op_fcsel_gt:
1760          /* If CMP isn't supported, then the flags that enable NIR to generate
1761           * these opcodes should also not be set.
1762           */
1763          assert(!c->options->lower_cmp);
1764 
1765          ntt_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
1766          break;
1767 
1768       case nir_op_fcsel_ge:
1769          /* If CMP isn't supported, then the flags that enable NIR to generate
1770           * these opcodes should also not be set.
1771           */
1772          assert(!c->options->lower_cmp);
1773 
1774          /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. */
1775          ntt_CMP(c, dst, src[0], src[2], src[1]);
1776          break;
1777 
1778       case nir_op_frexp_sig:
1779       case nir_op_frexp_exp:
1780          unreachable("covered by nir_lower_frexp()");
1781          break;
1782 
1783       case nir_op_ldexp:
1784          assert(dst_64); /* 32bit handled in table. */
1785          ntt_DLDEXP(c, dst, src[0],
1786                      ureg_swizzle(src[1],
1787                                   TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1788                                   TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1789          break;
1790 
1791       case nir_op_vec4:
1792       case nir_op_vec3:
1793       case nir_op_vec2:
1794          unreachable("covered by nir_lower_vec_to_movs()");
1795 
1796       default:
1797          fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
1798          unreachable("Unknown NIR opcode");
1799       }
1800    }
1801 
1802    c->precise = false;
1803 }
1804 
1805 static struct ureg_src
1806 ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
1807                       nir_src src, int addr_reg)
1808 {
1809    if (nir_src_is_const(src)) {
1810       usrc.Index += ntt_src_as_uint(c, src);
1811       return usrc;
1812    } else {
1813       return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src), addr_reg));
1814    }
1815 }
1816 
1817 static struct ureg_dst
1818 ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
1819                       nir_src src)
1820 {
1821    if (nir_src_is_const(src)) {
1822       dst.Index += ntt_src_as_uint(c, src);
1823       return dst;
1824    } else {
1825       return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src), 0));
1826    }
1827 }
1828 
1829 static struct ureg_src
1830 ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
1831                          nir_src src)
1832 {
1833    if (nir_src_is_const(src)) {
1834       return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
1835    }
1836    else
1837    {
1838       return ureg_src_dimension_indirect(usrc,
1839                                          ntt_reladdr(c, ntt_get_src(c, src), 1),
1840                                          0);
1841    }
1842 }
1843 
1844 static struct ureg_dst
1845 ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
1846                                 nir_src src)
1847 {
1848    if (nir_src_is_const(src)) {
1849       return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
1850    } else {
1851       return ureg_dst_dimension_indirect(udst,
1852                                          ntt_reladdr(c, ntt_get_src(c, src), 1),
1853                                          0);
1854    }
1855 }
1856 /* Some load operations in NIR will have a fractional offset that we need to
1857  * swizzle down before storing to the result register.
1858  */
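/* For example, frac == 2 with a 2-component load yields the swizzle .zwww,
 * moving components 2..3 of the vec4 slot down into .xy of the result.
 */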
1859 static struct ureg_src
1860 ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1861 {
1862    return ureg_swizzle(src,
1863                        frac,
1864                        frac + MIN2(num_components - 1, 1),
1865                        frac + MIN2(num_components - 1, 2),
1866                        frac + MIN2(num_components - 1, 3));
1867 }
1868 
1869 
1870 static void
1871 ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
1872 {
1873    int bit_size = instr->def.bit_size;
1874    assert(bit_size == 32 || instr->num_components <= 2);
1875 
1876    struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1877 
1878    struct ureg_dst addr_temp = ureg_dst_undef();
1879 
1880    if (nir_src_is_const(instr->src[0])) {
1881       src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
1882    } else {
1883       /* virglrenderer requires that indirect UBO references have the UBO
1884        * array's base index in the Index field, not added to the indirect
1885        * address.
1886        *
1887        * Many nir intrinsics have a base address const value for the start of
1888        * their array indirection, but load_ubo doesn't.  We fake it by
1889        * subtracting it off here.
1890        */
1891       addr_temp = ntt_temp(c);
1892       ntt_UADD(c, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1893       src = ureg_src_dimension_indirect(src,
1894                                          ntt_reladdr(c, ureg_src(addr_temp), 1),
1895                                          c->first_ubo);
1896    }
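   /* Rough sketch of the result: with first_ubo == 1 and a dynamic buffer
    * index i, the UADD above computes i - 1 into the address register, and
    * the reference ends up as roughly CONST[ADDR+1][..], keeping the UBO
    * array base in the Index field as virglrenderer expects.
    */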
1897 
1898    if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
1899       /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
1900        * file.
1901        */
1902       src.Index = nir_intrinsic_base(instr);
1903 
1904       if (nir_src_is_const(instr->src[1])) {
1905          src.Index += ntt_src_as_uint(c, instr->src[1]);
1906       } else {
1907          src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1]), 0));
1908       }
1909 
1910       int start_component = nir_intrinsic_component(instr);
1911       if (bit_size == 64)
1912          start_component *= 2;
1913 
1914       src = ntt_shift_by_frac(src, start_component,
1915                               instr->num_components * bit_size / 32);
1916 
1917       ntt_store(c, &instr->def, src);
1918    } else {
1919       /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
1920        * TGSI_OPCODE_LOAD instruction from the const file.
1921        */
1922       struct ntt_insn *insn =
1923          ntt_insn(c, TGSI_OPCODE_LOAD,
1924                   ntt_get_dest(c, &instr->def),
1925                   src, ntt_get_src(c, instr->src[1]),
1926                   ureg_src_undef(), ureg_src_undef());
1927       insn->is_mem = true;
1928       insn->tex_target = 0;
1929       insn->mem_qualifier = 0;
1930       insn->mem_format = 0; /* unused */
1931    }
1932 }
1933 
1934 static unsigned
1935 ntt_get_access_qualifier(nir_intrinsic_instr *instr)
1936 {
1937    enum gl_access_qualifier access = nir_intrinsic_access(instr);
1938    unsigned qualifier = 0;
1939 
1940    if (access & ACCESS_COHERENT)
1941       qualifier |= TGSI_MEMORY_COHERENT;
1942    if (access & ACCESS_VOLATILE)
1943       qualifier |= TGSI_MEMORY_VOLATILE;
1944    if (access & ACCESS_RESTRICT)
1945       qualifier |= TGSI_MEMORY_RESTRICT;
1946 
1947    return qualifier;
1948 }
1949 
1950 static unsigned
1951 ntt_translate_atomic_op(nir_atomic_op op)
1952 {
1953    switch (op) {
1954    case nir_atomic_op_iadd: return TGSI_OPCODE_ATOMUADD;
1955    case nir_atomic_op_fadd: return TGSI_OPCODE_ATOMFADD;
1956    case nir_atomic_op_imin: return TGSI_OPCODE_ATOMIMIN;
1957    case nir_atomic_op_imax: return TGSI_OPCODE_ATOMIMAX;
1958    case nir_atomic_op_umin: return TGSI_OPCODE_ATOMUMIN;
1959    case nir_atomic_op_umax: return TGSI_OPCODE_ATOMUMAX;
1960    case nir_atomic_op_iand: return TGSI_OPCODE_ATOMAND;
1961    case nir_atomic_op_ixor: return TGSI_OPCODE_ATOMXOR;
1962    case nir_atomic_op_ior:  return TGSI_OPCODE_ATOMOR;
1963    case nir_atomic_op_xchg: return TGSI_OPCODE_ATOMXCHG;
1964    default: unreachable("invalid atomic");
1965    }
1966 }
1967 
1968 static void
1969 ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
1970              nir_variable_mode mode)
1971 {
1972    bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1973                     instr->intrinsic == nir_intrinsic_store_shared);
1974    bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
1975                     instr->intrinsic == nir_intrinsic_load_ssbo ||
1976                     instr->intrinsic == nir_intrinsic_load_shared);
1977    unsigned opcode;
1978    struct ureg_src src[4];
1979    int num_src = 0;
1980    int next_src;
1981    struct ureg_dst addr_temp = ureg_dst_undef();
1982 
1983    struct ureg_src memory;
1984    switch (mode) {
1985    case nir_var_mem_ssbo:
1986       memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER,
1987                                                           c->first_ssbo),
1988                                      instr->src[is_store ? 1 : 0], 2);
1989       next_src = 1;
1990       break;
1991    case nir_var_mem_shared:
1992       memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
1993       next_src = 0;
1994       break;
1995    case nir_var_uniform: { /* HW atomic buffers */
1996       nir_src src = instr->src[0];
1997       uint32_t offset = (ntt_extract_const_src_offset(&src) +
1998                          nir_intrinsic_range_base(instr)) / 4;
1999 
2000       memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, offset);
2001       /* ntt_ureg_src_indirect, except dividing by 4 */
2002       if (nir_src_is_const(src)) {
2003          memory.Index += nir_src_as_uint(src) / 4;
2004       } else {
2005          addr_temp = ntt_temp(c);
2006          ntt_USHR(c, addr_temp, ntt_get_src(c, src), ureg_imm1i(c->ureg, 2));
2007          memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp), 2));
2008       }
2009       memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
2010       next_src = 0;
2011       break;
2012    }
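   /* Sketch: a counter at byte offset 8 in the binding given by
    * nir_intrinsic_base() lands at index 2 of the HW_ATOMIC file for that
    * binding, since each counter occupies 4 bytes; non-constant offsets take
    * the USHR-by-2 path into an address register instead.
    */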
2013 
2014    default:
2015       unreachable("unknown memory type");
2016    }
2017 
2018    if (is_store) {
2019       src[num_src++] = ntt_get_src(c, instr->src[next_src + 1]); /* offset */
2020       src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
2021    } else {
2022       src[num_src++] = memory;
2023       if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2024          src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* offset */
2025          switch (instr->intrinsic) {
2026          case nir_intrinsic_atomic_counter_inc:
2027             src[num_src++] = ureg_imm1i(c->ureg, 1);
2028             break;
2029          case nir_intrinsic_atomic_counter_post_dec:
2030             src[num_src++] = ureg_imm1i(c->ureg, -1);
2031             break;
2032          default:
2033             if (!is_load)
2034                src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* value */
2035             break;
2036          }
2037       }
2038    }
2039 
2040 
2041    switch (instr->intrinsic) {
2042    case nir_intrinsic_ssbo_atomic:
2043    case nir_intrinsic_shared_atomic:
2044       opcode = ntt_translate_atomic_op(nir_intrinsic_atomic_op(instr));
2045       break;
2046    case nir_intrinsic_atomic_counter_add:
2047    case nir_intrinsic_atomic_counter_inc:
2048    case nir_intrinsic_atomic_counter_post_dec:
2049       opcode = TGSI_OPCODE_ATOMUADD;
2050       break;
2051    case nir_intrinsic_atomic_counter_min:
2052       opcode = TGSI_OPCODE_ATOMIMIN;
2053       break;
2054    case nir_intrinsic_atomic_counter_max:
2055       opcode = TGSI_OPCODE_ATOMIMAX;
2056       break;
2057    case nir_intrinsic_atomic_counter_and:
2058       opcode = TGSI_OPCODE_ATOMAND;
2059       break;
2060    case nir_intrinsic_atomic_counter_or:
2061       opcode = TGSI_OPCODE_ATOMOR;
2062       break;
2063    case nir_intrinsic_atomic_counter_xor:
2064       opcode = TGSI_OPCODE_ATOMXOR;
2065       break;
2066    case nir_intrinsic_atomic_counter_exchange:
2067       opcode = TGSI_OPCODE_ATOMXCHG;
2068       break;
2069    case nir_intrinsic_atomic_counter_comp_swap:
2070    case nir_intrinsic_ssbo_atomic_swap:
2071    case nir_intrinsic_shared_atomic_swap:
2072       opcode = TGSI_OPCODE_ATOMCAS;
2073       src[num_src++] = ntt_get_src(c, instr->src[next_src++]);
2074       break;
2075    case nir_intrinsic_atomic_counter_read:
2076    case nir_intrinsic_load_ssbo:
2077    case nir_intrinsic_load_shared:
2078       opcode = TGSI_OPCODE_LOAD;
2079       break;
2080    case nir_intrinsic_store_ssbo:
2081    case nir_intrinsic_store_shared:
2082       opcode = TGSI_OPCODE_STORE;
2083       break;
2084    case nir_intrinsic_get_ssbo_size:
2085       opcode = TGSI_OPCODE_RESQ;
2086       break;
2087    default:
2088       unreachable("unknown memory op");
2089    }
2090 
2091    unsigned qualifier = 0;
2092    if (mode == nir_var_mem_ssbo &&
2093        instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2094       qualifier = ntt_get_access_qualifier(instr);
2095    }
2096 
2097    struct ureg_dst dst;
2098    if (is_store) {
2099       dst = ureg_dst(memory);
2100 
2101       unsigned write_mask = nir_intrinsic_write_mask(instr);
2102       if (nir_src_bit_size(instr->src[0]) == 64)
2103          write_mask = ntt_64bit_write_mask(write_mask);
2104       dst = ureg_writemask(dst, write_mask);
2105    } else {
2106       dst = ntt_get_dest(c, &instr->def);
2107    }
2108 
2109    struct ntt_insn *insn = ntt_insn(c, opcode, dst, src[0], src[1], src[2], src[3]);
2110    insn->tex_target = TGSI_TEXTURE_BUFFER;
2111    insn->mem_qualifier = qualifier;
2112    insn->mem_format = 0; /* unused */
2113    insn->is_mem = true;
2114 }
2115 
2116 static void
2117 ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
2118 {
2119    unsigned op;
2120    struct ureg_src srcs[4];
2121    int num_src = 0;
2122    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
2123    bool is_array = nir_intrinsic_image_array(instr);
2124 
2125    struct ureg_dst temp = ureg_dst_undef();
2126 
2127    enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);
2128 
2129    struct ureg_src resource;
2130    switch (instr->intrinsic) {
2131    case nir_intrinsic_bindless_image_load:
2132    case nir_intrinsic_bindless_image_store:
2133    case nir_intrinsic_bindless_image_size:
2134    case nir_intrinsic_bindless_image_samples:
2135    case nir_intrinsic_bindless_image_atomic:
2136    case nir_intrinsic_bindless_image_atomic_swap:
2137       resource = ntt_get_src(c, instr->src[0]);
2138       break;
2139    default:
2140       resource = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
2141                                        instr->src[0], 2);
2142       resource.Index += nir_intrinsic_range_base(instr);
2143    }
2144 
2145    struct ureg_dst dst;
2146    if (instr->intrinsic == nir_intrinsic_image_store ||
2147        instr->intrinsic == nir_intrinsic_bindless_image_store) {
2148       dst = ureg_dst(resource);
2149    } else {
2150       srcs[num_src++] = resource;
2151       dst = ntt_get_dest(c, &instr->def);
2152    }
2153    struct ureg_dst opcode_dst = dst;
2154 
2155    if (instr->intrinsic != nir_intrinsic_image_size &&
2156        instr->intrinsic != nir_intrinsic_image_samples &&
2157        instr->intrinsic != nir_intrinsic_bindless_image_size &&
2158        instr->intrinsic != nir_intrinsic_bindless_image_samples) {
2159       struct ureg_src coord = ntt_get_src(c, instr->src[1]);
2160 
2161       if (dim == GLSL_SAMPLER_DIM_MS) {
2162          temp = ntt_temp(c);
2163          ntt_MOV(c, temp, coord);
2164          ntt_MOV(c, ureg_writemask(temp, TGSI_WRITEMASK_W),
2165                   ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
2166          coord = ureg_src(temp);
2167       }
2168       srcs[num_src++] = coord;
2169 
2170       if (instr->intrinsic != nir_intrinsic_image_load &&
2171           instr->intrinsic != nir_intrinsic_bindless_image_load) {
2172          srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
2173          if (instr->intrinsic == nir_intrinsic_image_atomic_swap ||
2174              instr->intrinsic == nir_intrinsic_bindless_image_atomic_swap)
2175             srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
2176       }
2177    }
2178 
2179    switch (instr->intrinsic) {
2180    case nir_intrinsic_image_load:
2181    case nir_intrinsic_bindless_image_load:
2182       op = TGSI_OPCODE_LOAD;
2183       break;
2184    case nir_intrinsic_image_store:
2185    case nir_intrinsic_bindless_image_store:
2186       op = TGSI_OPCODE_STORE;
2187       break;
2188    case nir_intrinsic_image_size:
2189    case nir_intrinsic_bindless_image_size:
2190       op = TGSI_OPCODE_RESQ;
2191       break;
2192    case nir_intrinsic_image_samples:
2193    case nir_intrinsic_bindless_image_samples:
2194       op = TGSI_OPCODE_RESQ;
2195       opcode_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2196       break;
2197    case nir_intrinsic_image_atomic:
2198    case nir_intrinsic_bindless_image_atomic:
2199       op = ntt_translate_atomic_op(nir_intrinsic_atomic_op(instr));
2200       break;
2201    case nir_intrinsic_image_atomic_swap:
2202    case nir_intrinsic_bindless_image_atomic_swap:
2203       op = TGSI_OPCODE_ATOMCAS;
2204       break;
2205    default:
2206       unreachable("bad op");
2207    }
2208 
2209    struct ntt_insn *insn = ntt_insn(c, op, opcode_dst, srcs[0], srcs[1], srcs[2], srcs[3]);
2210    insn->tex_target = target;
2211    insn->mem_qualifier = ntt_get_access_qualifier(instr);
2212    insn->mem_format = nir_intrinsic_format(instr);
2213    insn->is_mem = true;
2214 
2215    if (instr->intrinsic == nir_intrinsic_image_samples ||
2216        instr->intrinsic == nir_intrinsic_bindless_image_samples)
2217       ntt_MOV(c, dst, ureg_scalar(ureg_src(opcode_dst), 3));
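      /* RESQ reports the MSAA sample count in .w, which is why image_samples
       * queries into a scratch temp and this broadcast copies .w into the
       * real destination.
       */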
2218 }
2219 
2220 static void
2221 ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
2222 {
2223    uint32_t frac = nir_intrinsic_component(instr);
2224    uint32_t num_components = instr->num_components;
2225    unsigned base = nir_intrinsic_base(instr);
2226    struct ureg_src input;
2227    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2228    bool is_64 = instr->def.bit_size == 64;
2229 
2230    if (c->s->info.stage == MESA_SHADER_VERTEX) {
2231       input = ureg_DECL_vs_input(c->ureg, base);
2232       for (int i = 1; i < semantics.num_slots; i++)
2233          ureg_DECL_vs_input(c->ureg, base + i);
2234    } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2235       unsigned semantic_name, semantic_index;
2236       ntt_get_gl_varying_semantic(c, semantics.location,
2237                                   &semantic_name, &semantic_index);
2238 
2239       /* XXX: ArrayID is used in r600 gs inputs */
2240       uint32_t array_id = 0;
2241 
2242       input = ureg_DECL_input_layout(c->ureg,
2243                                      semantic_name,
2244                                      semantic_index,
2245                                      base,
2246                                      ntt_tgsi_usage_mask(frac,
2247                                                          instr->num_components,
2248                                                          is_64),
2249                                      array_id,
2250                                      semantics.num_slots);
2251    } else {
2252       input = c->input_index_map[base];
2253    }
2254 
2255    if (is_64)
2256       num_components *= 2;
2257 
2258    input = ntt_shift_by_frac(input, frac, num_components);
2259 
2260    switch (instr->intrinsic) {
2261    case nir_intrinsic_load_input:
2262       input = ntt_ureg_src_indirect(c, input, instr->src[0], 0);
2263       ntt_store(c, &instr->def, input);
2264       break;
2265 
2266    case nir_intrinsic_load_per_vertex_input:
2267       input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2268       input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
2269       ntt_store(c, &instr->def, input);
2270       break;
2271 
2272    case nir_intrinsic_load_interpolated_input: {
2273       input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2274 
2275       nir_intrinsic_instr *bary_instr =
2276          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
2277 
2278       switch (bary_instr->intrinsic) {
2279       case nir_intrinsic_load_barycentric_pixel:
2280       case nir_intrinsic_load_barycentric_sample:
2281          /* For these, we know that the barycentric load matches the
2282           * interpolation on the input declaration, so we can use it directly.
2283           */
2284          ntt_store(c, &instr->def, input);
2285          break;
2286 
2287       case nir_intrinsic_load_barycentric_centroid:
2288          /* If the input was declared centroid, then there's no need to
2289           * emit the extra TGSI interp instruction, we can just read the
2290           * input.
2291           */
2292          if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
2293             ntt_store(c, &instr->def, input);
2294          } else {
2295             ntt_INTERP_CENTROID(c, ntt_get_dest(c, &instr->def), input);
2296          }
2297          break;
2298 
2299       case nir_intrinsic_load_barycentric_at_sample:
2300          /* We stored the sample in the fake "bary" dest. */
2301          ntt_INTERP_SAMPLE(c, ntt_get_dest(c, &instr->def), input,
2302                             ntt_get_src(c, instr->src[0]));
2303          break;
2304 
2305       case nir_intrinsic_load_barycentric_at_offset:
2306          /* We stored the offset in the fake "bary" dest. */
2307          ntt_INTERP_OFFSET(c, ntt_get_dest(c, &instr->def), input,
2308                             ntt_get_src(c, instr->src[0]));
2309          break;
2310 
2311       default:
2312          unreachable("bad barycentric interp intrinsic\n");
2313       }
2314       break;
2315    }
2316 
2317    default:
2318       unreachable("bad load input intrinsic\n");
2319    }
2320 }
2321 
2322 static void
2323 ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2324 {
2325    struct ureg_src src = ntt_get_src(c, instr->src[0]);
2326 
2327    if (src.File == TGSI_FILE_OUTPUT) {
2328       /* If our src is the output file, that's an indication that we were able
2329        * to emit the output stores in the generating instructions and we have
2330        * nothing to do here.
2331        */
2332       return;
2333    }
2334 
2335    uint32_t frac;
2336    struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2337 
2338    if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
2339       out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
2340       out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
2341    } else {
2342       out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2343    }
2344 
2345    uint8_t swizzle[4] = { 0, 0, 0, 0 };
2346    for (int i = frac; i < 4; i++) {
2347       if (out.WriteMask & (1 << i))
2348          swizzle[i] = i - frac;
2349    }
2350 
2351    src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
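   /* Example (sketch): storing a 2-component value at component 2 (frac == 2,
    * WriteMask ZW) builds the swizzle {0, 0, 0, 1}, so the MOV below reads
    * src.xxxy and places src.x in out.z and src.y in out.w.
    */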
2352 
2353    ntt_MOV(c, out, src);
2354 }
2355 
2356 static void
2357 ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2358 {
2359    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2360 
2361    /* ntt_try_store_in_tgsi_output() optimization is not valid if normal
2362     * load_output is present.
2363     */
2364    assert(c->s->info.stage != MESA_SHADER_VERTEX &&
2365           (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
2366 
2367    uint32_t frac;
2368    struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2369 
2370    if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
2371       out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2372       out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
2373    } else {
2374       out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
2375    }
2376 
2377    struct ureg_dst dst = ntt_get_dest(c, &instr->def);
2378    struct ureg_src out_src = ureg_src(out);
2379 
2380    /* Don't swizzle in unavailable channels of the output for the writemasked-out
2381     * components. Avoids compile failures in virglrenderer with
2382     * TESS_LEVEL_INNER.
2383     */
2384    int fill_channel = ffs(dst.WriteMask) - 1;
2385    uint8_t swizzles[4] = { 0, 1, 2, 3 };
2386    for (int i = 0; i < 4; i++)
2387       if (!(dst.WriteMask & (1 << i)))
2388          swizzles[i] = fill_channel;
2389    out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
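   /* Example (sketch): with dst.WriteMask == XY, fill_channel is X and the
    * swizzle becomes out.xyxx, so the masked-out .z/.w reads collapse onto a
    * declared channel.
    */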
2390 
2391    if (semantics.fb_fetch_output)
2392       ntt_FBFETCH(c, dst, out_src);
2393    else
2394       ntt_MOV(c, dst, out_src);
2395 }
2396 
2397 static void
2398 ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
2399 {
2400    gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
2401    enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
2402    struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
2403 
2404    /* virglrenderer doesn't like references to channels of the sysval that
2405     * aren't defined, even if they aren't really read.  (GLSL compile fails on
2406     * gl_NumWorkGroups.w, for example).
2407     */
2408    uint32_t write_mask = BITSET_MASK(instr->def.num_components);
2409    sv = ntt_swizzle_for_write_mask(sv, write_mask);
2410 
2411    /* TGSI and NIR define these intrinsics as always loading ints, but they can
2412     * still appear on hardware with non-native-integers fragment shaders using
2413     * the draw path (i915g).  In that case, having called nir_lower_int_to_float
2414     * means that we actually want floats instead.
2415     */
2416    if (!c->native_integers) {
2417       switch (instr->intrinsic) {
2418       case nir_intrinsic_load_vertex_id:
2419       case nir_intrinsic_load_instance_id:
2420          ntt_U2F(c, ntt_get_dest(c, &instr->def), sv);
2421          return;
2422 
2423       default:
2424          break;
2425       }
2426    }
2427 
2428    ntt_store(c, &instr->def, sv);
2429 }
2430 
2431 static void
2432 ntt_emit_barrier(struct ntt_compile *c, nir_intrinsic_instr *intr)
2433 {
2434    bool compute = gl_shader_stage_is_compute(c->s->info.stage);
2435 
2436    if (nir_intrinsic_memory_scope(intr) != SCOPE_NONE) {
2437       nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
2438       unsigned membar = 0;
2439 
2440       if (modes & nir_var_image)
2441          membar |= TGSI_MEMBAR_SHADER_IMAGE;
2442 
2443       if (modes & nir_var_mem_shared)
2444          membar |= TGSI_MEMBAR_SHARED;
2445 
2446       /* Atomic counters are lowered to SSBOs, there's no NIR mode corresponding
2447        * exactly to atomics. Take the closest match.
2448        */
2449       if (modes & nir_var_mem_ssbo)
2450          membar |= TGSI_MEMBAR_SHADER_BUFFER | TGSI_MEMBAR_ATOMIC_BUFFER;
2451 
2452       if (modes & nir_var_mem_global)
2453          membar |= TGSI_MEMBAR_SHADER_BUFFER;
2454 
2455       /* Hack for virglrenderer: the GLSL specific memory barrier functions,
2456        * memoryBarrier{Buffer,Image,Shared,AtomicCounter}(), are only
2457        * available in compute shaders prior to GLSL 4.30.  In other stages,
2458        * it needs to use the full memoryBarrier().  It may be possible to
2459        * make them available via #extension directives in older versions,
2460        * but it's confusingly underspecified, and Mesa/virglrenderer don't
2461        * currently agree on how to do it.  So, just promote partial memory
2462        * barriers back to full ones outside of compute shaders when asked.
2463        */
2464       if (membar && !compute &&
2465           c->options->non_compute_membar_needs_all_modes) {
2466          membar |= TGSI_MEMBAR_SHADER_BUFFER |
2467                    TGSI_MEMBAR_ATOMIC_BUFFER |
2468                    TGSI_MEMBAR_SHADER_IMAGE |
2469                    TGSI_MEMBAR_SHARED;
2470       }
2471 
2472       /* If we only need workgroup scope (not device-scope), we might be able to
2473        * optimize a bit.
2474        */
2475       if (membar && compute &&
2476           nir_intrinsic_memory_scope(intr) == SCOPE_WORKGROUP) {
2477 
2478          membar |= TGSI_MEMBAR_THREAD_GROUP;
2479       }
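      /* Example (sketch): a compute-shader barrier over ssbo + shared memory
       * at workgroup scope ends up as a MEMBAR with SHADER_BUFFER |
       * ATOMIC_BUFFER | SHARED | THREAD_GROUP.
       */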
2480 
2481       /* Only emit a memory barrier if there are any relevant modes */
2482       if (membar)
2483          ntt_MEMBAR(c, ureg_imm1u(c->ureg, membar));
2484    }
2485 
2486    if (nir_intrinsic_execution_scope(intr) != SCOPE_NONE) {
2487       assert(compute || c->s->info.stage == MESA_SHADER_TESS_CTRL);
2488       ntt_BARRIER(c);
2489    }
2490 }
2491 
2492 static void
2493 ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
2494 {
2495    switch (instr->intrinsic) {
2496    case nir_intrinsic_load_ubo:
2497    case nir_intrinsic_load_ubo_vec4:
2498       ntt_emit_load_ubo(c, instr);
2499       break;
2500 
2501       /* Vertex */
2502    case nir_intrinsic_load_vertex_id:
2503    case nir_intrinsic_load_vertex_id_zero_base:
2504    case nir_intrinsic_load_base_vertex:
2505    case nir_intrinsic_load_base_instance:
2506    case nir_intrinsic_load_instance_id:
2507    case nir_intrinsic_load_draw_id:
2508    case nir_intrinsic_load_invocation_id:
2509    case nir_intrinsic_load_frag_coord:
2510    case nir_intrinsic_load_point_coord:
2511    case nir_intrinsic_load_front_face:
2512    case nir_intrinsic_load_sample_id:
2513    case nir_intrinsic_load_sample_pos:
2514    case nir_intrinsic_load_sample_mask_in:
2515    case nir_intrinsic_load_helper_invocation:
2516    case nir_intrinsic_load_tess_coord:
2517    case nir_intrinsic_load_patch_vertices_in:
2518    case nir_intrinsic_load_primitive_id:
2519    case nir_intrinsic_load_tess_level_outer:
2520    case nir_intrinsic_load_tess_level_inner:
2521    case nir_intrinsic_load_local_invocation_id:
2522    case nir_intrinsic_load_workgroup_id:
2523    case nir_intrinsic_load_num_workgroups:
2524    case nir_intrinsic_load_workgroup_size:
2525    case nir_intrinsic_load_subgroup_size:
2526    case nir_intrinsic_load_subgroup_invocation:
2527    case nir_intrinsic_load_subgroup_eq_mask:
2528    case nir_intrinsic_load_subgroup_ge_mask:
2529    case nir_intrinsic_load_subgroup_gt_mask:
2530    case nir_intrinsic_load_subgroup_lt_mask:
2531    case nir_intrinsic_load_subgroup_le_mask:
2532       ntt_emit_load_sysval(c, instr);
2533       break;
2534 
2535    case nir_intrinsic_load_input:
2536    case nir_intrinsic_load_per_vertex_input:
2537    case nir_intrinsic_load_interpolated_input:
2538       ntt_emit_load_input(c, instr);
2539       break;
2540 
2541    case nir_intrinsic_store_output:
2542    case nir_intrinsic_store_per_vertex_output:
2543       ntt_emit_store_output(c, instr);
2544       break;
2545 
2546    case nir_intrinsic_load_output:
2547    case nir_intrinsic_load_per_vertex_output:
2548       ntt_emit_load_output(c, instr);
2549       break;
2550 
2551    case nir_intrinsic_demote:
2552       ntt_DEMOTE(c);
2553       break;
2554 
2555    case nir_intrinsic_terminate:
2556       ntt_KILL(c);
2557       break;
2558 
2559    case nir_intrinsic_terminate_if: {
2560       struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
2561 
2562       if (c->native_integers) {
2563          struct ureg_dst temp = ureg_writemask(ntt_temp(c), 1);
2564          ntt_AND(c, temp, cond, ureg_imm1f(c->ureg, 1.0));
2565          ntt_KILL_IF(c, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
2566       } else {
2567          /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
2568          ntt_KILL_IF(c, ureg_negate(cond));
2569       }
2570       break;
2571    }
2572 
2573    case nir_intrinsic_is_helper_invocation:
2574       ntt_READ_HELPER(c, ntt_get_dest(c, &instr->def));
2575       break;
2576 
2577    case nir_intrinsic_vote_all:
2578       ntt_VOTE_ALL(c, ntt_get_dest(c, &instr->def), ntt_get_src(c,instr->src[0]));
2579       return;
2580    case nir_intrinsic_vote_any:
2581       ntt_VOTE_ANY(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2582       return;
2583    case nir_intrinsic_vote_ieq:
2584       ntt_VOTE_EQ(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2585       return;
2586    case nir_intrinsic_ballot:
2587       ntt_BALLOT(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2588       return;
2589    case nir_intrinsic_read_first_invocation:
2590       ntt_READ_FIRST(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2591       return;
2592    case nir_intrinsic_read_invocation:
2593       ntt_READ_INVOC(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]), ntt_get_src(c, instr->src[1]));
2594       return;
2595 
2596    case nir_intrinsic_ddx:
2597    case nir_intrinsic_ddx_coarse:
2598       ntt_DDX(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2599       return;
2600    case nir_intrinsic_ddx_fine:
2601       ntt_DDX_FINE(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2602       return;
2603    case nir_intrinsic_ddy:
2604    case nir_intrinsic_ddy_coarse:
2605       ntt_DDY(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2606       return;
2607    case nir_intrinsic_ddy_fine:
2608       ntt_DDY_FINE(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2609       return;
2610 
2611    case nir_intrinsic_load_ssbo:
2612    case nir_intrinsic_store_ssbo:
2613    case nir_intrinsic_ssbo_atomic:
2614    case nir_intrinsic_ssbo_atomic_swap:
2615    case nir_intrinsic_get_ssbo_size:
2616       ntt_emit_mem(c, instr, nir_var_mem_ssbo);
2617       break;
2618 
2619    case nir_intrinsic_load_shared:
2620    case nir_intrinsic_store_shared:
2621    case nir_intrinsic_shared_atomic:
2622    case nir_intrinsic_shared_atomic_swap:
2623       ntt_emit_mem(c, instr, nir_var_mem_shared);
2624       break;
2625 
2626    case nir_intrinsic_atomic_counter_read:
2627    case nir_intrinsic_atomic_counter_add:
2628    case nir_intrinsic_atomic_counter_inc:
2629    case nir_intrinsic_atomic_counter_post_dec:
2630    case nir_intrinsic_atomic_counter_min:
2631    case nir_intrinsic_atomic_counter_max:
2632    case nir_intrinsic_atomic_counter_and:
2633    case nir_intrinsic_atomic_counter_or:
2634    case nir_intrinsic_atomic_counter_xor:
2635    case nir_intrinsic_atomic_counter_exchange:
2636    case nir_intrinsic_atomic_counter_comp_swap:
2637       ntt_emit_mem(c, instr, nir_var_uniform);
2638       break;
2639    case nir_intrinsic_atomic_counter_pre_dec:
2640       unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
2641       break;
2642 
2643    case nir_intrinsic_image_load:
2644    case nir_intrinsic_image_store:
2645    case nir_intrinsic_image_size:
2646    case nir_intrinsic_image_samples:
2647    case nir_intrinsic_image_atomic:
2648    case nir_intrinsic_image_atomic_swap:
2649    case nir_intrinsic_bindless_image_load:
2650    case nir_intrinsic_bindless_image_store:
2651    case nir_intrinsic_bindless_image_size:
2652    case nir_intrinsic_bindless_image_samples:
2653    case nir_intrinsic_bindless_image_atomic:
2654    case nir_intrinsic_bindless_image_atomic_swap:
2655       ntt_emit_image_load_store(c, instr);
2656       break;
2657 
2658    case nir_intrinsic_barrier:
2659       ntt_emit_barrier(c, instr);
2660       break;
2661 
2662    case nir_intrinsic_end_primitive:
2663       ntt_ENDPRIM(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2664       break;
2665 
2666    case nir_intrinsic_emit_vertex:
2667       ntt_EMIT(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2668       break;
2669 
2670       /* In TGSI we don't actually generate the barycentric coords, and emit
2671        * interp intrinsics later.  However, we do need to store the
2672        * load_barycentric_at_* argument so that we can use it at that point.
2673        */
2674    case nir_intrinsic_load_barycentric_pixel:
2675    case nir_intrinsic_load_barycentric_centroid:
2676    case nir_intrinsic_load_barycentric_sample:
2677       break;
2678    case nir_intrinsic_load_barycentric_at_sample:
2679    case nir_intrinsic_load_barycentric_at_offset:
2680       ntt_store(c, &instr->def, ntt_get_src(c, instr->src[0]));
2681       break;
2682 
2683    case nir_intrinsic_shader_clock:
2684       ntt_CLOCK(c, ntt_get_dest(c, &instr->def));
2685       break;
2686 
2687    case nir_intrinsic_decl_reg:
2688    case nir_intrinsic_load_reg:
2689    case nir_intrinsic_load_reg_indirect:
2690    case nir_intrinsic_store_reg:
2691    case nir_intrinsic_store_reg_indirect:
2692       /* fully consumed */
2693       break;
2694 
2695    default:
2696       fprintf(stderr, "Unknown intrinsic: ");
2697       nir_print_instr(&instr->instr, stderr);
2698       fprintf(stderr, "\n");
2699       break;
2700    }
2701 }
2702 
2703 struct ntt_tex_operand_state {
2704    struct ureg_src srcs[4];
2705    unsigned i;
2706 };
2707 
2708 static void
2709 ntt_push_tex_arg(struct ntt_compile *c,
2710                  nir_tex_instr *instr,
2711                  nir_tex_src_type tex_src_type,
2712                  struct ntt_tex_operand_state *s)
2713 {
2714    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2715    if (tex_src < 0)
2716       return;
2717 
2718    nir_src *src = &instr->src[tex_src].src;
2719 
2720    /* virglrenderer workaround that's hard to do in tgsi_translate: Make sure
2721     * that TG4's immediate offset arg is float-typed.
2722     */
2723    if (instr->op == nir_texop_tg4 && tex_src_type == nir_tex_src_backend2 &&
2724        nir_src_is_const(*src)) {
2725       nir_const_value *consts = nir_src_as_const_value(*src);
2726       s->srcs[s->i++] = ureg_imm4f(c->ureg,
2727                                    consts[0].f32,
2728                                    consts[1].f32,
2729                                    consts[2].f32,
2730                                    consts[3].f32);
2731       return;
2732    }
2733 
2734    s->srcs[s->i++] = ntt_get_src(c, *src);
2735 }
2736 
2737 static void
2738 ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
2739 {
2740    struct ureg_dst dst = ntt_get_dest(c, &instr->def);
2741    enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
2742    unsigned tex_opcode;
2743 
2744    int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle);
2745    int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle);
2746 
2747    struct ureg_src sampler;
2748    if (tex_handle_src >= 0 && sampler_handle_src >= 0) {
2749       /* It seems we can't get separate tex/sampler on GL, just use one of the handles */
2750       sampler = ntt_get_src(c, instr->src[tex_handle_src].src);
2751       assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
2752    } else {
2753       assert(tex_handle_src == -1 && sampler_handle_src == -1);
2754       sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
2755       int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
2756       if (sampler_src >= 0) {
2757          struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
2758          sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr, 2));
2759       }
2760    }
2761 
2762    switch (instr->op) {
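        /* If nir_to_tgsi_lower_tex() packed more channels into backend1 than
         * the coordinate (plus comparator) needs, the extra channel is the
         * projector, so use TXP instead of TEX.
         */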
2763    case nir_texop_tex:
2764       if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
2765          MAX2(instr->coord_components, 2) + instr->is_shadow)
2766          tex_opcode = TGSI_OPCODE_TXP;
2767       else
2768          tex_opcode = TGSI_OPCODE_TEX;
2769       break;
2770    case nir_texop_txf:
2771    case nir_texop_txf_ms:
2772       tex_opcode = TGSI_OPCODE_TXF;
2773 
2774       if (c->has_txf_lz) {
2775          int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2776          if (lod_src >= 0 &&
2777              nir_src_is_const(instr->src[lod_src].src) &&
2778              ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
2779             tex_opcode = TGSI_OPCODE_TXF_LZ;
2780          }
2781       }
2782       break;
2783    case nir_texop_txl:
2784       tex_opcode = TGSI_OPCODE_TXL;
2785       break;
2786    case nir_texop_txb:
2787       tex_opcode = TGSI_OPCODE_TXB;
2788       break;
2789    case nir_texop_txd:
2790       tex_opcode = TGSI_OPCODE_TXD;
2791       break;
2792    case nir_texop_txs:
2793       tex_opcode = TGSI_OPCODE_TXQ;
2794       break;
2795    case nir_texop_tg4:
2796       tex_opcode = TGSI_OPCODE_TG4;
2797       break;
2798    case nir_texop_query_levels:
2799       tex_opcode = TGSI_OPCODE_TXQ;
2800       break;
2801    case nir_texop_lod:
2802       tex_opcode = TGSI_OPCODE_LODQ;
2803       break;
2804    case nir_texop_texture_samples:
2805       tex_opcode = TGSI_OPCODE_TXQS;
2806       break;
2807    default:
2808       unreachable("unsupported tex op");
2809    }
2810 
2811    struct ntt_tex_operand_state s = { .i = 0 };
2812    ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
2813    ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
2814 
2815    /* non-coord arg for TXQ */
2816    if (tex_opcode == TGSI_OPCODE_TXQ) {
2817       ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
2818       /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
2819        * scalar
2820        */
2821       s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
2822    }
2823 
2824    if (s.i > 1) {
2825       if (tex_opcode == TGSI_OPCODE_TEX)
2826          tex_opcode = TGSI_OPCODE_TEX2;
2827       if (tex_opcode == TGSI_OPCODE_TXB)
2828          tex_opcode = TGSI_OPCODE_TXB2;
2829       if (tex_opcode == TGSI_OPCODE_TXL)
2830          tex_opcode = TGSI_OPCODE_TXL2;
2831    }
2832 
2833    if (instr->op == nir_texop_txd) {
2834       /* Derivs appear in their own src args */
2835       int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
2836       int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2837       s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
2838       s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
2839    }
2840 
2841    if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2842       if (c->screen->get_param(c->screen,
2843                                PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
2844          sampler = ureg_scalar(sampler, instr->component);
2845          s.srcs[s.i++] = ureg_src_undef();
2846       } else {
2847          s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
2848       }
2849    }
2850 
2851    s.srcs[s.i++] = sampler;
2852 
2853    enum tgsi_return_type tex_type;
2854    switch (instr->dest_type) {
2855    case nir_type_float32:
2856       tex_type = TGSI_RETURN_TYPE_FLOAT;
2857       break;
2858    case nir_type_int32:
2859       tex_type = TGSI_RETURN_TYPE_SINT;
2860       break;
2861    case nir_type_uint32:
2862       tex_type = TGSI_RETURN_TYPE_UINT;
2863       break;
2864    default:
2865       unreachable("unknown texture type");
2866    }
2867 
2868    struct ureg_dst tex_dst;
2869    if (instr->op == nir_texop_query_levels)
2870       tex_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2871    else
2872       tex_dst = dst;
2873 
2874    while (s.i < 4)
2875       s.srcs[s.i++] = ureg_src_undef();
2876 
2877    struct ntt_insn *insn = ntt_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
2878    insn->tex_target = target;
2879    insn->tex_return_type = tex_type;
2880    insn->is_tex = true;
2881 
2882    int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2883    if (tex_offset_src >= 0) {
2884       struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
2885 
2886       insn->tex_offset[0].File = offset.File;
2887       insn->tex_offset[0].Index = offset.Index;
2888       insn->tex_offset[0].SwizzleX = offset.SwizzleX;
2889       insn->tex_offset[0].SwizzleY = offset.SwizzleY;
2890       insn->tex_offset[0].SwizzleZ = offset.SwizzleZ;
2891       insn->tex_offset[0].Padding = 0;
2892    }
2893 
2894    if (nir_tex_instr_has_explicit_tg4_offsets(instr)) {
2895       for (uint8_t i = 0; i < 4; ++i) {
2896          struct ureg_src imm = ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]);
2897          insn->tex_offset[i].File = imm.File;
2898          insn->tex_offset[i].Index = imm.Index;
2899          insn->tex_offset[i].SwizzleX = imm.SwizzleX;
2900          insn->tex_offset[i].SwizzleY = imm.SwizzleY;
2901          insn->tex_offset[i].SwizzleZ = imm.SwizzleZ;
2902       }
2903    }
2904 
2905    if (instr->op == nir_texop_query_levels)
2906       ntt_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
2907 }
2908 
2909 static void
2910 ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
2911 {
2912    switch (jump->type) {
2913    case nir_jump_break:
2914       ntt_BRK(c);
2915       break;
2916 
2917    case nir_jump_continue:
2918       ntt_CONT(c);
2919       break;
2920 
2921    default:
2922       fprintf(stderr, "Unknown jump instruction: ");
2923       nir_print_instr(&jump->instr, stderr);
2924       fprintf(stderr, "\n");
2925       abort();
2926    }
2927 }
2928 
2929 static void
2930 ntt_emit_ssa_undef(struct ntt_compile *c, nir_undef_instr *instr)
2931 {
2932    /* Nothing to do but make sure that we have some storage to deref. */
2933    (void)ntt_get_ssa_def_decl(c, &instr->def);
2934 }
2935 
2936 static void
2937 ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2938 {
2939    switch (instr->type) {
2940    case nir_instr_type_deref:
2941       /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2942       break;
2943 
2944    case nir_instr_type_alu:
2945       ntt_emit_alu(c, nir_instr_as_alu(instr));
2946       break;
2947 
2948    case nir_instr_type_intrinsic:
2949       ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2950       break;
2951 
2952    case nir_instr_type_load_const:
2953       /* Nothing to do here, as load consts are done directly from
2954        * ntt_get_src() (since many constant NIR srcs will often get folded
2955        * directly into a register file index instead of as a TGSI src).
2956        */
2957       break;
2958 
2959    case nir_instr_type_tex:
2960       ntt_emit_texture(c, nir_instr_as_tex(instr));
2961       break;
2962 
2963    case nir_instr_type_jump:
2964       ntt_emit_jump(c, nir_instr_as_jump(instr));
2965       break;
2966 
2967    case nir_instr_type_undef:
2968       ntt_emit_ssa_undef(c, nir_instr_as_undef(instr));
2969       break;
2970 
2971    default:
2972       fprintf(stderr, "Unknown NIR instr type: ");
2973       nir_print_instr(instr, stderr);
2974       fprintf(stderr, "\n");
2975       abort();
2976    }
2977 }
2978 
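     /* Emits the if/else/endif structure using the condition that
      * ntt_emit_block() captured at the end of the preceding block; UIF is used
      * when the driver has native integers, IF (float condition) otherwise.
      */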
2979 static void
2980 ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2981 {
2982    if (c->native_integers)
2983       ntt_UIF(c, c->if_cond);
2984    else
2985       ntt_IF(c, c->if_cond);
2986 
2987    ntt_emit_cf_list(c, &if_stmt->then_list);
2988 
2989    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
2990       ntt_ELSE(c);
2991       ntt_emit_cf_list(c, &if_stmt->else_list);
2992    }
2993 
2994    ntt_ENDIF(c);
2995 }
2996 
2997 static void
2998 ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2999 {
3000    assert(!nir_loop_has_continue_construct(loop));
3001    ntt_BGNLOOP(c);
3002    ntt_emit_cf_list(c, &loop->body);
3003    ntt_ENDLOOP(c);
3004 }
3005 
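     /* First pass over a NIR block: translates each instruction into ntt_insns
      * appended to the corresponding ntt_block, to be turned into ureg tokens
      * later by ntt_emit_block_ureg().
      */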
3006 static void
3007 ntt_emit_block(struct ntt_compile *c, nir_block *block)
3008 {
3009    struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
3010    c->cur_block = ntt_block;
3011 
3012    nir_foreach_instr(instr, block) {
3013       ntt_emit_instr(c, instr);
3014 
3015       /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntt_OPCODE(). */
3016       if (ureg_get_instruction_number(c->ureg) != 0) {
3017          fprintf(stderr, "Emitted ureg insn during: ");
3018          nir_print_instr(instr, stderr);
3019          fprintf(stderr, "\n");
3020          unreachable("emitted ureg insn");
3021       }
3022    }
3023 
3024    /* Set up the if condition for ntt_emit_if(), which we have to do before
3025     * freeing up the temps (the "if" is treated as inside the block for liveness
3026     * purposes, despite not being an instruction)
3027     *
3028     * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
3029     * looks at all of .xyzw.  No harm in working around the bug.
3030     */
3031    nir_if *nif = nir_block_get_following_if(block);
3032    if (nif)
3033       c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);
3034 }
3035 
3036 static void
3037 ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
3038 {
3039    foreach_list_typed(nir_cf_node, node, node, list) {
3040       switch (node->type) {
3041       case nir_cf_node_block:
3042          ntt_emit_block(c, nir_cf_node_as_block(node));
3043          break;
3044 
3045       case nir_cf_node_if:
3046          ntt_emit_if(c, nir_cf_node_as_if(node));
3047          break;
3048 
3049       case nir_cf_node_loop:
3050          ntt_emit_loop(c, nir_cf_node_as_loop(node));
3051          break;
3052 
3053       default:
3054          unreachable("unknown CF type");
3055       }
3056    }
3057 }
3058 
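     /* Second pass over a block: replays the recorded ntt_insns into the ureg
      * program, handling control-flow label fixups and the texture/memory
      * instruction variants that need extra operand information.
      */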
3059 static void
3060 ntt_emit_block_ureg(struct ntt_compile *c, struct nir_block *block)
3061 {
3062    struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
3063 
3064    /* Emit the ntt insns to tgsi_ureg. */
3065    util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
3066       const struct tgsi_opcode_info *opcode_info =
3067          tgsi_get_opcode_info(insn->opcode);
3068 
3069       switch (insn->opcode) {
3070       case TGSI_OPCODE_UIF:
3071          ureg_UIF(c->ureg, insn->src[0], &c->cf_label);
3072          break;
3073 
3074       case TGSI_OPCODE_IF:
3075          ureg_IF(c->ureg, insn->src[0], &c->cf_label);
3076          break;
3077 
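           /* IF/UIF above emitted a forward label into c->cf_label;
            * ntt_emit_if_ureg() stashes it in current_if_else so the matching
            * ELSE/ENDIF can patch the branch target once its instruction
            * number is known.
            */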
3078       case TGSI_OPCODE_ELSE:
3079          ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
3080          ureg_ELSE(c->ureg, &c->cf_label);
3081          c->current_if_else = c->cf_label;
3082          break;
3083 
3084       case TGSI_OPCODE_ENDIF:
3085          ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
3086          ureg_ENDIF(c->ureg);
3087          break;
3088 
3089       case TGSI_OPCODE_BGNLOOP:
3090          /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3091           * does reference BGNLOOP's.  Follow the former behavior unless something comes up
3092           * with a need.
3093           */
3094          ureg_BGNLOOP(c->ureg, &c->cf_label);
3095          break;
3096 
3097       case TGSI_OPCODE_ENDLOOP:
3098          ureg_ENDLOOP(c->ureg, &c->cf_label);
3099          break;
3100 
3101       default:
3102          if (insn->is_tex) {
3103             int num_offsets = 0;
3104             for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
3105                if (insn->tex_offset[i].File != TGSI_FILE_NULL)
3106                   num_offsets = i + 1;
3107             }
3108             ureg_tex_insn(c->ureg, insn->opcode,
3109                           insn->dst, opcode_info->num_dst,
3110                           insn->tex_target, insn->tex_return_type,
3111                           insn->tex_offset,
3112                           num_offsets,
3113                           insn->src, opcode_info->num_src);
3114          } else if (insn->is_mem) {
3115             ureg_memory_insn(c->ureg, insn->opcode,
3116                              insn->dst, opcode_info->num_dst,
3117                              insn->src, opcode_info->num_src,
3118                              insn->mem_qualifier,
3119                              insn->tex_target,
3120                              insn->mem_format);
3121          } else {
3122             ureg_insn(c->ureg, insn->opcode,
3123                      insn->dst, opcode_info->num_dst,
3124                      insn->src, opcode_info->num_src,
3125                      insn->precise);
3126          }
3127       }
3128    }
3129 }
3130 
3131 static void
3132 ntt_emit_if_ureg(struct ntt_compile *c, nir_if *if_stmt)
3133 {
3134    /* Note: the last block emitted our IF opcode. */
3135 
3136    int if_stack = c->current_if_else;
3137    c->current_if_else = c->cf_label;
3138 
3139    /* Either the then or else block includes the ENDIF, which will fix up the
3140     * IF(/ELSE)'s label for jumping
3141     */
3142    ntt_emit_cf_list_ureg(c, &if_stmt->then_list);
3143    ntt_emit_cf_list_ureg(c, &if_stmt->else_list);
3144 
3145    c->current_if_else = if_stack;
3146 }
3147 
3148 static void
3149 ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list)
3150 {
3151    foreach_list_typed(nir_cf_node, node, node, list) {
3152       switch (node->type) {
3153       case nir_cf_node_block:
3154          ntt_emit_block_ureg(c, nir_cf_node_as_block(node));
3155          break;
3156 
3157       case nir_cf_node_if:
3158          ntt_emit_if_ureg(c, nir_cf_node_as_if(node));
3159          break;
3160 
3161       case nir_cf_node_loop:
3162          /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3163           * does reference BGNLOOP's.  Follow the former behavior unless something comes up
3164           * with a need.
3165           */
3166          ntt_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
3167          break;
3168 
3169       default:
3170          unreachable("unknown CF type");
3171       }
3172    }
3173 }
3174 
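     /* Emits the whole nir_function_impl: sets up the per-block ntt_insn
      * arrays and the register/input/output/uniform declarations, lowers the
      * CF list to ntt_insns, runs TGSI temp allocation, and finally emits the
      * ureg tokens.
      */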
3175 static void
3176 ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
3177 {
3178    c->impl = impl;
3179 
3180    c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
3181    c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
3182 
3183    /* Set up the struct ntt_blocks to put insns in */
3184    c->blocks = _mesa_pointer_hash_table_create(c);
3185    nir_foreach_block(block, impl) {
3186       struct ntt_block *ntt_block = rzalloc(c->blocks, struct ntt_block);
3187       util_dynarray_init(&ntt_block->insns, ntt_block);
3188       _mesa_hash_table_insert(c->blocks, block, ntt_block);
3189    }
3190 
3191 
3192    ntt_setup_registers(c);
3193 
3194    c->cur_block = ntt_block_from_nir(c, nir_start_block(impl));
3195    ntt_setup_inputs(c);
3196    ntt_setup_outputs(c);
3197    ntt_setup_uniforms(c);
3198 
3199    /* Emit the ntt insns */
3200    ntt_emit_cf_list(c, &impl->body);
3201 
3202    /* Use the optimized register allocator unless the driver requests
3203     * unoptimized RA; even then, fall back to optimized RA when the number of
3204     * temps would overflow the signed 16-bit register index TGSI allocates. */
3205    if (!c->options->unoptimized_ra || c->num_temps > 0x7fff)
3206       ntt_allocate_regs(c, impl);
3207    else
3208       ntt_allocate_regs_unoptimized(c, impl);
3209 
3210    /* Turn the ntt insns into actual TGSI tokens */
3211    ntt_emit_cf_list_ureg(c, &impl->body);
3212 
3213    ralloc_free(c->liveness);
3214    c->liveness = NULL;
3215 
3216 }
3217 
3218 static int
3219 type_size(const struct glsl_type *type, bool bindless)
3220 {
3221    return glsl_count_attribute_slots(type, false);
3222 }
3223 
3224 /* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
3225  * can handle for 64-bit values in TGSI.
3226  */
3227 static uint8_t
3228 ntt_should_vectorize_instr(const nir_instr *instr, const void *data)
3229 {
3230    if (instr->type != nir_instr_type_alu)
3231       return 0;
3232 
3233    nir_alu_instr *alu = nir_instr_as_alu(instr);
3234 
3235    switch (alu->op) {
3236    case nir_op_ibitfield_extract:
3237    case nir_op_ubitfield_extract:
3238    case nir_op_bitfield_insert:
3239       /* virglrenderer only looks at the .x channel of the offset/bits operands
3240        * when translating to GLSL.  tgsi.rst doesn't seem to require scalar
3241        * offset/bits operands.
3242        *
3243        * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
3244        */
3245       return 1;
3246 
3247    default:
3248       break;
3249    }
3250 
3251    int src_bit_size = nir_src_bit_size(alu->src[0].src);
3252    int dst_bit_size = alu->def.bit_size;
3253 
3254    if (src_bit_size == 64 || dst_bit_size == 64) {
3255       /* Avoid vectorizing 64-bit instructions at all.  Despite tgsi.rst
3256        * claiming support, virglrenderer generates bad shaders on the host when
3257        * presented with them.  Maybe we can make virgl avoid tickling the
3258        * virglrenderer bugs, but given that glsl-to-TGSI didn't generate vector
3259        * 64-bit instrs in the first place, I don't see much reason to care about
3260        * this.
3261        */
3262       return 1;
3263    }
3264 
3265    return 4;
3266 }
3267 
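     /* Callback for nir_opt_load_store_vectorize() (used on UBO loads in
      * ntt_optimize_nir()): only allow merging 32-bit accesses whose alignment
      * guarantees the result stays within a single TGSI vec4 register.
      */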
3268 static bool
3269 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
3270                         unsigned num_components, unsigned high_offset,
3271                         nir_intrinsic_instr *low, nir_intrinsic_instr *high,
3272                         void *data)
3273 {
3274    if (bit_size != 32)
3275       return false;
3276 
3277    /* Our offset alignment should always be at least 4 bytes */
3278    if (align < 4)
3279       return false;
3280 
3281    /* No wrapping off the end of a TGSI reg.  We could do a bit better by
3282     * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
3283     * need this restriction.
3284     */
3285    unsigned worst_start_component = align == 4 ? 3 : align / 4;
3286    if (worst_start_component + num_components > 4)
3287       return false;
3288 
3289    return true;
3290 }
3291 
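     /* Returns the variable modes whose indirect addressing the driver can't
      * handle, based on the PIPE_SHADER_CAP_INDIRECT_*_ADDR caps, so that
      * those indirects can be unrolled before translation.
      */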
3292 static nir_variable_mode
3293 ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
3294 {
3295    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3296    unsigned indirect_mask = 0;
3297 
3298    if (!screen->get_shader_param(screen, pipe_stage,
3299                                  PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
3300       indirect_mask |= nir_var_shader_in;
3301    }
3302 
3303    if (!screen->get_shader_param(screen, pipe_stage,
3304                                  PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
3305       indirect_mask |= nir_var_shader_out;
3306    }
3307 
3308    if (!screen->get_shader_param(screen, pipe_stage,
3309                                  PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
3310       indirect_mask |= nir_var_function_temp;
3311    }
3312 
3313    return indirect_mask;
3314 }
3315 
3316 static void
3317 ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen,
3318                  const struct nir_to_tgsi_options *options)
3319 {
3320    bool progress;
3321    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3322    unsigned control_flow_depth =
3323       screen->get_shader_param(screen, pipe_stage,
3324                                PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
3325    do {
3326       progress = false;
3327 
3328       NIR_PASS_V(s, nir_lower_vars_to_ssa);
3329       NIR_PASS_V(s, nir_split_64bit_vec3_and_vec4);
3330 
3331       NIR_PASS(progress, s, nir_copy_prop);
3332       NIR_PASS(progress, s, nir_opt_algebraic);
3333       NIR_PASS(progress, s, nir_opt_constant_folding);
3334       NIR_PASS(progress, s, nir_opt_remove_phis);
3335       NIR_PASS(progress, s, nir_opt_conditional_discard);
3336       NIR_PASS(progress, s, nir_opt_dce);
3337       NIR_PASS(progress, s, nir_opt_dead_cf);
3338       NIR_PASS(progress, s, nir_opt_cse);
3339       NIR_PASS(progress, s, nir_opt_find_array_copies);
3340       NIR_PASS(progress, s, nir_opt_copy_prop_vars);
3341       NIR_PASS(progress, s, nir_opt_dead_write_vars);
3342 
3343       NIR_PASS(progress, s, nir_opt_if, nir_opt_if_optimize_phi_true_false);
3344       NIR_PASS(progress, s, nir_opt_peephole_select,
3345                control_flow_depth == 0 ? ~0 : 8, true, true);
3346       NIR_PASS(progress, s, nir_opt_algebraic);
3347       NIR_PASS(progress, s, nir_opt_constant_folding);
3348       nir_load_store_vectorize_options vectorize_opts = {
3349          .modes = nir_var_mem_ubo,
3350          .callback = ntt_should_vectorize_io,
3351          .robust_modes = 0,
3352       };
3353       NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
3354       NIR_PASS(progress, s, nir_opt_shrink_stores, true);
3355       NIR_PASS(progress, s, nir_opt_shrink_vectors, false);
3356       NIR_PASS(progress, s, nir_opt_loop);
3357       NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
3358       NIR_PASS(progress, s, nir_opt_undef);
3359       NIR_PASS(progress, s, nir_opt_loop_unroll);
3360 
3361       /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
3362        * and ALU ops for it.
3363        */
3364       nir_opt_offsets_options offset_options = {
3365          .ubo_vec4_max = ~0,
3366 
3367          /* No const offset in TGSI for shared accesses. */
3368          .shared_max = 0,
3369 
3370          /* unused intrinsics */
3371          .uniform_max = 0,
3372          .buffer_max = 0,
3373       };
3374 
3375       if (options->ubo_vec4_max)
3376          offset_options.ubo_vec4_max = options->ubo_vec4_max;
3377 
3378       NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
3379    } while (progress);
3380 
3381    NIR_PASS_V(s, nir_lower_var_copies);
3382 }
3383 
3384 /* Scalarizes all 64-bit ALU ops.  Note that we only actually need to
3385  * scalarize vec3/vec4s, should probably fix that.
3386  */
3387 static bool
3388 scalarize_64bit(const nir_instr *instr, const void *data)
3389 {
3390    const nir_alu_instr *alu = nir_instr_as_alu(instr);
3391 
3392    return (alu->def.bit_size == 64 ||
3393            nir_src_bit_size(alu->src[0].src) == 64);
3394 }
3395 
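     /* Splits 3- or 4-component 64-bit memory/IO intrinsics into a 2-component
      * half and a remainder, since a TGSI register only holds up to a dvec2.
      * The second half gets its buffer offset bumped (16 bytes, or one vec4
      * slot for load_ubo_vec4) or its IO location/base advanced by one slot.
      *
      * Rough sketch of the effect on a 4-component 64-bit UBO load at offset O:
      *
      *    vec2 64-bit load_ubo(..., O)       (components xy)
      *    vec2 64-bit load_ubo(..., O + 16)  (components zw)
      *
      * with the original uses rewritten to a vec built from the two results.
      */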
3396 static bool
3397 nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
3398 {
3399    b->cursor = nir_after_instr(&instr->instr);
3400 
3401    switch (instr->intrinsic) {
3402    case nir_intrinsic_load_ubo:
3403    case nir_intrinsic_load_ubo_vec4:
3404    case nir_intrinsic_load_ssbo:
3405    case nir_intrinsic_load_input:
3406    case nir_intrinsic_load_interpolated_input:
3407    case nir_intrinsic_load_per_vertex_input:
3408    case nir_intrinsic_store_output:
3409    case nir_intrinsic_store_per_vertex_output:
3410    case nir_intrinsic_store_ssbo:
3411       break;
3412    default:
3413       return false;
3414    }
3415 
3416    if (instr->num_components <= 2)
3417       return false;
3418 
3419    bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
3420    if (has_dest) {
3421       if (instr->def.bit_size != 64)
3422          return false;
3423    } else {
3424       if (nir_src_bit_size(instr->src[0]) != 64)
3425           return false;
3426    }
3427 
3428    nir_intrinsic_instr *first =
3429       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3430    nir_intrinsic_instr *second =
3431       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3432 
3433    switch (instr->intrinsic) {
3434    case nir_intrinsic_load_ubo:
3435    case nir_intrinsic_load_ubo_vec4:
3436    case nir_intrinsic_load_ssbo:
3437    case nir_intrinsic_store_ssbo:
3438       break;
3439 
3440    default: {
3441       nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
3442       semantics.location++;
3443       semantics.num_slots--;
3444       nir_intrinsic_set_io_semantics(second, semantics);
3445 
3446       nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
3447       break;
3448    }
3449    }
3450 
3451    first->num_components = 2;
3452    second->num_components -= 2;
3453    if (has_dest) {
3454       first->def.num_components = 2;
3455       second->def.num_components -= 2;
3456    }
3457 
3458    nir_builder_instr_insert(b, &first->instr);
3459    nir_builder_instr_insert(b, &second->instr);
3460 
3461    if (has_dest) {
3462       /* Merge the two loads' results back into a vector. */
3463       nir_scalar channels[4] = {
3464          nir_get_scalar(&first->def, 0),
3465          nir_get_scalar(&first->def, 1),
3466          nir_get_scalar(&second->def, 0),
3467          nir_get_scalar(&second->def, second->num_components > 1 ? 1 : 0),
3468       };
3469       nir_def *new = nir_vec_scalars(b, channels, instr->num_components);
3470       nir_def_rewrite_uses(&instr->def, new);
3471    } else {
3472       /* Split the src value across the two stores. */
3473       b->cursor = nir_before_instr(&instr->instr);
3474 
3475       nir_def *src0 = instr->src[0].ssa;
3476       nir_scalar channels[4] = { 0 };
3477       for (int i = 0; i < instr->num_components; i++)
3478          channels[i] = nir_get_scalar(src0, i);
3479 
3480       nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
3481       nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
3482 
3483       nir_src_rewrite(&first->src[0], nir_vec_scalars(b, channels, 2));
3484       nir_src_rewrite(&second->src[0],
3485                       nir_vec_scalars(b, &channels[2], second->num_components));
3486    }
3487 
3488    int offset_src = -1;
3489    uint32_t offset_amount = 16;
3490 
3491    switch (instr->intrinsic) {
3492    case nir_intrinsic_load_ssbo:
3493    case nir_intrinsic_load_ubo:
3494       offset_src = 1;
3495       break;
3496    case nir_intrinsic_load_ubo_vec4:
3497       offset_src = 1;
3498       offset_amount = 1;
3499       break;
3500    case nir_intrinsic_store_ssbo:
3501       offset_src = 2;
3502       break;
3503    default:
3504       break;
3505    }
3506    if (offset_src != -1) {
3507       b->cursor = nir_before_instr(&second->instr);
3508       nir_def *second_offset =
3509          nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
3510       nir_src_rewrite(&second->src[offset_src], second_offset);
3511    }
3512 
3513    /* DCE stores we generated with no writemask (nothing else does this
3514     * currently).
3515     */
3516    if (!has_dest) {
3517       if (nir_intrinsic_write_mask(first) == 0)
3518          nir_instr_remove(&first->instr);
3519       if (nir_intrinsic_write_mask(second) == 0)
3520          nir_instr_remove(&second->instr);
3521    }
3522 
3523    nir_instr_remove(&instr->instr);
3524 
3525    return true;
3526 }
3527 
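     /* Same idea for 64-bit load_const: a 3- or 4-component constant is split
      * into two load_consts of at most two components each and re-assembled
      * with a vec.
      */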
3528 static bool
3529 nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
3530 {
3531    int num_components = instr->def.num_components;
3532 
3533    if (instr->def.bit_size != 64 || num_components <= 2)
3534       return false;
3535 
3536    b->cursor = nir_before_instr(&instr->instr);
3537 
3538    nir_load_const_instr *first =
3539       nir_load_const_instr_create(b->shader, 2, 64);
3540    nir_load_const_instr *second =
3541       nir_load_const_instr_create(b->shader, num_components - 2, 64);
3542 
3543    first->value[0] = instr->value[0];
3544    first->value[1] = instr->value[1];
3545    second->value[0] = instr->value[2];
3546    if (num_components == 4)
3547       second->value[1] = instr->value[3];
3548 
3549    nir_builder_instr_insert(b, &first->instr);
3550    nir_builder_instr_insert(b, &second->instr);
3551 
3552    nir_def *channels[4] = {
3553       nir_channel(b, &first->def, 0),
3554       nir_channel(b, &first->def, 1),
3555       nir_channel(b, &second->def, 0),
3556       num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
3557    };
3558    nir_def *new = nir_vec(b, channels, num_components);
3559    nir_def_replace(&instr->def, new);
3560 
3561    return true;
3562 }
3563 
3564 static bool
3565 nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
3566                                       void *data)
3567 {
3568    switch (instr->type) {
3569    case nir_instr_type_load_const:
3570       return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
3571 
3572    case nir_instr_type_intrinsic:
3573       return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
3574    default:
3575       return false;
3576    }
3577 }
3578 
3579 static bool
3580 nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
3581 {
3582    return nir_shader_instructions_pass(s,
3583                                        nir_to_tgsi_lower_64bit_to_vec2_instr,
3584                                        nir_metadata_control_flow,
3585                                        NULL);
3586 }
3587 
3588 struct ntt_lower_tex_state {
3589    nir_scalar channels[8];
3590    unsigned i;
3591 };
3592 
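     /* Moves the channels of the given tex source (if present) into the
      * packing state and removes the source from the instruction, so they can
      * be re-emitted as part of the combined backend1/backend2 vectors.
      */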
3593 static void
3594 nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
3595                                 nir_tex_instr *instr,
3596                                 nir_tex_src_type tex_src_type,
3597                                 struct ntt_lower_tex_state *s)
3598 {
3599    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
3600    if (tex_src < 0)
3601       return;
3602 
3603    nir_def *def = instr->src[tex_src].src.ssa;
3604    for (int i = 0; i < def->num_components; i++) {
3605       s->channels[s->i++] = nir_get_scalar(def, i);
3606    }
3607 
3608    nir_tex_instr_remove_src(instr, tex_src);
3609 }
3610 
3611 /**
3612  * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
3613  * src.  This lets NIR handle the coalescing of the vec4 rather than trying to
3614  * manage it on our own, and may lead to more vectorization.
3615  */
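     /* Rough sketch of the packing below for a 2D shadow sample with bias:
      *
      *    backend1 = vec4(coord.x, coord.y, comparator, bias)
      *
      * Unused trailing channels are filled with a repeat of channel 0 rather
      * than undefs, to avoid extra register moves.
      */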
3616 static bool
3617 nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
3618 {
3619    if (instr->type != nir_instr_type_tex)
3620       return false;
3621 
3622    nir_tex_instr *tex = nir_instr_as_tex(instr);
3623 
3624    if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
3625       return false;
3626 
3627    b->cursor = nir_before_instr(instr);
3628 
3629    struct ntt_lower_tex_state s = {0};
3630 
3631    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
3632    /* We always have at least two slots for the coordinate, even on 1D. */
3633    s.i = MAX2(s.i, 2);
3634 
3635    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
3636    s.i = MAX2(s.i, 3);
3637 
3638    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
3639 
3640    /* XXX: LZ */
3641    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
3642    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
3643    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
3644 
3645    /* No need to pack undefs in unused channels of the tex instr */
3646    while (!s.channels[s.i - 1].def)
3647       s.i--;
3648 
3649    /* Instead of putting undefs in the unused slots of the vecs, just put in
3650     * another used channel.  Otherwise, we'll get unnecessary moves into
3651     * registers.
3652     */
3653    assert(s.channels[0].def != NULL);
3654    for (int i = 1; i < s.i; i++) {
3655       if (!s.channels[i].def)
3656          s.channels[i] = s.channels[0];
3657    }
3658 
3659    nir_tex_instr_add_src(tex, nir_tex_src_backend1,
3660                          nir_vec_scalars(b, s.channels, MIN2(s.i, 4)));
3661    if (s.i > 4)
3662       nir_tex_instr_add_src(tex, nir_tex_src_backend2,
3663                             nir_vec_scalars(b, &s.channels[4], s.i - 4));
3664 
3665    return true;
3666 }
3667 
3668 static bool
3669 nir_to_tgsi_lower_tex(nir_shader *s)
3670 {
3671    return nir_shader_instructions_pass(s,
3672                                        nir_to_tgsi_lower_tex_instr,
3673                                        nir_metadata_control_flow,
3674                                        NULL);
3675 }
3676 
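     /* If the shader was built with nir_shader_compiler_options that don't
      * match what nir-to-tgsi relies on (the lowering flags below, rotate
      * support, fsqrt lowering, indirect unrolling), clone the options and
      * force the expected values so the later lowering passes behave
      * consistently.
      */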
3677 static void
3678 ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
3679                     const struct nir_to_tgsi_options *ntt_options)
3680 {
3681    const struct nir_shader_compiler_options *options = s->options;
3682    bool lower_fsqrt =
3683       !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
3684                                 PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
3685 
3686    bool force_indirect_unrolling_sampler =
3687       screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL) < 400;
3688 
3689    nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3690 
3691    if (!options->lower_extract_byte ||
3692        !options->lower_extract_word ||
3693        !options->lower_insert_byte ||
3694        !options->lower_insert_word ||
3695        !options->lower_fdph ||
3696        !options->lower_flrp64 ||
3697        !options->lower_fmod ||
3698        !options->lower_uadd_carry ||
3699        !options->lower_usub_borrow ||
3700        !options->lower_uadd_sat ||
3701        !options->lower_usub_sat ||
3702        !options->lower_uniforms_to_ubo ||
3703        !options->lower_vector_cmp ||
3704        options->has_rotate8 ||
3705        options->has_rotate16 ||
3706        options->has_rotate32 ||
3707        options->lower_fsqrt != lower_fsqrt ||
3708        options->force_indirect_unrolling != no_indirects_mask ||
3709        force_indirect_unrolling_sampler) {
3710       nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
3711       *new_options = *s->options;
3712 
3713       new_options->lower_extract_byte = true;
3714       new_options->lower_extract_word = true;
3715       new_options->lower_insert_byte = true;
3716       new_options->lower_insert_word = true;
3717       new_options->lower_fdph = true;
3718       new_options->lower_flrp64 = true;
3719       new_options->lower_fmod = true;
3720       new_options->lower_uadd_carry = true;
3721       new_options->lower_usub_borrow = true;
3722       new_options->lower_uadd_sat = true;
3723       new_options->lower_usub_sat = true;
3724       new_options->lower_uniforms_to_ubo = true;
3725       new_options->lower_vector_cmp = true;
3726       new_options->lower_fsqrt = lower_fsqrt;
3727       new_options->has_rotate8 = false;
3728       new_options->has_rotate16 = false;
3729       new_options->has_rotate32 = false;
3730       new_options->force_indirect_unrolling = no_indirects_mask;
3731       new_options->force_indirect_unrolling_sampler = force_indirect_unrolling_sampler;
3732 
3733       s->options = new_options;
3734    }
3735 }
3736 
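     /* Lowers atomic_counter_pre_dec to atomic_counter_post_dec plus an add of
      * -1, since only the post-decrement form is translated to TGSI.
      */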
3737 static bool
3738 ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
3739 {
3740    return (instr->type == nir_instr_type_intrinsic &&
3741            nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
3742 }
3743 
3744 static nir_def *
3745 ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
3746 {
3747    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3748 
3749    nir_def *old_result = &intr->def;
3750    intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;
3751 
3752    return nir_iadd_imm(b, old_result, -1);
3753 }
3754 
3755 static bool
3756 ntt_lower_atomic_pre_dec(nir_shader *s)
3757 {
3758    return nir_shader_lower_instructions(s,
3759                                         ntt_lower_atomic_pre_dec_filter,
3760                                         ntt_lower_atomic_pre_dec_lower, NULL);
3761 }
3762 
3763 /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
3764 static void
3765 nir_to_tgsi_lower_txp(nir_shader *s)
3766 {
3767    nir_lower_tex_options lower_tex_options = {
3768        .lower_txp = 0,
3769    };
3770 
3771    nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
3772       nir_foreach_instr(instr, block) {
3773          if (instr->type != nir_instr_type_tex)
3774             continue;
3775          nir_tex_instr *tex = nir_instr_as_tex(instr);
3776 
3777          if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
3778             continue;
3779 
3780          bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
3781          bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
3782          bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
3783 
3784          /* We can do TXP for any tex (not txg) where we can fit all the
3785           * coordinates and comparator and projector in one vec4 without any
3786           * other modifiers to add on.
3787           *
3788           * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
3789           * if we get any funny projectors then we just blow them all away.
3790           */
3791          if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
3792             lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
3793       }
3794    }
3795 
3796    /* nir_lower_tex must be run even if no options are set, because we need the
3797     * LOD to be set for query_levels and for non-fragment shaders.
3798     */
3799    NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
3800 }
3801 
3802 static bool
3803 nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
3804 {
3805    return (instr->type == nir_instr_type_intrinsic &&
3806            nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
3807 }
3808 
3809 static nir_def *
3810 nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
3811 {
3812    nir_variable *var = nir_get_variable_with_location(b->shader, nir_var_shader_in,
3813                                                       VARYING_SLOT_PRIMITIVE_ID, glsl_uint_type());
3814 
3815    nir_io_semantics semantics = {
3816       .location = var->data.location,
3817        .num_slots = 1
3818    };
3819    return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
3820                          .base = var->data.driver_location,
3821                          .io_semantics = semantics);
3822 }
3823 
3824 static bool
3825 nir_lower_primid_sysval_to_input(nir_shader *s)
3826 {
3827    return nir_shader_lower_instructions(s,
3828                                         nir_lower_primid_sysval_to_input_filter,
3829                                         nir_lower_primid_sysval_to_input_lower, NULL);
3830 }
3831 
3832 const void *
3833 nir_to_tgsi(struct nir_shader *s,
3834             struct pipe_screen *screen)
3835 {
3836    static const struct nir_to_tgsi_options default_ntt_options = {0};
3837    return nir_to_tgsi_options(s, screen, &default_ntt_options);
3838 }
3839 
3840 /* Prevent lower_vec_to_mov from coalescing 64-to-32 conversions and comparisons
3841  * into unsupported channels of registers.
3842  */
3843 static bool
3844 ntt_vec_to_mov_writemask_cb(const nir_instr *instr, unsigned writemask, UNUSED const void *_data)
3845 {
3846    if (instr->type != nir_instr_type_alu)
3847       return false;
3848 
3849    nir_alu_instr *alu = nir_instr_as_alu(instr);
3850    int dst_32 = alu->def.bit_size == 32;
3851    int src_64 = nir_src_bit_size(alu->src[0].src) == 64;
3852 
3853    if (src_64 && dst_32) {
3854       int num_srcs = nir_op_infos[alu->op].num_inputs;
3855 
3856       if (num_srcs == 2 || nir_op_infos[alu->op].output_type == nir_type_bool32) {
3857          /* TGSI's 64-bit compares storing to 32-bit are weird and write .xz
3858           * instead of .xy.  Just support scalar compares storing to .x,
3859           * GLSL-to-TGSI only ever emitted scalar ops anyway.
3860           */
3861         if (writemask != TGSI_WRITEMASK_X)
3862            return false;
3863       } else {
3864          /* TGSI's 64-to-32-bit conversions can only store to .xy (since a TGSI
3865           * register can only store a dvec2).  Don't try to coalesce to write to
3866           * .zw.
3867           */
3868          if (writemask & ~(TGSI_WRITEMASK_XY))
3869             return false;
3870       }
3871    }
3872 
3873    return true;
3874 }
3875 
3876 /**
3877  * Translates the NIR shader to TGSI.
3878  *
3879  * This requires some lowering of the NIR shader to prepare it for translation.
3880  * We take ownership of the NIR shader passed, returning a reference to the new
3881  * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
3882  */
3883 const void *nir_to_tgsi_options(struct nir_shader *s,
3884                                 struct pipe_screen *screen,
3885                                 const struct nir_to_tgsi_options *options)
3886 {
3887    struct ntt_compile *c;
3888    const void *tgsi_tokens;
3889    nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3890    bool native_integers = screen->get_shader_param(screen,
3891                                                    pipe_shader_type_from_mesa(s->info.stage),
3892                                                    PIPE_SHADER_CAP_INTEGERS);
3893    const struct nir_shader_compiler_options *original_options = s->options;
3894 
3895    ntt_fix_nir_options(screen, s, options);
3896 
3897    /* Lower array indexing on FS inputs.  Since we don't set
3898     * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
3899     * elements by ureg, and so dynamically indexing them would be invalid.
3900     * Ideally we would set that ureg flag based on
3901     * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
3902     * splitting NIR VS outputs to elements even if the FS doesn't get the
3903     * corresponding splitting, and virgl depends on TGSI across link boundaries
3904     * having matching declarations.
3905     */
3906    if (s->info.stage == MESA_SHADER_FRAGMENT) {
3907       NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
3908       NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
3909    }
3910 
3911    /* Lower tesslevel indirect derefs for tessellation shaders.
3912     * tesslevels are now a compact array variable and NIR expects a constant
3913     * array index into the compact array variable.
3914     */
3915    if (s->info.stage == MESA_SHADER_TESS_CTRL ||
3916        s->info.stage == MESA_SHADER_TESS_EVAL) {
3917       NIR_PASS_V(s, nir_lower_indirect_derefs, 0, UINT32_MAX);
3918    }
3919 
3920    NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3921               type_size, (nir_lower_io_options)0);
3922 
3923    nir_to_tgsi_lower_txp(s);
3924    NIR_PASS_V(s, nir_to_tgsi_lower_tex);
3925 
3926    /* While TGSI can represent PRIMID as either an input or a system value,
3927     * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
3928     * depend on that.
3929     */
3930    if (s->info.stage == MESA_SHADER_GEOMETRY)
3931       NIR_PASS_V(s, nir_lower_primid_sysval_to_input);
3932 
3933    if (s->info.num_abos)
3934       NIR_PASS_V(s, ntt_lower_atomic_pre_dec);
3935 
3936    if (!original_options->lower_uniforms_to_ubo) {
3937       NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
3938                  screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
3939                  !native_integers);
3940    }
3941 
3942    /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
3943     * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
3944     * duplication logic we just make it so that we only see vec2s.
3945     */
3946    NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
3947    NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
3948 
3949    if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
3950       NIR_PASS_V(s, nir_lower_ubo_vec4);
3951 
3952    ntt_optimize_nir(s, screen, options);
3953 
3954    NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
3955 
3956    /* Lower demote_if to if (cond) { demote } because TGSI doesn't have a DEMOTE_IF. */
3957    NIR_PASS_V(s, nir_lower_discard_if, nir_lower_demote_if_to_cf);
3958 
3959    NIR_PASS_V(s, nir_lower_frexp);
3960 
3961    bool progress;
3962    do {
3963       progress = false;
3964       NIR_PASS(progress, s, nir_opt_algebraic_late);
3965       if (progress) {
3966          NIR_PASS_V(s, nir_copy_prop);
3967          NIR_PASS_V(s, nir_opt_dce);
3968          NIR_PASS_V(s, nir_opt_cse);
3969       }
3970    } while (progress);
3971 
3972    NIR_PASS_V(s, nir_opt_combine_barriers, NULL, NULL);
3973 
3974    if (screen->get_shader_param(screen,
3975                                 pipe_shader_type_from_mesa(s->info.stage),
3976                                 PIPE_SHADER_CAP_INTEGERS)) {
3977       NIR_PASS_V(s, nir_lower_bool_to_int32);
3978    } else {
3979       NIR_PASS_V(s, nir_lower_int_to_float);
3980       NIR_PASS_V(s, nir_lower_bool_to_float,
3981                  !options->lower_cmp && !options->lower_fabs);
3982       /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
3983       NIR_PASS_V(s, nir_copy_prop);
3984       NIR_PASS_V(s, nir_opt_dce);
3985    }
3986 
3987    nir_move_options move_all =
3988        nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
3989        nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
3990 
3991    NIR_PASS_V(s, nir_opt_move, move_all);
3992 
3993    NIR_PASS_V(s, nir_convert_from_ssa, true);
3994    NIR_PASS_V(s, nir_lower_vec_to_regs, ntt_vec_to_mov_writemask_cb, NULL);
3995 
3996    /* locals_to_reg_intrinsics will leave dead derefs that are good to clean up.
3997     */
3998    NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
3999    NIR_PASS_V(s, nir_opt_dce);
4000 
4001    /* See comment in ntt_get_alu_src for supported modifiers */
4002    NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs);
4003 
4004    if (NIR_DEBUG(TGSI)) {
4005       fprintf(stderr, "NIR before translation to TGSI:\n");
4006       nir_print_shader(s, stderr);
4007    }
4008 
4009    c = rzalloc(NULL, struct ntt_compile);
4010    c->screen = screen;
4011    c->options = options;
4012 
4013    c->needs_texcoord_semantic =
4014       screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
4015    c->has_txf_lz =
4016       screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);
4017 
4018    c->s = s;
4019    c->native_integers = native_integers;
4020    c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
4021    ureg_setup_shader_info(c->ureg, &s->info);
4022    if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
4023       ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
4024 
4025    if (s->info.stage == MESA_SHADER_FRAGMENT) {
4026       /* The draw module's polygon stipple layer doesn't respect the chosen
4027        * coordinate mode, so leave it as unspecified unless we're actually
4028        * reading the position in the shader already.  See
4029        * gl-2.1-polygon-stipple-fs on softpipe.
4030        */
4031       if ((s->info.inputs_read & VARYING_BIT_POS) ||
4032           BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
4033          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
4034                        s->info.fs.origin_upper_left ?
4035                        TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
4036                        TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
4037 
4038          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
4039                        s->info.fs.pixel_center_integer ?
4040                        TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
4041                        TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
4042       }
4043    }
4044    /* Emit the main function */
4045    nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
4046    ntt_emit_impl(c, impl);
4047    ureg_END(c->ureg);
4048 
4049    tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
4050 
4051    if (NIR_DEBUG(TGSI)) {
4052       fprintf(stderr, "TGSI after translation from NIR:\n");
4053       tgsi_dump(tgsi_tokens, 0);
4054    }
4055 
4056    ureg_destroy(c->ureg);
4057 
4058    ralloc_free(c);
4059    ralloc_free(s);
4060 
4061    return tgsi_tokens;
4062 }
4063 
4064 static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
4065    .fdot_replicates = true,
4066    .fuse_ffma32 = true,
4067    .fuse_ffma64 = true,
4068    .lower_extract_byte = true,
4069    .lower_extract_word = true,
4070    .lower_insert_byte = true,
4071    .lower_insert_word = true,
4072    .lower_fdph = true,
4073    .lower_flrp64 = true,
4074    .lower_fmod = true,
4075    .lower_uniforms_to_ubo = true,
4076    .lower_uadd_carry = true,
4077    .lower_usub_borrow = true,
4078    .lower_uadd_sat = true,
4079    .lower_usub_sat = true,
4080    .lower_vector_cmp = true,
4081    .lower_int64_options = nir_lower_imul_2x32_64,
4082    .use_interpolated_input_intrinsics = true,
4083 
4084    /* TGSI doesn't have a semantic for local or global index, just local and
4085     * workgroup id.
4086     */
4087    .lower_cs_local_index_to_id = true,
4088    .has_ddx_intrinsics = true,
4089 };
4090 
4091 /* Returns the default compiler options for drivers with only nir-to-tgsi-based
4092  * NIR support.
4093  */
4094 const void *
4095 nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
4096                                  enum pipe_shader_ir ir,
4097                                  unsigned shader)
4098 {
4099    assert(ir == PIPE_SHADER_IR_NIR);
4100    return &nir_to_tgsi_compiler_options;
4101 }
4102 
4103 /** Helper for getting TGSI tokens to store for a pipe_shader_state CSO. */
4104 const void *
4105 pipe_shader_state_to_tgsi_tokens(struct pipe_screen *screen,
4106                                  const struct pipe_shader_state *cso)
4107 {
4108    if (cso->type == PIPE_SHADER_IR_NIR) {
4109       return nir_to_tgsi((nir_shader *)cso->ir.nir, screen);
4110    } else {
4111       assert(cso->type == PIPE_SHADER_IR_TGSI);
4112       /* we need to keep a local copy of the tokens */
4113       return tgsi_dup_tokens(cso->tokens);
4114    }
4115 }
4116