1 /*
2 * Copyright © 2014-2015 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "compiler/nir/nir.h"
25 #include "compiler/nir/nir_deref.h"
26 #include "compiler/nir/nir_legacy.h"
27 #include "compiler/nir/nir_worklist.h"
28 #include "nir/nir_to_tgsi.h"
29 #include "pipe/p_screen.h"
30 #include "pipe/p_state.h"
31 #include "tgsi/tgsi_dump.h"
32 #include "tgsi/tgsi_from_mesa.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_ureg.h"
36 #include "tgsi/tgsi_util.h"
37 #include "util/u_debug.h"
38 #include "util/u_math.h"
39 #include "util/u_memory.h"
40 #include "util/u_dynarray.h"
41
42 struct ntt_insn {
43 enum tgsi_opcode opcode;
44 struct ureg_dst dst[2];
45 struct ureg_src src[4];
46 enum tgsi_texture_type tex_target;
47 enum tgsi_return_type tex_return_type;
48 struct tgsi_texture_offset tex_offset[4];
49
50 unsigned mem_qualifier;
51 enum pipe_format mem_format;
52
53 bool is_tex : 1;
54 bool is_mem : 1;
55 bool precise : 1;
56 };
57
58 struct ntt_block {
59 /* Array of struct ntt_insn */
60 struct util_dynarray insns;
61 int start_ip;
62 int end_ip;
63 };
64
65 struct ntt_reg_interval {
66 uint32_t start, end;
67 };
68
69 struct ntt_compile {
70 nir_shader *s;
71 nir_function_impl *impl;
72 const struct nir_to_tgsi_options *options;
73 struct pipe_screen *screen;
74 struct ureg_program *ureg;
75
76 bool needs_texcoord_semantic;
77 bool native_integers;
78 bool has_txf_lz;
79
80 bool addr_declared[3];
81 struct ureg_dst addr_reg[3];
82
83 /* if condition set up at the end of a block, for ntt_emit_if(). */
84 struct ureg_src if_cond;
85
86 /* TGSI temps for our NIR SSA and register values. */
87 struct ureg_dst *reg_temp;
88 struct ureg_src *ssa_temp;
89
90 struct ntt_reg_interval *liveness;
91
92 /* Map from nir_block to ntt_block */
93 struct hash_table *blocks;
94 struct ntt_block *cur_block;
95 unsigned current_if_else;
96 unsigned cf_label;
97
98 /* Whether we're currently emitting instructions for a precise NIR instruction. */
99 bool precise;
100
101 unsigned num_temps;
102 unsigned first_non_array_temp;
103
104 /* Mappings from driver_location to TGSI input/output number.
105 *
106 * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
107 * their numbers assigned incrementally, unlike inputs or constants.
108 */
109 struct ureg_src *input_index_map;
110 uint64_t centroid_inputs;
111
112 uint32_t first_ubo;
113 uint32_t first_ssbo;
114
115 struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
116 };
117
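/* Allocates a new temporary in the compiler's virtual TGSI temp space; these
 * indices are mapped to real ureg temporaries later by ntt_allocate_regs().
 */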
118 static struct ureg_dst
119 ntt_temp(struct ntt_compile *c)
120 {
121 return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
122 }
123
124 static struct ntt_block *
125 ntt_block_from_nir(struct ntt_compile *c, struct nir_block *block)
126 {
127 struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
128 return entry->data;
129 }
130
131 static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
132 static void ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list);
133
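/* Appends an instruction to the current ntt_block and returns a pointer to it
 * so callers can fill in extra fields (tex/mem state) before ureg emission.
 */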
134 static struct ntt_insn *
135 ntt_insn(struct ntt_compile *c, enum tgsi_opcode opcode,
136 struct ureg_dst dst,
137 struct ureg_src src0, struct ureg_src src1,
138 struct ureg_src src2, struct ureg_src src3)
139 {
140 struct ntt_insn insn = {
141 .opcode = opcode,
142 .dst = { dst, ureg_dst_undef() },
143 .src = { src0, src1, src2, src3 },
144 .precise = c->precise,
145 };
146 util_dynarray_append(&c->cur_block->insns, struct ntt_insn, insn);
147 return util_dynarray_top_ptr(&c->cur_block->insns, struct ntt_insn);
148 }
149
150 #define OP00( op ) \
151 static inline void ntt_##op(struct ntt_compile *c) \
152 { \
153 ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
154 }
155
156 #define OP01( op ) \
157 static inline void ntt_##op(struct ntt_compile *c, \
158 struct ureg_src src0) \
159 { \
160 ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
161 }
162
163
164 #define OP10( op ) \
165 static inline void ntt_##op(struct ntt_compile *c, \
166 struct ureg_dst dst) \
167 { \
168 ntt_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
169 }
170
171 #define OP11( op ) \
172 static inline void ntt_##op(struct ntt_compile *c, \
173 struct ureg_dst dst, \
174 struct ureg_src src0) \
175 { \
176 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
177 }
178
179 #define OP12( op ) \
180 static inline void ntt_##op(struct ntt_compile *c, \
181 struct ureg_dst dst, \
182 struct ureg_src src0, \
183 struct ureg_src src1) \
184 { \
185 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
186 }
187
188 #define OP13( op ) \
189 static inline void ntt_##op(struct ntt_compile *c, \
190 struct ureg_dst dst, \
191 struct ureg_src src0, \
192 struct ureg_src src1, \
193 struct ureg_src src2) \
194 { \
195 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef()); \
196 }
197
198 #define OP14( op ) \
199 static inline void ntt_##op(struct ntt_compile *c, \
200 struct ureg_dst dst, \
201 struct ureg_src src0, \
202 struct ureg_src src1, \
203 struct ureg_src src2, \
204 struct ureg_src src3) \
205 { \
206 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3); \
207 }
208
209 /* We hand-craft our tex instructions */
210 #define OP12_TEX(op)
211 #define OP14_TEX(op)
212
213 /* Use a template include to generate a correctly-typed ntt_OP()
214 * function for each TGSI opcode:
215 */
216 #include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
217
218 /**
219 * Interprets a nir_load_const used as a NIR src as a uint.
220 *
221 * For non-native-integer drivers, nir_load_const_instrs used by an integer ALU
222 * instruction (or in a phi-web used by an integer ALU instruction) were
223 * converted to floats and the ALU instruction swapped to the float equivalent.
224 * However, this means that integer load_consts used by intrinsics (which don't
225 * normally get that conversion) may have been reformatted to be floats. Given
226 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
227 * we can just look and see if they look like floats and convert them back to
228 * ints.
229 */
230 static uint32_t
231 ntt_src_as_uint(struct ntt_compile *c, nir_src src)
232 {
233 uint32_t val = nir_src_as_uint(src);
234 if (!c->native_integers && val >= fui(1.0))
235 val = (uint32_t)uif(val);
236 return val;
237 }
238
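/* Expands a write mask over 64-bit components to the corresponding mask over
 * pairs of 32-bit channels: .x becomes .xy and .y becomes .zw, so 0x3 -> 0xf.
 */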
239 static unsigned
240 ntt_64bit_write_mask(unsigned write_mask)
241 {
242 return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
243 }
244
245 static struct ureg_src
246 ntt_64bit_1f(struct ntt_compile *c)
247 {
248 return ureg_imm4u(c->ureg,
249 0x00000000, 0x3ff00000,
250 0x00000000, 0x3ff00000);
251 }
252
253 /* Per-channel masks of def/use within the block, and the per-channel
254 * livein/liveout for the block as a whole.
255 */
256 struct ntt_live_reg_block_state {
257 uint8_t *def, *use, *livein, *liveout, *defin, *defout;
258 };
259
260 struct ntt_live_reg_state {
261 unsigned bitset_words;
262
263 struct ntt_reg_interval *regs;
264
265 /* Used in propagate_across_edge() */
266 BITSET_WORD *tmp_live;
267
268 struct ntt_live_reg_block_state *blocks;
269
270 nir_block_worklist worklist;
271 };
272
273 static void
274 ntt_live_reg_mark_use(struct ntt_compile *c, struct ntt_live_reg_block_state *bs,
275 int ip, unsigned index, unsigned used_mask)
276 {
277 bs->use[index] |= used_mask & ~bs->def[index];
278
279 c->liveness[index].start = MIN2(c->liveness[index].start, ip);
280 c->liveness[index].end = MAX2(c->liveness[index].end, ip);
281
282 }
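/* Walks every instruction once in block order, assigning each an instruction
 * index (ip), and builds the per-block def[]/use[] channel masks while seeding
 * each temporary's conservative live interval.
 */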
283 static void
284 ntt_live_reg_setup_def_use(struct ntt_compile *c, nir_function_impl *impl, struct ntt_live_reg_state *state)
285 {
286 for (int i = 0; i < impl->num_blocks; i++) {
287 state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
288 state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
289 state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
290 state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
291 state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
292 state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
293 }
294
295 int ip = 0;
296 nir_foreach_block(block, impl) {
297 struct ntt_live_reg_block_state *bs = &state->blocks[block->index];
298 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
299
300 ntt_block->start_ip = ip;
301
302 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
303 const struct tgsi_opcode_info *opcode_info =
304 tgsi_get_opcode_info(insn->opcode);
305
306 /* Set up use[] for the srcs.
307 *
308 * Uses are the channels of the reg read in the block that don't have a
309 * preceding def to screen them off. Note that we don't do per-element
310 * tracking of array regs, so they're never screened off.
311 */
312 for (int i = 0; i < opcode_info->num_src; i++) {
313 if (insn->src[i].File != TGSI_FILE_TEMPORARY)
314 continue;
315 int index = insn->src[i].Index;
316
317 uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
318 insn->dst->WriteMask,
319 insn->src[i].SwizzleX,
320 insn->src[i].SwizzleY,
321 insn->src[i].SwizzleZ,
322 insn->src[i].SwizzleW,
323 insn->tex_target,
324 insn->tex_target);
325
326 assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
327 ntt_live_reg_mark_use(c, bs, ip, index, used_mask);
328 }
329
330 if (insn->is_tex) {
331 for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
332 if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY)
333 ntt_live_reg_mark_use(c, bs, ip, insn->tex_offset[i].Index, 0xf);
334 }
335 }
336
337 /* Set up def[] for the srcs.
338 *
339 * Defs are the unconditionally-written (not R/M/W) channels of the reg in
340 * the block that don't have a preceding use.
341 */
342 for (int i = 0; i < opcode_info->num_dst; i++) {
343 if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
344 continue;
345 int index = insn->dst[i].Index;
346 uint32_t writemask = insn->dst[i].WriteMask;
347
348 bs->def[index] |= writemask & ~bs->use[index];
349 bs->defout[index] |= writemask;
350
351 assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
352 c->liveness[index].start = MIN2(c->liveness[index].start, ip);
353 c->liveness[index].end = MAX2(c->liveness[index].end, ip);
354 }
355 ip++;
356 }
357
358 ntt_block->end_ip = ip;
359 }
360 }
361
362 static void
363 ntt_live_regs(struct ntt_compile *c, nir_function_impl *impl)
364 {
365 nir_metadata_require(impl, nir_metadata_block_index);
366
367 c->liveness = rzalloc_array(c, struct ntt_reg_interval, c->num_temps);
368
369 struct ntt_live_reg_state state = {
370 .blocks = rzalloc_array(impl, struct ntt_live_reg_block_state, impl->num_blocks),
371 };
372
373 /* The intervals start out with start > end (indicating unused) */
374 for (int i = 0; i < c->num_temps; i++)
375 c->liveness[i].start = ~0;
376
377 ntt_live_reg_setup_def_use(c, impl, &state);
378
379 /* Make a forward-order worklist of all the blocks. */
380 nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
381 nir_foreach_block(block, impl) {
382 nir_block_worklist_push_tail(&state.worklist, block);
383 }
384
385 /* Propagate defin/defout down the CFG to calculate the live variables
386 * potentially defined along any possible control flow path. We'll use this
387 * to keep things like conditional defs of the reg (or array regs where we
388 * don't track defs!) from making the reg's live range extend back to the
389 * start of the program.
390 */
391 while (!nir_block_worklist_is_empty(&state.worklist)) {
392 nir_block *block = nir_block_worklist_pop_head(&state.worklist);
393 for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
394 nir_block *succ = block->successors[j];
395 if (!succ || succ->index == impl->num_blocks)
396 continue;
397
398 for (int i = 0; i < c->num_temps; i++) {
399 uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
400
401 if (new_def) {
402 state.blocks[succ->index].defin[i] |= new_def;
403 state.blocks[succ->index].defout[i] |= new_def;
404 nir_block_worklist_push_tail(&state.worklist, succ);
405 }
406 }
407 }
408 }
409
410 /* Make a reverse-order worklist of all the blocks. */
411 nir_foreach_block(block, impl) {
412 nir_block_worklist_push_head(&state.worklist, block);
413 }
414
415 /* We're now ready to work through the worklist and update the liveness sets
416 * of each of the blocks. As long as we keep the worklist up-to-date as we
417 * go, everything will get covered.
418 */
419 while (!nir_block_worklist_is_empty(&state.worklist)) {
420 /* We pop them off in the reverse order we pushed them on. This way
421 * the first walk of the instructions is backwards so we only walk
422 * once in the case of no control flow.
423 */
424 nir_block *block = nir_block_worklist_pop_head(&state.worklist);
425 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
426 struct ntt_live_reg_block_state *bs = &state.blocks[block->index];
427
428 for (int i = 0; i < c->num_temps; i++) {
429 /* Collect livein from our successors to include in our liveout. */
430 for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
431 nir_block *succ = block->successors[j];
432 if (!succ || succ->index == impl->num_blocks)
433 continue;
434 struct ntt_live_reg_block_state *sbs = &state.blocks[succ->index];
435
436 uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
437 if (new_liveout) {
438 if (state.blocks[block->index].defout[i])
439 c->liveness[i].end = MAX2(c->liveness[i].end, ntt_block->end_ip);
440 bs->liveout[i] |= sbs->livein[i];
441 }
442 }
443
444 /* Propagate use requests from either our block's uses or our
445 * non-screened-off liveout up to our predecessors.
446 */
447 uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
448 ~bs->livein[i]);
449 if (new_livein) {
450 bs->livein[i] |= new_livein;
451 set_foreach(block->predecessors, entry) {
452 nir_block *pred = (void *)entry->key;
453 nir_block_worklist_push_tail(&state.worklist, pred);
454 }
455
456 if (new_livein & state.blocks[block->index].defin[i])
457 c->liveness[i].start = MIN2(c->liveness[i].start, ntt_block->start_ip);
458 }
459 }
460 }
461
462 ralloc_free(state.blocks);
463 nir_block_worklist_fini(&state.worklist);
464 }
465
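/* At instruction index ip, allocates a real ureg temporary when a virtual
 * temp's live interval starts and releases it when the interval ends.  Array
 * temps (index < first_non_array_temp) were declared up front and are skipped.
 */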
466 static void
467 ntt_ra_check(struct ntt_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
468 {
469 if (index < c->first_non_array_temp)
470 return;
471
472 if (c->liveness[index].start == ip && ra_map[index] == ~0)
473 ra_map[index] = ureg_DECL_temporary(c->ureg).Index;
474
475 if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
476 ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
477 BITSET_SET(released, index);
478 }
479 }
480
481 static void
482 ntt_allocate_regs(struct ntt_compile *c, nir_function_impl *impl)
483 {
484 ntt_live_regs(c, impl);
485
486 unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
487 unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));
488
489 /* No RA on NIR array regs */
490 for (int i = 0; i < c->first_non_array_temp; i++)
491 ra_map[i] = i;
492
493 for (int i = c->first_non_array_temp; i < c->num_temps; i++)
494 ra_map[i] = ~0;
495
496 int ip = 0;
497 nir_foreach_block(block, impl) {
498 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
499
500 for (int i = 0; i < c->num_temps; i++)
501 ntt_ra_check(c, ra_map, released, ip, i);
502
503 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
504 const struct tgsi_opcode_info *opcode_info =
505 tgsi_get_opcode_info(insn->opcode);
506
507 for (int i = 0; i < opcode_info->num_src; i++) {
508 if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
509 ntt_ra_check(c, ra_map, released, ip, insn->src[i].Index);
510 insn->src[i].Index = ra_map[insn->src[i].Index];
511 }
512 }
513
514 if (insn->is_tex) {
515 for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
516 if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY) {
517 ntt_ra_check(c, ra_map, released, ip, insn->tex_offset[i].Index);
518 insn->tex_offset[i].Index = ra_map[insn->tex_offset[i].Index];
519 }
520 }
521 }
522
523 for (int i = 0; i < opcode_info->num_dst; i++) {
524 if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
525 ntt_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
526 insn->dst[i].Index = ra_map[insn->dst[i].Index];
527 }
528 }
529 ip++;
530 }
531
532 for (int i = 0; i < c->num_temps; i++)
533 ntt_ra_check(c, ra_map, released, ip, i);
534 }
535 }
536
537 static void
538 ntt_allocate_regs_unoptimized(struct ntt_compile *c, nir_function_impl *impl)
539 {
540 for (int i = c->first_non_array_temp; i < c->num_temps; i++)
541 ureg_DECL_temporary(c->ureg);
542 }
543
544
545 /**
546 * Try to find an iadd of a constant value with a non-constant value in the
547 * nir_src's first component, returning the constant offset and replacing *src
548 * with the non-constant component.
549 */
550 static const uint32_t
551 ntt_extract_const_src_offset(nir_src *src)
552 {
553 nir_scalar s = nir_get_scalar(src->ssa, 0);
554
555 while (nir_scalar_is_alu(s)) {
556 nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr);
557
558 if (alu->op == nir_op_iadd) {
559 for (int i = 0; i < 2; i++) {
560 nir_const_value *v = nir_src_as_const_value(alu->src[i].src);
561 if (v != NULL) {
562 *src = alu->src[1 - i].src;
563 return v[alu->src[i].swizzle[s.comp]].u32;
564 }
565 }
566
567 return 0;
568 }
569
570 /* We'd like to reuse nir_scalar_chase_movs(), but it assumes SSA and that
571 * seems reasonable for something used in inner loops of the compiler.
572 */
573 if (alu->op == nir_op_mov) {
574 s.def = alu->src[0].src.ssa;
575 s.comp = alu->src[0].swizzle[s.comp];
576 } else if (nir_op_is_vec(alu->op)) {
577 s.def = alu->src[s.comp].src.ssa;
578 s.comp = alu->src[s.comp].swizzle[0];
579 } else {
580 return 0;
581 }
582 }
583
584 return 0;
585 }
586
587 static const struct glsl_type *
588 ntt_shader_input_type(struct ntt_compile *c,
589 struct nir_variable *var)
590 {
591 switch (c->s->info.stage) {
592 case MESA_SHADER_GEOMETRY:
593 case MESA_SHADER_TESS_EVAL:
594 case MESA_SHADER_TESS_CTRL:
595 if (glsl_type_is_array(var->type))
596 return glsl_get_array_element(var->type);
597 else
598 return var->type;
599 default:
600 return var->type;
601 }
602 }
603
604 static void
605 ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
606 unsigned *semantic_name, unsigned *semantic_index)
607 {
608 /* We want to use most of tgsi_get_gl_varying_semantic(), but the
609 * !texcoord shifting has already been applied, so avoid that.
610 */
611 if (!c->needs_texcoord_semantic &&
612 (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
613 *semantic_name = TGSI_SEMANTIC_GENERIC;
614 *semantic_index = location - VARYING_SLOT_VAR0;
615 return;
616 }
617
618 tgsi_get_gl_varying_semantic(location, true,
619 semantic_name, semantic_index);
620 }
621
622 /* TGSI varying declarations have a component usage mask associated (used by
623 * r600 and svga).
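 * For example, a dvec2 starting at component 0 occupies all four 32-bit
 * channels, so the mask returned here is TGSI_WRITEMASK_XYZW.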
624 */
625 static uint32_t
626 ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
627 bool is_64)
628 {
629 uint32_t usage_mask =
630 u_bit_consecutive(start_component, num_components);
631
632 if (is_64) {
633 if (start_component >= 2)
634 usage_mask >>= 2;
635
636 uint32_t tgsi_usage_mask = 0;
637
638 if (usage_mask & TGSI_WRITEMASK_X)
639 tgsi_usage_mask |= TGSI_WRITEMASK_XY;
640 if (usage_mask & TGSI_WRITEMASK_Y)
641 tgsi_usage_mask |= TGSI_WRITEMASK_ZW;
642
643 return tgsi_usage_mask;
644 } else {
645 return usage_mask;
646 }
647 }
648
649 /* TGSI varying declarations have a component usage mask associated (used by
650 * r600 and svga).
651 */
652 static uint32_t
653 ntt_tgsi_var_usage_mask(const struct nir_variable *var)
654 {
655 const struct glsl_type *type_without_array =
656 glsl_without_array(var->type);
657 unsigned num_components = glsl_get_vector_elements(type_without_array);
658 if (num_components == 0) /* structs */
659 num_components = 4;
660
661 return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
662 glsl_type_is_64bit(type_without_array));
663 }
664
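/* Declares the TGSI output for a store_output intrinsic (fragment results or
 * varyings) and returns it with the appropriate write mask applied; *frac is
 * set to the starting component of the store.
 */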
665 static struct ureg_dst
666 ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
667 {
668 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
669 int base = nir_intrinsic_base(instr);
670 *frac = nir_intrinsic_component(instr);
671 bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
672
673 struct ureg_dst out;
674 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
675 unsigned semantic_name, semantic_index;
676 tgsi_get_gl_frag_result_semantic(semantics.location,
677 &semantic_name, &semantic_index);
678 semantic_index += semantics.dual_source_blend_index;
679
680 switch (semantics.location) {
681 case FRAG_RESULT_DEPTH:
682 *frac = 2; /* z write is to the .z channel in TGSI */
683 break;
684 case FRAG_RESULT_STENCIL:
685 *frac = 1;
686 break;
687 default:
688 break;
689 }
690
691 out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
692 } else {
693 unsigned semantic_name, semantic_index;
694
695 ntt_get_gl_varying_semantic(c, semantics.location,
696 &semantic_name, &semantic_index);
697
698 uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
699 instr->num_components,
700 is_64);
701 uint32_t gs_streams = semantics.gs_streams;
702 for (int i = 0; i < 4; i++) {
703 if (!(usage_mask & (1 << i)))
704 gs_streams &= ~(0x3 << 2 * i);
705 }
706
707 /* No driver appears to use array_id of outputs. */
708 unsigned array_id = 0;
709
710 /* This bit is lost in the i/o semantics, but it's unused in in-tree
711 * drivers.
712 */
713 bool invariant = semantics.invariant;
714
715 unsigned num_slots = semantics.num_slots;
716 if (semantics.location == VARYING_SLOT_TESS_LEVEL_INNER ||
717 semantics.location == VARYING_SLOT_TESS_LEVEL_OUTER) {
718 /* Compact vars get a num_slots in NIR as number of components, but we
719 * want the number of vec4 slots here.
720 */
721 num_slots = 1;
722 }
723
724 out = ureg_DECL_output_layout(c->ureg,
725 semantic_name, semantic_index,
726 gs_streams,
727 base,
728 usage_mask,
729 array_id,
730 num_slots,
731 invariant);
732 }
733
734 unsigned write_mask;
735 if (nir_intrinsic_has_write_mask(instr))
736 write_mask = nir_intrinsic_write_mask(instr);
737 else
738 write_mask = ((1 << instr->num_components) - 1) << *frac;
739
740 if (is_64) {
741 write_mask = ntt_64bit_write_mask(write_mask);
742 if (*frac >= 2)
743 write_mask = write_mask << 2;
744 } else {
745 write_mask = write_mask << *frac;
746 }
747 return ureg_writemask(out, write_mask);
748 }
749
750 static bool
751 ntt_try_store_in_tgsi_output_with_use(struct ntt_compile *c,
752 struct ureg_dst *dst,
753 nir_src *src)
754 {
755 *dst = ureg_dst_undef();
756
757 switch (c->s->info.stage) {
758 case MESA_SHADER_FRAGMENT:
759 case MESA_SHADER_VERTEX:
760 break;
761 default:
762 /* tgsi_exec (at least) requires that output stores happen per vertex
763 * emitted; you don't get to reuse a previous output value for the next
764 * vertex.
765 */
766 return false;
767 }
768
769 if (nir_src_is_if(src))
770 return false;
771
772 if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
773 return false;
774
775 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(nir_src_parent_instr(src));
776 if (intr->intrinsic != nir_intrinsic_store_output ||
777 !nir_src_is_const(intr->src[1])) {
778 return false;
779 }
780
781 uint32_t frac;
782 *dst = ntt_output_decl(c, intr, &frac);
783 dst->Index += ntt_src_as_uint(c, intr->src[1]);
784
785 return frac == 0;
786 }
787
788 /* If this reg is used only for storing an output, then in the simple
789 * cases we can write directly to the TGSI output instead of having
790 * store_output emit its own MOV.
791 */
792 static bool
793 ntt_try_store_reg_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
794 nir_intrinsic_instr *reg_decl)
795 {
796 assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);
797
798 *dst = ureg_dst_undef();
799
800 /* Look for a single use for try_store_in_tgsi_output */
801 nir_src *use = NULL;
802 nir_foreach_reg_load(src, reg_decl) {
803 nir_intrinsic_instr *load = nir_instr_as_intrinsic(nir_src_parent_instr(src));
804 nir_foreach_use_including_if(load_use, &load->def) {
805 /* We can only have one use */
806 if (use != NULL)
807 return false;
808
809 use = load_use;
810 }
811 }
812
813 if (use == NULL)
814 return false;
815
816 return ntt_try_store_in_tgsi_output_with_use(c, dst, use);
817 }
818
819 /* If this SSA def is used only for storing an output, then in the simple
820 * cases we can write directly to the TGSI output instead of having
821 * store_output emit its own MOV.
822 */
823 static bool
824 ntt_try_store_ssa_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
825 nir_def *def)
826 {
827 *dst = ureg_dst_undef();
828
829 if (!list_is_singular(&def->uses))
830 return false;
831
832 nir_foreach_use_including_if(use, def) {
833 return ntt_try_store_in_tgsi_output_with_use(c, dst, use);
834 }
835 unreachable("We have one use");
836 }
837
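/* Declares the TGSI fragment shader inputs for each NIR shader-in variable and
 * fills input_index_map[] with a ureg_src per driver_location slot.  The FACE
 * input gets converted through a temporary so its value matches what the NIR
 * shader expects (see the FACE handling below).
 */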
838 static void
839 ntt_setup_inputs(struct ntt_compile *c)
840 {
841 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
842 return;
843
844 unsigned num_inputs = 0;
845 int num_input_arrays = 0;
846
847 nir_foreach_shader_in_variable(var, c->s) {
848 const struct glsl_type *type = ntt_shader_input_type(c, var);
849 unsigned array_len =
850 glsl_count_attribute_slots(type, false);
851
852 num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
853 }
854
855 c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
856
857 nir_foreach_shader_in_variable(var, c->s) {
858 const struct glsl_type *type = ntt_shader_input_type(c, var);
859 unsigned array_len =
860 glsl_count_attribute_slots(type, false);
861
862 unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
863 unsigned sample_loc;
864 struct ureg_src decl;
865
866 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
867 interpolation =
868 tgsi_get_interp_mode(var->data.interpolation,
869 var->data.location == VARYING_SLOT_COL0 ||
870 var->data.location == VARYING_SLOT_COL1);
871
872 if (var->data.location == VARYING_SLOT_POS)
873 interpolation = TGSI_INTERPOLATE_LINEAR;
874 }
875
876 unsigned semantic_name, semantic_index;
877 ntt_get_gl_varying_semantic(c, var->data.location,
878 &semantic_name, &semantic_index);
879
880 if (var->data.sample) {
881 sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
882 } else if (var->data.centroid) {
883 sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
884 c->centroid_inputs |= (BITSET_MASK(array_len) <<
885 var->data.driver_location);
886 } else {
887 sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
888 }
889
890 unsigned array_id = 0;
891 if (glsl_type_is_array(type))
892 array_id = ++num_input_arrays;
893
894 uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
895
896 decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
897 semantic_name,
898 semantic_index,
899 interpolation,
900 sample_loc,
901 var->data.driver_location,
902 usage_mask,
903 array_id, array_len);
904
905 if (semantic_name == TGSI_SEMANTIC_FACE) {
906 struct ureg_dst temp = ntt_temp(c);
907 if (c->native_integers) {
908 /* NIR is ~0 front and 0 back, while TGSI is +1 front */
909 ntt_SGE(c, temp, decl, ureg_imm1f(c->ureg, 0));
910 } else {
911 /* tgsi docs say that floating point FACE will be positive for
912 * frontface and negative for backface, but realistically
913 * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
914 * Copy that behavior, since some drivers (r300) have been doing a
915 * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
916 * front face).
917 */
918 temp.Saturate = true;
919 ntt_MOV(c, temp, decl);
920
921 }
922 decl = ureg_src(temp);
923 }
924
925 for (unsigned i = 0; i < array_len; i++) {
926 c->input_index_map[var->data.driver_location + i] = decl;
927 c->input_index_map[var->data.driver_location + i].Index += i;
928 }
929 }
930 }
931
932 static int
933 ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
934 {
935 return a->data.location - b->data.location;
936 }
937
938 /**
939 * Workaround for virglrenderer requiring that TGSI FS output color variables
940 * are declared in order. Besides, it's a lot nicer to read the TGSI this way.
941 */
942 static void
943 ntt_setup_outputs(struct ntt_compile *c)
944 {
945 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
946 return;
947
948 nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);
949
950 nir_foreach_shader_out_variable(var, c->s) {
951 if (var->data.location == FRAG_RESULT_COLOR)
952 ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
953
954 unsigned semantic_name, semantic_index;
955 tgsi_get_gl_frag_result_semantic(var->data.location,
956 &semantic_name, &semantic_index);
957
958 (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
959 }
960 }
961
962 static enum tgsi_texture_type
963 tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
964 {
965 switch (dim) {
966 case GLSL_SAMPLER_DIM_1D:
967 if (is_shadow)
968 return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
969 else
970 return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
971 case GLSL_SAMPLER_DIM_2D:
972 case GLSL_SAMPLER_DIM_EXTERNAL:
973 if (is_shadow)
974 return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
975 else
976 return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
977 case GLSL_SAMPLER_DIM_3D:
978 return TGSI_TEXTURE_3D;
979 case GLSL_SAMPLER_DIM_CUBE:
980 if (is_shadow)
981 return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
982 else
983 return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
984 case GLSL_SAMPLER_DIM_RECT:
985 if (is_shadow)
986 return TGSI_TEXTURE_SHADOWRECT;
987 else
988 return TGSI_TEXTURE_RECT;
989 case GLSL_SAMPLER_DIM_MS:
990 return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
991 case GLSL_SAMPLER_DIM_BUF:
992 return TGSI_TEXTURE_BUFFER;
993 default:
994 unreachable("unknown sampler dim");
995 }
996 }
997
998 static enum tgsi_return_type
999 tgsi_return_type_from_base_type(enum glsl_base_type type)
1000 {
1001 switch (type) {
1002 case GLSL_TYPE_INT:
1003 return TGSI_RETURN_TYPE_SINT;
1004 case GLSL_TYPE_UINT:
1005 return TGSI_RETURN_TYPE_UINT;
1006 case GLSL_TYPE_FLOAT:
1007 return TGSI_RETURN_TYPE_FLOAT;
1008 default:
1009 unreachable("unexpected texture type");
1010 }
1011 }
1012
1013 static void
1014 ntt_setup_uniforms(struct ntt_compile *c)
1015 {
1016 nir_foreach_uniform_variable(var, c->s) {
1017 if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
1018 glsl_type_is_texture(glsl_without_array(var->type))) {
1019 /* Don't use this size for the check for samplers -- arrays of structs
1020 * containing samplers should be ignored, and just the separate lowered
1021 * sampler uniform decl used.
1022 */
1023 int size = glsl_type_get_sampler_count(var->type) +
1024 glsl_type_get_texture_count(var->type);
1025
1026 const struct glsl_type *stype = glsl_without_array(var->type);
1027 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
1028 glsl_sampler_type_is_array(stype),
1029 glsl_sampler_type_is_shadow(stype));
1030 enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
1031 for (int i = 0; i < size; i++) {
1032 ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
1033 target, ret_type, ret_type, ret_type, ret_type);
1034 ureg_DECL_sampler(c->ureg, var->data.binding + i);
1035 }
1036 } else if (glsl_contains_atomic(var->type)) {
1037 uint32_t offset = var->data.offset / 4;
1038 uint32_t size = glsl_atomic_size(var->type) / 4;
1039 ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
1040 }
1041
1042 /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
1043 * size declaration happens with other UBOs below.
1044 */
1045 }
1046
1047 nir_foreach_image_variable(var, c->s) {
1048 int image_count = glsl_type_get_image_count(var->type);
1049 const struct glsl_type *itype = glsl_without_array(var->type);
1050 enum tgsi_texture_type tex_type =
1051 tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
1052 glsl_sampler_type_is_array(itype), false);
1053
1054 for (int i = 0; i < image_count; i++) {
1055 c->images[var->data.binding] = ureg_DECL_image(c->ureg,
1056 var->data.binding + i,
1057 tex_type,
1058 var->data.image.format,
1059 !(var->data.access & ACCESS_NON_WRITEABLE),
1060 false);
1061 }
1062 }
1063
1064 c->first_ubo = ~0;
1065
1066 unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
1067 nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
1068 int ubo = var->data.driver_location;
1069 if (ubo == -1)
1070 continue;
1071
1072 if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
1073 c->first_ubo = MIN2(c->first_ubo, ubo);
1074
1075 unsigned size = glsl_get_explicit_size(var->interface_type, false);
1076
1077 int array_size = 1;
1078 if (glsl_type_is_interface(glsl_without_array(var->type)))
1079 array_size = MAX2(1, glsl_get_aoa_size(var->type));
1080
1081 for (int i = 0; i < array_size; i++) {
1082 /* Even if multiple NIR variables are in the same uniform block, their
1083 * explicit size is the size of the block.
1084 */
1085 if (ubo_sizes[ubo + i])
1086 assert(ubo_sizes[ubo + i] == size);
1087
1088 ubo_sizes[ubo + i] = size;
1089 }
1090 }
1091
1092 for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
1093 if (ubo_sizes[i])
1094 ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
1095 }
1096
1097 if (c->options->lower_ssbo_bindings) {
1098 c->first_ssbo = 255;
1099 nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ssbo) {
1100 if (c->first_ssbo > var->data.binding)
1101 c->first_ssbo = var->data.binding;
1102 }
1103 } else
1104 c->first_ssbo = 0;
1105
1106 /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
1107 * counters
1108 */
1109 bool atomic = false;
1110 for (int i = 0; i < c->s->info.num_ssbos; ++i)
1111 ureg_DECL_buffer(c->ureg, c->first_ssbo + i, atomic);
1112
1113 }
1114
1115 static void
1116 ntt_setup_registers(struct ntt_compile *c)
1117 {
1118 assert(c->num_temps == 0);
1119
1120 nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
1121 /* Permanently allocate all the array regs at the start. */
1122 unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
1123 unsigned index = nir_reg->def.index;
1124
1125 if (num_array_elems != 0) {
1126 struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, num_array_elems, true);
1127 c->reg_temp[index] = decl;
1128 assert(c->num_temps == decl.Index);
1129 c->num_temps += num_array_elems;
1130 }
1131 }
1132 c->first_non_array_temp = c->num_temps;
1133
1134 /* After that, allocate non-array regs in our virtual space that we'll
1135 * register-allocate before ureg emit.
1136 */
1137 nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
1138 unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
1139 unsigned num_components = nir_intrinsic_num_components(nir_reg);
1140 unsigned bit_size = nir_intrinsic_bit_size(nir_reg);
1141 unsigned index = nir_reg->def.index;
1142
1143 /* We already handled arrays */
1144 if (num_array_elems == 0) {
1145 struct ureg_dst decl;
1146 uint32_t write_mask = BITFIELD_MASK(num_components);
1147
1148 if (!ntt_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) {
1149 if (bit_size == 64) {
1150 if (num_components > 2) {
1151 fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
1152 num_components, index);
1153 }
1154
1155 write_mask = ntt_64bit_write_mask(write_mask);
1156 }
1157
1158 decl = ureg_writemask(ntt_temp(c), write_mask);
1159 }
1160 c->reg_temp[index] = decl;
1161 }
1162 }
1163 }
1164
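/* Turns a nir_load_const into a TGSI immediate: float-typed for
 * non-native-integer drivers, with 64-bit values split into lo/hi 32-bit
 * pairs otherwise.
 */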
1165 static struct ureg_src
1166 ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
1167 {
1168 int num_components = instr->def.num_components;
1169
1170 if (!c->native_integers) {
1171 float values[4];
1172 assert(instr->def.bit_size == 32);
1173 for (int i = 0; i < num_components; i++)
1174 values[i] = uif(instr->value[i].u32);
1175
1176 return ureg_DECL_immediate(c->ureg, values, num_components);
1177 } else {
1178 uint32_t values[4];
1179
1180 if (instr->def.bit_size == 32) {
1181 for (int i = 0; i < num_components; i++)
1182 values[i] = instr->value[i].u32;
1183 } else {
1184 assert(num_components <= 2);
1185 for (int i = 0; i < num_components; i++) {
1186 values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
1187 values[i * 2 + 1] = instr->value[i].u64 >> 32;
1188 }
1189 num_components *= 2;
1190 }
1191
1192 return ureg_DECL_immediate_uint(c->ureg, values, num_components);
1193 }
1194 }
1195
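/* Loads the given value into one of the TGSI ADDR registers (declaring them on
 * demand) using ARL/UARL, and returns a scalar source suitable for indirect
 * addressing.
 */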
1196 static struct ureg_src
1197 ntt_reladdr(struct ntt_compile *c, struct ureg_src addr, int addr_index)
1198 {
1199 assert(addr_index < ARRAY_SIZE(c->addr_reg));
1200
1201 for (int i = 0; i <= addr_index; i++) {
1202 if (!c->addr_declared[i]) {
1203 c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
1204 TGSI_WRITEMASK_X);
1205 c->addr_declared[i] = true;
1206 }
1207 }
1208
1209 if (c->native_integers)
1210 ntt_UARL(c, c->addr_reg[addr_index], addr);
1211 else
1212 ntt_ARL(c, c->addr_reg[addr_index], addr);
1213 return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
1214 }
1215
1216 /* Forward declare for recursion with indirects */
1217 static struct ureg_src
1218 ntt_get_src(struct ntt_compile *c, nir_src src);
1219
1220 static struct ureg_src
1221 ntt_get_chased_src(struct ntt_compile *c, nir_legacy_src *src)
1222 {
1223 if (src->is_ssa) {
1224 if (src->ssa->parent_instr->type == nir_instr_type_load_const)
1225 return ntt_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr));
1226
1227 return c->ssa_temp[src->ssa->index];
1228 } else {
1229 struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
1230 reg_temp.Index += src->reg.base_offset;
1231
1232 if (src->reg.indirect) {
1233 struct ureg_src offset = ntt_get_src(c, nir_src_for_ssa(src->reg.indirect));
1234 return ureg_src_indirect(ureg_src(reg_temp),
1235 ntt_reladdr(c, offset, 0));
1236 } else {
1237 return ureg_src(reg_temp);
1238 }
1239 }
1240 }
1241
1242 static struct ureg_src
1243 ntt_get_src(struct ntt_compile *c, nir_src src)
1244 {
1245 nir_legacy_src chased = nir_legacy_chase_src(&src);
1246 return ntt_get_chased_src(c, &chased);
1247 }
1248
1249 static struct ureg_src
1250 ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
1251 {
1252 /* We only support 32-bit float modifiers. The only other modifier type
1253 * officially supported by TGSI is 32-bit integer negates, but even those are
1254 * broken on virglrenderer, so skip lowering all integer and f64 float mods.
1255 *
1256 * The options->lower_fabs requests that we not have native source modifiers
1257 * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
1258 */
1259 nir_legacy_alu_src src =
1260 nir_legacy_chase_alu_src(&instr->src[i], !c->options->lower_fabs);
1261 struct ureg_src usrc = ntt_get_chased_src(c, &src.src);
1262
1263 /* Expand double/dvec2 src references to TGSI swizzles using a pair of 32-bit
1264 * channels. We skip this for undefs, as those don't get split to vec2s (but
1265 * the specific swizzles from an undef don't matter)
1266 */
1267 if (nir_src_bit_size(instr->src[i].src) == 64 &&
1268 !(src.src.is_ssa && src.src.ssa->parent_instr->type == nir_instr_type_undef)) {
1269 int chan1 = 1;
1270 if (nir_op_infos[instr->op].input_sizes[i] == 0) {
1271 chan1 = instr->def.num_components > 1 ? 1 : 0;
1272 }
1273 usrc = ureg_swizzle(usrc,
1274 src.swizzle[0] * 2,
1275 src.swizzle[0] * 2 + 1,
1276 src.swizzle[chan1] * 2,
1277 src.swizzle[chan1] * 2 + 1);
1278 } else {
1279 usrc = ureg_swizzle(usrc,
1280 src.swizzle[0],
1281 src.swizzle[1],
1282 src.swizzle[2],
1283 src.swizzle[3]);
1284 }
1285
1286 if (src.fabs)
1287 usrc = ureg_abs(usrc);
1288 if (src.fneg)
1289 usrc = ureg_negate(usrc);
1290
1291 return usrc;
1292 }
1293
1294 /* Reswizzles a source so that the unset channels in the write mask still refer
1295 * to one of the channels present in the write mask.
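 * For example, a write mask of .xz yields the swizzle .xxzx, so the unwritten
 * Y and W channels read the first written channel.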
1296 */
1297 static struct ureg_src
1298 ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
1299 {
1300 assert(write_mask);
1301 int first_chan = ffs(write_mask) - 1;
1302 return ureg_swizzle(src,
1303 (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
1304 (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
1305 (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
1306 (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
1307 }
1308
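/* Picks the destination for an SSA def -- either a TGSI output we can store to
 * directly or a fresh temporary -- and records the matching read swizzle in
 * c->ssa_temp[] for later uses of the def.
 */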
1309 static struct ureg_dst
1310 ntt_get_ssa_def_decl(struct ntt_compile *c, nir_def *ssa)
1311 {
1312 uint32_t writemask = BITSET_MASK(ssa->num_components);
1313 if (ssa->bit_size == 64)
1314 writemask = ntt_64bit_write_mask(writemask);
1315
1316 struct ureg_dst dst;
1317 if (!ntt_try_store_ssa_in_tgsi_output(c, &dst, ssa))
1318 dst = ntt_temp(c);
1319
1320 c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);
1321
1322 return ureg_writemask(dst, writemask);
1323 }
1324
1325 static struct ureg_dst
1326 ntt_get_chased_dest_decl(struct ntt_compile *c, nir_legacy_dest *dest)
1327 {
1328 if (dest->is_ssa)
1329 return ntt_get_ssa_def_decl(c, dest->ssa);
1330 else
1331 return c->reg_temp[dest->reg.handle->index];
1332 }
1333
1334 static struct ureg_dst
1335 ntt_get_chased_dest(struct ntt_compile *c, nir_legacy_dest *dest)
1336 {
1337 struct ureg_dst dst = ntt_get_chased_dest_decl(c, dest);
1338
1339 if (!dest->is_ssa) {
1340 dst.Index += dest->reg.base_offset;
1341
1342 if (dest->reg.indirect) {
1343 struct ureg_src offset = ntt_get_src(c, nir_src_for_ssa(dest->reg.indirect));
1344 dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset, 0));
1345 }
1346 }
1347
1348 return dst;
1349 }
1350
1351 static struct ureg_dst
1352 ntt_get_dest(struct ntt_compile *c, nir_def *def)
1353 {
1354 nir_legacy_dest chased = nir_legacy_chase_dest(def);
1355 return ntt_get_chased_dest(c, &chased);
1356 }
1357
1358 static struct ureg_dst
1359 ntt_get_alu_dest(struct ntt_compile *c, nir_def *def)
1360 {
1361 nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def);
1362 struct ureg_dst dst = ntt_get_chased_dest(c, &chased.dest);
1363
1364 if (chased.fsat)
1365 dst.Saturate = true;
1366
1367 /* Only registers get write masks */
1368 if (chased.dest.is_ssa)
1369 return dst;
1370
1371 int dst_64 = def->bit_size == 64;
1372 unsigned write_mask = chased.write_mask;
1373
1374 if (dst_64)
1375 return ureg_writemask(dst, ntt_64bit_write_mask(write_mask));
1376 else
1377 return ureg_writemask(dst, write_mask);
1378 }
1379
1380 /* For an SSA dest being populated by a constant src, replace the storage with
1381 * a copy of the ureg_src.
1382 */
1383 static void
1384 ntt_store_def(struct ntt_compile *c, nir_def *def, struct ureg_src src)
1385 {
1386 if (!src.Indirect && !src.DimIndirect) {
1387 switch (src.File) {
1388 case TGSI_FILE_IMMEDIATE:
1389 case TGSI_FILE_INPUT:
1390 case TGSI_FILE_CONSTANT:
1391 case TGSI_FILE_SYSTEM_VALUE:
1392 c->ssa_temp[def->index] = src;
1393 return;
1394 }
1395 }
1396
1397 ntt_MOV(c, ntt_get_ssa_def_decl(c, def), src);
1398 }
1399
1400 static void
1401 ntt_store(struct ntt_compile *c, nir_def *def, struct ureg_src src)
1402 {
1403 nir_legacy_dest chased = nir_legacy_chase_dest(def);
1404
1405 if (chased.is_ssa)
1406 ntt_store_def(c, chased.ssa, src);
1407 else {
1408 struct ureg_dst dst = ntt_get_chased_dest(c, &chased);
1409 ntt_MOV(c, dst, src);
1410 }
1411 }
1412
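/* TGSI's scalar opcodes (RCP, RSQ, EX2, POW, ...) replicate a single source
 * channel to every destination channel, so emit one instruction per enabled
 * channel of the write mask.
 */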
1413 static void
1414 ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
1415 struct ureg_dst dst,
1416 struct ureg_src src0,
1417 struct ureg_src src1)
1418 {
1419 unsigned i;
1420
1421 /* POW is the only 2-operand scalar op. */
1422 if (tgsi_op != TGSI_OPCODE_POW)
1423 src1 = src0;
1424
1425 for (i = 0; i < 4; i++) {
1426 if (dst.WriteMask & (1 << i)) {
1427 ntt_insn(c, tgsi_op,
1428 ureg_writemask(dst, 1 << i),
1429 ureg_scalar(src0, i),
1430 ureg_scalar(src1, i),
1431 ureg_src_undef(), ureg_src_undef());
1432 }
1433 }
1434 }
1435
1436 static void
1437 ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
1438 {
1439 struct ureg_src src[4];
1440 struct ureg_dst dst;
1441 unsigned i;
1442 int dst_64 = instr->def.bit_size == 64;
1443 int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
1444 int num_srcs = nir_op_infos[instr->op].num_inputs;
1445
1446 /* Don't try to translate folded fsat since its source won't be valid */
1447 if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr))
1448 return;
1449
1450 c->precise = instr->exact;
1451
1452 assert(num_srcs <= ARRAY_SIZE(src));
1453 for (i = 0; i < num_srcs; i++)
1454 src[i] = ntt_get_alu_src(c, instr, i);
1455 for (; i < ARRAY_SIZE(src); i++)
1456 src[i] = ureg_src_undef();
1457
1458 dst = ntt_get_alu_dest(c, &instr->def);
1459
1460 static enum tgsi_opcode op_map[][2] = {
1461 [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
1462
1463 /* fabs/fneg 32-bit are special-cased below. */
1464 [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
1465 [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },
1466
1467 [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
1468 [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
1469 [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
1470 [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 },
1471 [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 },
1472 [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 },
1473 [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
1474 [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
1475 [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
1476 [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
1477 [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
1478 [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
1479 [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },
1480
1481 [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
1482 [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
1483 [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },
1484
1485 /* The conversions will have one combination of src and dst bitsize. */
1486 [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
1487 [nir_op_f2f64] = { TGSI_OPCODE_F2D },
1488 [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },
1489
1490 [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
1491 [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
1492 [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
1493 [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
1494 [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
1495 [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
1496 [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
1497 [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },
1498
1499 [nir_op_slt] = { TGSI_OPCODE_SLT },
1500 [nir_op_sge] = { TGSI_OPCODE_SGE },
1501 [nir_op_seq] = { TGSI_OPCODE_SEQ },
1502 [nir_op_sne] = { TGSI_OPCODE_SNE },
1503
1504 [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
1505 [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
1506 [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
1507 [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },
1508
1509 [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
1510 [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
1511 [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
1512 [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },
1513
1514 [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
1515 [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },
1516
1517 [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
1518 [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
1519 [nir_op_fsign] = { TGSI_OPCODE_SSG, TGSI_OPCODE_DSSG },
1520 [nir_op_isign] = { TGSI_OPCODE_ISSG, TGSI_OPCODE_I64SSG },
1521 [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
1522 [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
1523 [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
1524 [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
1525 [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
1526 [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
1527 [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
1528 [nir_op_bit_count] = { TGSI_OPCODE_POPC },
1529 [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
1530 [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
1531 [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
1532 [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
1533 [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
1534 [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
1535 [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
1536 [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
1537 [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
1538 [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
1539 [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
1540 [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
1541 [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
1542 [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
1543
1544 /* These bitwise ops don't care about 32 vs 64 types, so they have the
1545 * same TGSI op.
1546 */
1547 [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
1548 [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
1549 [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
1550 [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
1551
1552 [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
1553 [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
1554 [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
1555 [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
1556 [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
1557 [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
1558 [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
1559 [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
1560 };
1561
1562 if (src_64 && !dst_64) {
1563 if (num_srcs == 2 || nir_op_infos[instr->op].output_type == nir_type_bool32) {
1564 /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
1565 * of .xy.
1566 */
1567 assert(!(dst.WriteMask & TGSI_WRITEMASK_YW));
1568 } else {
1569 /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
1570 * channels and will need to get fixed up.
1571 */
1572 assert(!(dst.WriteMask & TGSI_WRITEMASK_ZW));
1573 }
1574 }
1575
1576 bool table_op64 = src_64;
1577 if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
1578 /* The normal path for NIR to TGSI ALU op translation */
1579 ntt_insn(c, op_map[instr->op][table_op64],
1580 dst, src[0], src[1], src[2], src[3]);
1581 } else {
1582 /* Special cases for NIR to TGSI ALU op translation. */
1583
1584 /* TODO: Use something like the ntt_store() path for the MOV calls so we
1585 * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
1586 */
1587
1588 switch (instr->op) {
1589 case nir_op_u2u64:
1590 ntt_AND(c, dst, ureg_swizzle(src[0],
1591 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1592 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1593 ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
1594 break;
1595
1596 case nir_op_i2i32:
1597 case nir_op_u2u32:
1598 assert(src_64);
1599 ntt_MOV(c, dst, ureg_swizzle(src[0],
1600 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1601 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
1602 break;
1603
1604 case nir_op_fabs:
1605 /* Try to eliminate */
1606 if (!c->options->lower_fabs && nir_legacy_float_mod_folds(instr))
1607 break;
1608
1609 if (c->options->lower_fabs)
1610 ntt_MAX(c, dst, src[0], ureg_negate(src[0]));
1611 else
1612 ntt_MOV(c, dst, ureg_abs(src[0]));
1613 break;
1614
1615 case nir_op_fsat:
1616 if (dst_64) {
1617 ntt_MIN(c, dst, src[0], ntt_64bit_1f(c));
1618 ntt_MAX(c, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
1619 } else {
1620 ntt_MOV(c, ureg_saturate(dst), src[0]);
1621 }
1622 break;
1623
1624 case nir_op_fneg:
1625 /* Try to eliminate */
1626 if (nir_legacy_float_mod_folds(instr))
1627 break;
1628
1629 ntt_MOV(c, dst, ureg_negate(src[0]));
1630 break;
1631
1632 /* NOTE: TGSI 32-bit math ops have the old "one source channel
1633 * replicated to all dst channels" behavior, while 64 is normal mapping
1634 * of src channels to dst.
1635 */
1636 case nir_op_frcp:
1637 assert(!dst_64);
1638 ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
1639 break;
1640
1641 case nir_op_frsq:
1642 assert(!dst_64);
1643 ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
1644 break;
1645
1646 case nir_op_fsqrt:
1647 assert(!dst_64);
1648 ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], ureg_src_undef());
1649 break;
1650
1651 case nir_op_fexp2:
1652 assert(!dst_64);
1653 ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
1654 break;
1655
1656 case nir_op_flog2:
1657 assert(!dst_64);
1658 ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
1659 break;
1660
1661 case nir_op_b2f32:
1662 ntt_AND(c, dst, src[0], ureg_imm1f(c->ureg, 1.0));
1663 break;
1664
1665 case nir_op_b2f64:
1666 ntt_AND(c, dst,
1667 ureg_swizzle(src[0],
1668 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1669 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1670 ntt_64bit_1f(c));
1671 break;
1672
1673 case nir_op_b2i32:
1674 ntt_AND(c, dst, src[0], ureg_imm1u(c->ureg, 1));
1675 break;
1676
1677 case nir_op_b2i64:
1678 ntt_AND(c, dst,
1679 ureg_swizzle(src[0],
1680 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1681 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1682 ureg_imm4u(c->ureg, 1, 0, 1, 0));
1683 break;
1684
1685 case nir_op_fsin:
1686 ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
1687 break;
1688
1689 case nir_op_fcos:
1690 ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
1691 break;
1692
1693 case nir_op_fsub:
1694 assert(!dst_64);
1695 ntt_ADD(c, dst, src[0], ureg_negate(src[1]));
1696 break;
1697
1698 case nir_op_isub:
1699 assert(!dst_64);
1700 ntt_UADD(c, dst, src[0], ureg_negate(src[1]));
1701 break;
1702
1703 case nir_op_fmod:
1704 unreachable("should be handled by .lower_fmod = true");
1705 break;
1706
1707 case nir_op_fpow:
1708 ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
1709 break;
1710
1711 case nir_op_flrp:
1712 ntt_LRP(c, dst, src[2], src[1], src[0]);
1713 break;
1714
1715 case nir_op_pack_64_2x32_split:
1716 ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
1717 ureg_swizzle(src[0],
1718 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1719 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1720 ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_YW),
1721 ureg_swizzle(src[1],
1722 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1723 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1724 break;
1725
1726 case nir_op_unpack_64_2x32_split_x:
1727 ntt_MOV(c, dst, ureg_swizzle(src[0],
1728 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1729 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
1730 break;
1731
1732 case nir_op_unpack_64_2x32_split_y:
1733 ntt_MOV(c, dst, ureg_swizzle(src[0],
1734 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
1735 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
1736 break;
1737
1738 case nir_op_b32csel:
1739 if (nir_src_bit_size(instr->src[1].src) == 64) {
1740 ntt_UCMP(c, dst, ureg_swizzle(src[0],
1741 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1742 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1743 src[1], src[2]);
1744 } else {
1745 ntt_UCMP(c, dst, src[0], src[1], src[2]);
1746 }
1747 break;
1748
1749 case nir_op_fcsel:
1750 /* If CMP isn't supported, then the flags that enable NIR to generate
1751 * this opcode should also not be set.
1752 */
1753 assert(!c->options->lower_cmp);
1754
1755 /* Implement this as CMP(-abs(src0), src1, src2). */
1756 ntt_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
1757 break;
1758
1759 case nir_op_fcsel_gt:
1760 /* If CMP isn't supported, then the flags that enable NIR to generate
1761 * these opcodes should also not be set.
1762 */
1763 assert(!c->options->lower_cmp);
1764
1765 ntt_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
1766 break;
1767
1768 case nir_op_fcsel_ge:
1769 /* If CMP isn't supported, then the flags that enable NIR to generate
1770 * these opcodes should also not be set.
1771 */
1772 assert(!c->options->lower_cmp);
1773
1774 /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. */
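      /* (The two conditions only differ when src0 is NaN.) */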
1775 ntt_CMP(c, dst, src[0], src[2], src[1]);
1776 break;
1777
1778 case nir_op_frexp_sig:
1779 case nir_op_frexp_exp:
1780 unreachable("covered by nir_lower_frexp()");
1781 break;
1782
1783 case nir_op_ldexp:
1784 assert(dst_64); /* 32bit handled in table. */
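      /* Replicate each 32-bit exponent across its 64-bit value's channel
       * pair (the .xxyy swizzle) so the exponents line up with the double
       * operands in src0.
       */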
1785 ntt_DLDEXP(c, dst, src[0],
1786 ureg_swizzle(src[1],
1787 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1788 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1789 break;
1790
1791 case nir_op_vec4:
1792 case nir_op_vec3:
1793 case nir_op_vec2:
1794 unreachable("covered by nir_lower_vec_to_movs()");
1795
1796 default:
1797 fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
1798 unreachable("Unknown NIR opcode");
1799 }
1800 }
1801
1802 c->precise = false;
1803 }
1804
1805 static struct ureg_src
ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
                      nir_src src, int addr_reg)
1808 {
1809 if (nir_src_is_const(src)) {
1810 usrc.Index += ntt_src_as_uint(c, src);
1811 return usrc;
1812 } else {
1813 return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src), addr_reg));
1814 }
1815 }
1816
1817 static struct ureg_dst
ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
                      nir_src src)
1820 {
1821 if (nir_src_is_const(src)) {
1822 dst.Index += ntt_src_as_uint(c, src);
1823 return dst;
1824 } else {
1825 return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src), 0));
1826 }
1827 }
1828
1829 static struct ureg_src
ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
                                nir_src src)
1832 {
1833 if (nir_src_is_const(src)) {
1834 return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
   } else {
1838 return ureg_src_dimension_indirect(usrc,
1839 ntt_reladdr(c, ntt_get_src(c, src), 1),
1840 0);
1841 }
1842 }
1843
1844 static struct ureg_dst
ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
                                nir_src src)
1847 {
1848 if (nir_src_is_const(src)) {
1849 return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
1850 } else {
1851 return ureg_dst_dimension_indirect(udst,
1852 ntt_reladdr(c, ntt_get_src(c, src), 1),
1853 0);
1854 }
1855 }

/* Some load operations in NIR will have a fractional offset that we need to
1857 * swizzle down before storing to the result register.
1858 */
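/* For example, a 2-component load with component (frac) == 2 produces a
 * .zwww swizzle here, so the loaded channels land in .xy of the result.
 */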
1859 static struct ureg_src
ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1861 {
1862 return ureg_swizzle(src,
1863 frac,
1864 frac + MIN2(num_components - 1, 1),
1865 frac + MIN2(num_components - 1, 2),
1866 frac + MIN2(num_components - 1, 3));
1867 }
1868
1869
1870 static void
ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
1872 {
1873 int bit_size = instr->def.bit_size;
1874 assert(bit_size == 32 || instr->num_components <= 2);
1875
1876 struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1877
1878 struct ureg_dst addr_temp = ureg_dst_undef();
1879
1880 if (nir_src_is_const(instr->src[0])) {
1881 src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
1882 } else {
1883 /* virglrenderer requires that indirect UBO references have the UBO
       * array's base index in the Index field, not added to the indirect
1885 * address.
1886 *
1887 * Many nir intrinsics have a base address const value for the start of
1888 * their array indirection, but load_ubo doesn't. We fake it by
1889 * subtracting it off here.
1890 */
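      /* For example, with first_ubo == 1 and a NIR UBO index of 3, we emit
       * UADD(temp, index, -1) and declare the dimension indirect with a base
       * of 1, so the final index works out to 1 + (3 - 1) == 3 again.
       */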
1891 addr_temp = ntt_temp(c);
1892 ntt_UADD(c, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1893 src = ureg_src_dimension_indirect(src,
1894 ntt_reladdr(c, ureg_src(addr_temp), 1),
1895 c->first_ubo);
1896 }
1897
1898 if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
1899 /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
1900 * file.
1901 */
1902 src.Index = nir_intrinsic_base(instr);
1903
1904 if (nir_src_is_const(instr->src[1])) {
1905 src.Index += ntt_src_as_uint(c, instr->src[1]);
1906 } else {
1907 src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1]), 0));
1908 }
1909
1910 int start_component = nir_intrinsic_component(instr);
1911 if (bit_size == 64)
1912 start_component *= 2;
1913
1914 src = ntt_shift_by_frac(src, start_component,
1915 instr->num_components * bit_size / 32);
1916
1917 ntt_store(c, &instr->def, src);
1918 } else {
1919 /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
1920 * TGSI_OPCODE_LOAD instruction from the const file.
1921 */
1922 struct ntt_insn *insn =
1923 ntt_insn(c, TGSI_OPCODE_LOAD,
1924 ntt_get_dest(c, &instr->def),
1925 src, ntt_get_src(c, instr->src[1]),
1926 ureg_src_undef(), ureg_src_undef());
1927 insn->is_mem = true;
1928 insn->tex_target = 0;
1929 insn->mem_qualifier = 0;
1930 insn->mem_format = 0; /* unused */
1931 }
1932 }
1933
1934 static unsigned
ntt_get_access_qualifier(nir_intrinsic_instr *instr)
1936 {
1937 enum gl_access_qualifier access = nir_intrinsic_access(instr);
1938 unsigned qualifier = 0;
1939
1940 if (access & ACCESS_COHERENT)
1941 qualifier |= TGSI_MEMORY_COHERENT;
1942 if (access & ACCESS_VOLATILE)
1943 qualifier |= TGSI_MEMORY_VOLATILE;
1944 if (access & ACCESS_RESTRICT)
1945 qualifier |= TGSI_MEMORY_RESTRICT;
1946
1947 return qualifier;
1948 }
1949
1950 static unsigned
ntt_translate_atomic_op(nir_atomic_op op)
1952 {
1953 switch (op) {
1954 case nir_atomic_op_iadd: return TGSI_OPCODE_ATOMUADD;
1955 case nir_atomic_op_fadd: return TGSI_OPCODE_ATOMFADD;
1956 case nir_atomic_op_imin: return TGSI_OPCODE_ATOMIMIN;
1957 case nir_atomic_op_imax: return TGSI_OPCODE_ATOMIMAX;
1958 case nir_atomic_op_umin: return TGSI_OPCODE_ATOMUMIN;
1959 case nir_atomic_op_umax: return TGSI_OPCODE_ATOMUMAX;
1960 case nir_atomic_op_iand: return TGSI_OPCODE_ATOMAND;
1961 case nir_atomic_op_ixor: return TGSI_OPCODE_ATOMXOR;
1962 case nir_atomic_op_ior: return TGSI_OPCODE_ATOMOR;
1963 case nir_atomic_op_xchg: return TGSI_OPCODE_ATOMXCHG;
1964 default: unreachable("invalid atomic");
1965 }
1966 }
1967
1968 static void
ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
             nir_variable_mode mode)
1971 {
1972 bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1973 instr->intrinsic == nir_intrinsic_store_shared);
1974 bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
1975 instr->intrinsic == nir_intrinsic_load_ssbo ||
1976 instr->intrinsic == nir_intrinsic_load_shared);
1977 unsigned opcode;
1978 struct ureg_src src[4];
1979 int num_src = 0;
1980 int next_src;
1981 struct ureg_dst addr_temp = ureg_dst_undef();
1982
1983 struct ureg_src memory;
1984 switch (mode) {
1985 case nir_var_mem_ssbo:
1986 memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER,
1987 c->first_ssbo),
1988 instr->src[is_store ? 1 : 0], 2);
1989 next_src = 1;
1990 break;
1991 case nir_var_mem_shared:
1992 memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
1993 next_src = 0;
1994 break;
1995 case nir_var_uniform: { /* HW atomic buffers */
1996 nir_src src = instr->src[0];
1997 uint32_t offset = (ntt_extract_const_src_offset(&src) +
1998 nir_intrinsic_range_base(instr)) / 4;
1999
2000 memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, offset);
2001 /* ntt_ureg_src_indirect, except dividing by 4 */
2002 if (nir_src_is_const(src)) {
2003 memory.Index += nir_src_as_uint(src) / 4;
2004 } else {
2005 addr_temp = ntt_temp(c);
2006 ntt_USHR(c, addr_temp, ntt_get_src(c, src), ureg_imm1i(c->ureg, 2));
2007 memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp), 2));
2008 }
2009 memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
2010 next_src = 0;
2011 break;
2012 }
2013
2014 default:
2015 unreachable("unknown memory type");
2016 }
2017
2018 if (is_store) {
2019 src[num_src++] = ntt_get_src(c, instr->src[next_src + 1]); /* offset */
2020 src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
2021 } else {
2022 src[num_src++] = memory;
2023 if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2024 src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* offset */
2025 switch (instr->intrinsic) {
2026 case nir_intrinsic_atomic_counter_inc:
2027 src[num_src++] = ureg_imm1i(c->ureg, 1);
2028 break;
2029 case nir_intrinsic_atomic_counter_post_dec:
2030 src[num_src++] = ureg_imm1i(c->ureg, -1);
2031 break;
2032 default:
2033 if (!is_load)
2034 src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* value */
2035 break;
2036 }
2037 }
2038 }
2039
2040
2041 switch (instr->intrinsic) {
2042 case nir_intrinsic_ssbo_atomic:
2043 case nir_intrinsic_shared_atomic:
2044 opcode = ntt_translate_atomic_op(nir_intrinsic_atomic_op(instr));
2045 break;
2046 case nir_intrinsic_atomic_counter_add:
2047 case nir_intrinsic_atomic_counter_inc:
2048 case nir_intrinsic_atomic_counter_post_dec:
2049 opcode = TGSI_OPCODE_ATOMUADD;
2050 break;
2051 case nir_intrinsic_atomic_counter_min:
2052 opcode = TGSI_OPCODE_ATOMIMIN;
2053 break;
2054 case nir_intrinsic_atomic_counter_max:
2055 opcode = TGSI_OPCODE_ATOMIMAX;
2056 break;
2057 case nir_intrinsic_atomic_counter_and:
2058 opcode = TGSI_OPCODE_ATOMAND;
2059 break;
2060 case nir_intrinsic_atomic_counter_or:
2061 opcode = TGSI_OPCODE_ATOMOR;
2062 break;
2063 case nir_intrinsic_atomic_counter_xor:
2064 opcode = TGSI_OPCODE_ATOMXOR;
2065 break;
2066 case nir_intrinsic_atomic_counter_exchange:
2067 opcode = TGSI_OPCODE_ATOMXCHG;
2068 break;
2069 case nir_intrinsic_atomic_counter_comp_swap:
2070 case nir_intrinsic_ssbo_atomic_swap:
2071 case nir_intrinsic_shared_atomic_swap:
2072 opcode = TGSI_OPCODE_ATOMCAS;
2073 src[num_src++] = ntt_get_src(c, instr->src[next_src++]);
2074 break;
2075 case nir_intrinsic_atomic_counter_read:
2076 case nir_intrinsic_load_ssbo:
2077 case nir_intrinsic_load_shared:
2078 opcode = TGSI_OPCODE_LOAD;
2079 break;
2080 case nir_intrinsic_store_ssbo:
2081 case nir_intrinsic_store_shared:
2082 opcode = TGSI_OPCODE_STORE;
2083 break;
2084 case nir_intrinsic_get_ssbo_size:
2085 opcode = TGSI_OPCODE_RESQ;
2086 break;
2087 default:
2088 unreachable("unknown memory op");
2089 }
2090
2091 unsigned qualifier = 0;
2092 if (mode == nir_var_mem_ssbo &&
2093 instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2094 qualifier = ntt_get_access_qualifier(instr);
2095 }
2096
2097 struct ureg_dst dst;
2098 if (is_store) {
2099 dst = ureg_dst(memory);
2100
2101 unsigned write_mask = nir_intrinsic_write_mask(instr);
2102 if (nir_src_bit_size(instr->src[0]) == 64)
2103 write_mask = ntt_64bit_write_mask(write_mask);
2104 dst = ureg_writemask(dst, write_mask);
2105 } else {
2106 dst = ntt_get_dest(c, &instr->def);
2107 }
2108
2109 struct ntt_insn *insn = ntt_insn(c, opcode, dst, src[0], src[1], src[2], src[3]);
2110 insn->tex_target = TGSI_TEXTURE_BUFFER;
2111 insn->mem_qualifier = qualifier;
2112 insn->mem_format = 0; /* unused */
2113 insn->is_mem = true;
2114 }
2115
2116 static void
ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
2118 {
2119 unsigned op;
2120 struct ureg_src srcs[4];
2121 int num_src = 0;
2122 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
2123 bool is_array = nir_intrinsic_image_array(instr);
2124
2125 struct ureg_dst temp = ureg_dst_undef();
2126
2127 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);
2128
2129 struct ureg_src resource;
2130 switch (instr->intrinsic) {
2131 case nir_intrinsic_bindless_image_load:
2132 case nir_intrinsic_bindless_image_store:
2133 case nir_intrinsic_bindless_image_size:
2134 case nir_intrinsic_bindless_image_samples:
2135 case nir_intrinsic_bindless_image_atomic:
2136 case nir_intrinsic_bindless_image_atomic_swap:
2137 resource = ntt_get_src(c, instr->src[0]);
2138 break;
2139 default:
2140 resource = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
2141 instr->src[0], 2);
2142 resource.Index += nir_intrinsic_range_base(instr);
2143 }
2144
2145 struct ureg_dst dst;
2146 if (instr->intrinsic == nir_intrinsic_image_store ||
2147 instr->intrinsic == nir_intrinsic_bindless_image_store) {
2148 dst = ureg_dst(resource);
2149 } else {
2150 srcs[num_src++] = resource;
2151 dst = ntt_get_dest(c, &instr->def);
2152 }
2153 struct ureg_dst opcode_dst = dst;
2154
2155 if (instr->intrinsic != nir_intrinsic_image_size &&
2156 instr->intrinsic != nir_intrinsic_image_samples &&
2157 instr->intrinsic != nir_intrinsic_bindless_image_size &&
2158 instr->intrinsic != nir_intrinsic_bindless_image_samples) {
2159 struct ureg_src coord = ntt_get_src(c, instr->src[1]);
2160
2161 if (dim == GLSL_SAMPLER_DIM_MS) {
2162 temp = ntt_temp(c);
2163 ntt_MOV(c, temp, coord);
2164 ntt_MOV(c, ureg_writemask(temp, TGSI_WRITEMASK_W),
2165 ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
2166 coord = ureg_src(temp);
2167 }
2168 srcs[num_src++] = coord;
2169
2170 if (instr->intrinsic != nir_intrinsic_image_load &&
2171 instr->intrinsic != nir_intrinsic_bindless_image_load) {
2172 srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
2173 if (instr->intrinsic == nir_intrinsic_image_atomic_swap ||
2174 instr->intrinsic == nir_intrinsic_bindless_image_atomic_swap)
2175 srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
2176 }
2177 }
2178
2179 switch (instr->intrinsic) {
2180 case nir_intrinsic_image_load:
2181 case nir_intrinsic_bindless_image_load:
2182 op = TGSI_OPCODE_LOAD;
2183 break;
2184 case nir_intrinsic_image_store:
2185 case nir_intrinsic_bindless_image_store:
2186 op = TGSI_OPCODE_STORE;
2187 break;
2188 case nir_intrinsic_image_size:
2189 case nir_intrinsic_bindless_image_size:
2190 op = TGSI_OPCODE_RESQ;
2191 break;
2192 case nir_intrinsic_image_samples:
2193 case nir_intrinsic_bindless_image_samples:
2194 op = TGSI_OPCODE_RESQ;
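      /* The sample count comes back in .w of the RESQ result, so query into
       * a temp here and broadcast that channel to the real dest below.
       */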
2195 opcode_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2196 break;
2197 case nir_intrinsic_image_atomic:
2198 case nir_intrinsic_bindless_image_atomic:
2199 op = ntt_translate_atomic_op(nir_intrinsic_atomic_op(instr));
2200 break;
2201 case nir_intrinsic_image_atomic_swap:
2202 case nir_intrinsic_bindless_image_atomic_swap:
2203 op = TGSI_OPCODE_ATOMCAS;
2204 break;
2205 default:
2206 unreachable("bad op");
2207 }
2208
2209 struct ntt_insn *insn = ntt_insn(c, op, opcode_dst, srcs[0], srcs[1], srcs[2], srcs[3]);
2210 insn->tex_target = target;
2211 insn->mem_qualifier = ntt_get_access_qualifier(instr);
2212 insn->mem_format = nir_intrinsic_format(instr);
2213 insn->is_mem = true;
2214
2215 if (instr->intrinsic == nir_intrinsic_image_samples ||
2216 instr->intrinsic == nir_intrinsic_bindless_image_samples)
2217 ntt_MOV(c, dst, ureg_scalar(ureg_src(opcode_dst), 3));
2218 }
2219
2220 static void
ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
2222 {
2223 uint32_t frac = nir_intrinsic_component(instr);
2224 uint32_t num_components = instr->num_components;
2225 unsigned base = nir_intrinsic_base(instr);
2226 struct ureg_src input;
2227 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2228 bool is_64 = instr->def.bit_size == 64;
2229
2230 if (c->s->info.stage == MESA_SHADER_VERTEX) {
2231 input = ureg_DECL_vs_input(c->ureg, base);
2232 for (int i = 1; i < semantics.num_slots; i++)
2233 ureg_DECL_vs_input(c->ureg, base + i);
2234 } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2235 unsigned semantic_name, semantic_index;
2236 ntt_get_gl_varying_semantic(c, semantics.location,
2237 &semantic_name, &semantic_index);
2238
2239 /* XXX: ArrayID is used in r600 gs inputs */
2240 uint32_t array_id = 0;
2241
2242 input = ureg_DECL_input_layout(c->ureg,
2243 semantic_name,
2244 semantic_index,
2245 base,
2246 ntt_tgsi_usage_mask(frac,
2247 instr->num_components,
2248 is_64),
2249 array_id,
2250 semantics.num_slots);
2251 } else {
2252 input = c->input_index_map[base];
2253 }
2254
2255 if (is_64)
2256 num_components *= 2;
2257
2258 input = ntt_shift_by_frac(input, frac, num_components);
2259
2260 switch (instr->intrinsic) {
2261 case nir_intrinsic_load_input:
2262 input = ntt_ureg_src_indirect(c, input, instr->src[0], 0);
2263 ntt_store(c, &instr->def, input);
2264 break;
2265
2266 case nir_intrinsic_load_per_vertex_input:
2267 input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2268 input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
2269 ntt_store(c, &instr->def, input);
2270 break;
2271
2272 case nir_intrinsic_load_interpolated_input: {
2273 input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2274
2275 nir_intrinsic_instr *bary_instr =
2276 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
2277
2278 switch (bary_instr->intrinsic) {
2279 case nir_intrinsic_load_barycentric_pixel:
2280 case nir_intrinsic_load_barycentric_sample:
2281 /* For these, we know that the barycentric load matches the
2282 * interpolation on the input declaration, so we can use it directly.
2283 */
2284 ntt_store(c, &instr->def, input);
2285 break;
2286
2287 case nir_intrinsic_load_barycentric_centroid:
2288 /* If the input was declared centroid, then there's no need to
2289 * emit the extra TGSI interp instruction, we can just read the
2290 * input.
2291 */
2292 if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
2293 ntt_store(c, &instr->def, input);
2294 } else {
2295 ntt_INTERP_CENTROID(c, ntt_get_dest(c, &instr->def), input);
2296 }
2297 break;
2298
2299 case nir_intrinsic_load_barycentric_at_sample:
2300 /* We stored the sample in the fake "bary" dest. */
2301 ntt_INTERP_SAMPLE(c, ntt_get_dest(c, &instr->def), input,
2302 ntt_get_src(c, instr->src[0]));
2303 break;
2304
2305 case nir_intrinsic_load_barycentric_at_offset:
2306 /* We stored the offset in the fake "bary" dest. */
2307 ntt_INTERP_OFFSET(c, ntt_get_dest(c, &instr->def), input,
2308 ntt_get_src(c, instr->src[0]));
2309 break;
2310
2311 default:
2312 unreachable("bad barycentric interp intrinsic\n");
2313 }
2314 break;
2315 }
2316
2317 default:
2318 unreachable("bad load input intrinsic\n");
2319 }
2320 }
2321
2322 static void
ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2324 {
2325 struct ureg_src src = ntt_get_src(c, instr->src[0]);
2326
2327 if (src.File == TGSI_FILE_OUTPUT) {
2328 /* If our src is the output file, that's an indication that we were able
2329 * to emit the output stores in the generating instructions and we have
2330 * nothing to do here.
2331 */
2332 return;
2333 }
2334
2335 uint32_t frac;
2336 struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2337
2338 if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
2339 out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
2340 out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
2341 } else {
2342 out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2343 }
2344
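   /* Build a swizzle that lines the packed source components up with the
    * output's (possibly fractional) starting component: e.g. with frac == 1
    * and a .yz writemask, out.y reads src.x and out.z reads src.y.
    */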
2345 uint8_t swizzle[4] = { 0, 0, 0, 0 };
2346 for (int i = frac; i < 4; i++) {
2347 if (out.WriteMask & (1 << i))
2348 swizzle[i] = i - frac;
2349 }
2350
2351 src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2352
2353 ntt_MOV(c, out, src);
2354 }
2355
2356 static void
ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2358 {
2359 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2360
2361 /* ntt_try_store_in_tgsi_output() optimization is not valid if normal
2362 * load_output is present.
2363 */
2364 assert(c->s->info.stage != MESA_SHADER_VERTEX &&
2365 (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
2366
2367 uint32_t frac;
2368 struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2369
2370 if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
2371 out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2372 out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
2373 } else {
2374 out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
2375 }
2376
2377 struct ureg_dst dst = ntt_get_dest(c, &instr->def);
2378 struct ureg_src out_src = ureg_src(out);
2379
   /* Don't swizzle unavailable channels of the output into the writemasked-out
2381 * components. Avoids compile failures in virglrenderer with
2382 * TESS_LEVEL_INNER.
2383 */
2384 int fill_channel = ffs(dst.WriteMask) - 1;
2385 uint8_t swizzles[4] = { 0, 1, 2, 3 };
2386 for (int i = 0; i < 4; i++)
2387 if (!(dst.WriteMask & (1 << i)))
2388 swizzles[i] = fill_channel;
2389 out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
2390
2391 if (semantics.fb_fetch_output)
2392 ntt_FBFETCH(c, dst, out_src);
2393 else
2394 ntt_MOV(c, dst, out_src);
2395 }
2396
2397 static void
ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
2399 {
2400 gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
2401 enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
2402 struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
2403
2404 /* virglrenderer doesn't like references to channels of the sysval that
2405 * aren't defined, even if they aren't really read. (GLSL compile fails on
2406 * gl_NumWorkGroups.w, for example).
2407 */
2408 uint32_t write_mask = BITSET_MASK(instr->def.num_components);
2409 sv = ntt_swizzle_for_write_mask(sv, write_mask);
2410
2411 /* TGSI and NIR define these intrinsics as always loading ints, but they can
2412 * still appear on hardware with non-native-integers fragment shaders using
2413 * the draw path (i915g). In that case, having called nir_lower_int_to_float
2414 * means that we actually want floats instead.
2415 */
2416 if (!c->native_integers) {
2417 switch (instr->intrinsic) {
2418 case nir_intrinsic_load_vertex_id:
2419 case nir_intrinsic_load_instance_id:
2420 ntt_U2F(c, ntt_get_dest(c, &instr->def), sv);
2421 return;
2422
2423 default:
2424 break;
2425 }
2426 }
2427
2428 ntt_store(c, &instr->def, sv);
2429 }
2430
2431 static void
ntt_emit_barrier(struct ntt_compile *c, nir_intrinsic_instr *intr)
2433 {
2434 bool compute = gl_shader_stage_is_compute(c->s->info.stage);
2435
2436 if (nir_intrinsic_memory_scope(intr) != SCOPE_NONE) {
2437 nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
2438 unsigned membar = 0;
2439
2440 if (modes & nir_var_image)
2441 membar |= TGSI_MEMBAR_SHADER_IMAGE;
2442
2443 if (modes & nir_var_mem_shared)
2444 membar |= TGSI_MEMBAR_SHARED;
2445
2446 /* Atomic counters are lowered to SSBOs, there's no NIR mode corresponding
2447 * exactly to atomics. Take the closest match.
2448 */
2449 if (modes & nir_var_mem_ssbo)
2450 membar |= TGSI_MEMBAR_SHADER_BUFFER | TGSI_MEMBAR_ATOMIC_BUFFER;
2451
2452 if (modes & nir_var_mem_global)
2453 membar |= TGSI_MEMBAR_SHADER_BUFFER;
2454
2455 /* Hack for virglrenderer: the GLSL specific memory barrier functions,
2456 * memoryBarrier{Buffer,Image,Shared,AtomicCounter}(), are only
2457 * available in compute shaders prior to GLSL 4.30. In other stages,
2458 * it needs to use the full memoryBarrier(). It may be possible to
2459 * make them available via #extension directives in older versions,
2460 * but it's confusingly underspecified, and Mesa/virglrenderer don't
2461 * currently agree on how to do it. So, just promote partial memory
2462 * barriers back to full ones outside of compute shaders when asked.
2463 */
2464 if (membar && !compute &&
2465 c->options->non_compute_membar_needs_all_modes) {
2466 membar |= TGSI_MEMBAR_SHADER_BUFFER |
2467 TGSI_MEMBAR_ATOMIC_BUFFER |
2468 TGSI_MEMBAR_SHADER_IMAGE |
2469 TGSI_MEMBAR_SHARED;
2470 }
2471
2472 /* If we only need workgroup scope (not device-scope), we might be able to
2473 * optimize a bit.
2474 */
2475 if (membar && compute &&
2476 nir_intrinsic_memory_scope(intr) == SCOPE_WORKGROUP) {
2477
2478 membar |= TGSI_MEMBAR_THREAD_GROUP;
2479 }
2480
2481 /* Only emit a memory barrier if there are any relevant modes */
2482 if (membar)
2483 ntt_MEMBAR(c, ureg_imm1u(c->ureg, membar));
2484 }
2485
2486 if (nir_intrinsic_execution_scope(intr) != SCOPE_NONE) {
2487 assert(compute || c->s->info.stage == MESA_SHADER_TESS_CTRL);
2488 ntt_BARRIER(c);
2489 }
2490 }
2491
2492 static void
ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
2494 {
2495 switch (instr->intrinsic) {
2496 case nir_intrinsic_load_ubo:
2497 case nir_intrinsic_load_ubo_vec4:
2498 ntt_emit_load_ubo(c, instr);
2499 break;
2500
2501 /* Vertex */
2502 case nir_intrinsic_load_vertex_id:
2503 case nir_intrinsic_load_vertex_id_zero_base:
2504 case nir_intrinsic_load_base_vertex:
2505 case nir_intrinsic_load_base_instance:
2506 case nir_intrinsic_load_instance_id:
2507 case nir_intrinsic_load_draw_id:
2508 case nir_intrinsic_load_invocation_id:
2509 case nir_intrinsic_load_frag_coord:
2510 case nir_intrinsic_load_point_coord:
2511 case nir_intrinsic_load_front_face:
2512 case nir_intrinsic_load_sample_id:
2513 case nir_intrinsic_load_sample_pos:
2514 case nir_intrinsic_load_sample_mask_in:
2515 case nir_intrinsic_load_helper_invocation:
2516 case nir_intrinsic_load_tess_coord:
2517 case nir_intrinsic_load_patch_vertices_in:
2518 case nir_intrinsic_load_primitive_id:
2519 case nir_intrinsic_load_tess_level_outer:
2520 case nir_intrinsic_load_tess_level_inner:
2521 case nir_intrinsic_load_local_invocation_id:
2522 case nir_intrinsic_load_workgroup_id:
2523 case nir_intrinsic_load_num_workgroups:
2524 case nir_intrinsic_load_workgroup_size:
2525 case nir_intrinsic_load_subgroup_size:
2526 case nir_intrinsic_load_subgroup_invocation:
2527 case nir_intrinsic_load_subgroup_eq_mask:
2528 case nir_intrinsic_load_subgroup_ge_mask:
2529 case nir_intrinsic_load_subgroup_gt_mask:
2530 case nir_intrinsic_load_subgroup_lt_mask:
2531 case nir_intrinsic_load_subgroup_le_mask:
2532 ntt_emit_load_sysval(c, instr);
2533 break;
2534
2535 case nir_intrinsic_load_input:
2536 case nir_intrinsic_load_per_vertex_input:
2537 case nir_intrinsic_load_interpolated_input:
2538 ntt_emit_load_input(c, instr);
2539 break;
2540
2541 case nir_intrinsic_store_output:
2542 case nir_intrinsic_store_per_vertex_output:
2543 ntt_emit_store_output(c, instr);
2544 break;
2545
2546 case nir_intrinsic_load_output:
2547 case nir_intrinsic_load_per_vertex_output:
2548 ntt_emit_load_output(c, instr);
2549 break;
2550
2551 case nir_intrinsic_demote:
2552 ntt_DEMOTE(c);
2553 break;
2554
2555 case nir_intrinsic_terminate:
2556 ntt_KILL(c);
2557 break;
2558
2559 case nir_intrinsic_terminate_if: {
2560 struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
2561
2562 if (c->native_integers) {
2563 struct ureg_dst temp = ureg_writemask(ntt_temp(c), 1);
2564 ntt_AND(c, temp, cond, ureg_imm1f(c->ureg, 1.0));
2565 ntt_KILL_IF(c, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
2566 } else {
2567 /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
2568 ntt_KILL_IF(c, ureg_negate(cond));
2569 }
2570 break;
2571 }
2572
2573 case nir_intrinsic_is_helper_invocation:
2574 ntt_READ_HELPER(c, ntt_get_dest(c, &instr->def));
2575 break;
2576
2577 case nir_intrinsic_vote_all:
      ntt_VOTE_ALL(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2579 return;
2580 case nir_intrinsic_vote_any:
2581 ntt_VOTE_ANY(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2582 return;
2583 case nir_intrinsic_vote_ieq:
2584 ntt_VOTE_EQ(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2585 return;
2586 case nir_intrinsic_ballot:
2587 ntt_BALLOT(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2588 return;
2589 case nir_intrinsic_read_first_invocation:
2590 ntt_READ_FIRST(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2591 return;
2592 case nir_intrinsic_read_invocation:
2593 ntt_READ_INVOC(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]), ntt_get_src(c, instr->src[1]));
2594 return;
2595
2596 case nir_intrinsic_ddx:
2597 case nir_intrinsic_ddx_coarse:
2598 ntt_DDX(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2599 return;
2600 case nir_intrinsic_ddx_fine:
2601 ntt_DDX_FINE(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2602 return;
2603 case nir_intrinsic_ddy:
2604 case nir_intrinsic_ddy_coarse:
2605 ntt_DDY(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2606 return;
2607 case nir_intrinsic_ddy_fine:
2608 ntt_DDY_FINE(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2609 return;
2610
2611 case nir_intrinsic_load_ssbo:
2612 case nir_intrinsic_store_ssbo:
2613 case nir_intrinsic_ssbo_atomic:
2614 case nir_intrinsic_ssbo_atomic_swap:
2615 case nir_intrinsic_get_ssbo_size:
2616 ntt_emit_mem(c, instr, nir_var_mem_ssbo);
2617 break;
2618
2619 case nir_intrinsic_load_shared:
2620 case nir_intrinsic_store_shared:
2621 case nir_intrinsic_shared_atomic:
2622 case nir_intrinsic_shared_atomic_swap:
2623 ntt_emit_mem(c, instr, nir_var_mem_shared);
2624 break;
2625
2626 case nir_intrinsic_atomic_counter_read:
2627 case nir_intrinsic_atomic_counter_add:
2628 case nir_intrinsic_atomic_counter_inc:
2629 case nir_intrinsic_atomic_counter_post_dec:
2630 case nir_intrinsic_atomic_counter_min:
2631 case nir_intrinsic_atomic_counter_max:
2632 case nir_intrinsic_atomic_counter_and:
2633 case nir_intrinsic_atomic_counter_or:
2634 case nir_intrinsic_atomic_counter_xor:
2635 case nir_intrinsic_atomic_counter_exchange:
2636 case nir_intrinsic_atomic_counter_comp_swap:
2637 ntt_emit_mem(c, instr, nir_var_uniform);
2638 break;
2639 case nir_intrinsic_atomic_counter_pre_dec:
2640 unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
2641 break;
2642
2643 case nir_intrinsic_image_load:
2644 case nir_intrinsic_image_store:
2645 case nir_intrinsic_image_size:
2646 case nir_intrinsic_image_samples:
2647 case nir_intrinsic_image_atomic:
2648 case nir_intrinsic_image_atomic_swap:
2649 case nir_intrinsic_bindless_image_load:
2650 case nir_intrinsic_bindless_image_store:
2651 case nir_intrinsic_bindless_image_size:
2652 case nir_intrinsic_bindless_image_samples:
2653 case nir_intrinsic_bindless_image_atomic:
2654 case nir_intrinsic_bindless_image_atomic_swap:
2655 ntt_emit_image_load_store(c, instr);
2656 break;
2657
2658 case nir_intrinsic_barrier:
2659 ntt_emit_barrier(c, instr);
2660 break;
2661
2662 case nir_intrinsic_end_primitive:
2663 ntt_ENDPRIM(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2664 break;
2665
2666 case nir_intrinsic_emit_vertex:
2667 ntt_EMIT(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2668 break;
2669
2670 /* In TGSI we don't actually generate the barycentric coords, and emit
2671 * interp intrinsics later. However, we do need to store the
2672 * load_barycentric_at_* argument so that we can use it at that point.
2673 */
2674 case nir_intrinsic_load_barycentric_pixel:
2675 case nir_intrinsic_load_barycentric_centroid:
2676 case nir_intrinsic_load_barycentric_sample:
2677 break;
2678 case nir_intrinsic_load_barycentric_at_sample:
2679 case nir_intrinsic_load_barycentric_at_offset:
2680 ntt_store(c, &instr->def, ntt_get_src(c, instr->src[0]));
2681 break;
2682
2683 case nir_intrinsic_shader_clock:
2684 ntt_CLOCK(c, ntt_get_dest(c, &instr->def));
2685 break;
2686
2687 case nir_intrinsic_decl_reg:
2688 case nir_intrinsic_load_reg:
2689 case nir_intrinsic_load_reg_indirect:
2690 case nir_intrinsic_store_reg:
2691 case nir_intrinsic_store_reg_indirect:
2692 /* fully consumed */
2693 break;
2694
2695 default:
2696 fprintf(stderr, "Unknown intrinsic: ");
2697 nir_print_instr(&instr->instr, stderr);
2698 fprintf(stderr, "\n");
2699 break;
2700 }
2701 }
2702
2703 struct ntt_tex_operand_state {
2704 struct ureg_src srcs[4];
2705 unsigned i;
2706 };
2707
2708 static void
ntt_push_tex_arg(struct ntt_compile *c,
                 nir_tex_instr *instr,
                 nir_tex_src_type tex_src_type,
                 struct ntt_tex_operand_state *s)
2713 {
2714 int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2715 if (tex_src < 0)
2716 return;
2717
2718 nir_src *src = &instr->src[tex_src].src;
2719
2720 /* virglrenderer workaround that's hard to do in tgsi_translate: Make sure
2721 * that TG4's immediate offset arg is float-typed.
2722 */
2723 if (instr->op == nir_texop_tg4 && tex_src_type == nir_tex_src_backend2 &&
2724 nir_src_is_const(*src)) {
2725 nir_const_value *consts = nir_src_as_const_value(*src);
2726 s->srcs[s->i++] = ureg_imm4f(c->ureg,
2727 consts[0].f32,
2728 consts[1].f32,
2729 consts[2].f32,
2730 consts[3].f32);
2731 return;
2732 }
2733
2734 s->srcs[s->i++] = ntt_get_src(c, *src);
2735 }
2736
2737 static void
ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
2739 {
2740 struct ureg_dst dst = ntt_get_dest(c, &instr->def);
2741 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
2742 unsigned tex_opcode;
2743
2744 int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle);
2745 int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle);
2746
2747 struct ureg_src sampler;
2748 if (tex_handle_src >= 0 && sampler_handle_src >= 0) {
2749 /* It seems we can't get separate tex/sampler on GL, just use one of the handles */
2750 sampler = ntt_get_src(c, instr->src[tex_handle_src].src);
2751 assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
2752 } else {
2753 assert(tex_handle_src == -1 && sampler_handle_src == -1);
2754 sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
2755 int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
2756 if (sampler_src >= 0) {
2757 struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
2758 sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr, 2));
2759 }
2760 }
2761
2762 switch (instr->op) {
2763 case nir_texop_tex:
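      /* If the packed backend1 arg carries more components than the padded
       * coordinate (plus shadow comparator), the extra component is
       * presumably a projector, so use TXP instead of TEX.
       */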
2764 if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
2765 MAX2(instr->coord_components, 2) + instr->is_shadow)
2766 tex_opcode = TGSI_OPCODE_TXP;
2767 else
2768 tex_opcode = TGSI_OPCODE_TEX;
2769 break;
2770 case nir_texop_txf:
2771 case nir_texop_txf_ms:
2772 tex_opcode = TGSI_OPCODE_TXF;
2773
2774 if (c->has_txf_lz) {
2775 int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2776 if (lod_src >= 0 &&
2777 nir_src_is_const(instr->src[lod_src].src) &&
2778 ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
2779 tex_opcode = TGSI_OPCODE_TXF_LZ;
2780 }
2781 }
2782 break;
2783 case nir_texop_txl:
2784 tex_opcode = TGSI_OPCODE_TXL;
2785 break;
2786 case nir_texop_txb:
2787 tex_opcode = TGSI_OPCODE_TXB;
2788 break;
2789 case nir_texop_txd:
2790 tex_opcode = TGSI_OPCODE_TXD;
2791 break;
2792 case nir_texop_txs:
2793 tex_opcode = TGSI_OPCODE_TXQ;
2794 break;
2795 case nir_texop_tg4:
2796 tex_opcode = TGSI_OPCODE_TG4;
2797 break;
2798 case nir_texop_query_levels:
2799 tex_opcode = TGSI_OPCODE_TXQ;
2800 break;
2801 case nir_texop_lod:
2802 tex_opcode = TGSI_OPCODE_LODQ;
2803 break;
2804 case nir_texop_texture_samples:
2805 tex_opcode = TGSI_OPCODE_TXQS;
2806 break;
2807 default:
2808 unreachable("unsupported tex op");
2809 }
2810
2811 struct ntt_tex_operand_state s = { .i = 0 };
2812 ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
2813 ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
2814
2815 /* non-coord arg for TXQ */
2816 if (tex_opcode == TGSI_OPCODE_TXQ) {
2817 ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
2818 /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
2819 * scalar
2820 */
2821 s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
2822 }
2823
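   /* If a second packed argument (backend2) was pushed, switch to the
    * two-operand forms of the opcode that know how to consume it.
    */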
2824 if (s.i > 1) {
2825 if (tex_opcode == TGSI_OPCODE_TEX)
2826 tex_opcode = TGSI_OPCODE_TEX2;
2827 if (tex_opcode == TGSI_OPCODE_TXB)
2828 tex_opcode = TGSI_OPCODE_TXB2;
2829 if (tex_opcode == TGSI_OPCODE_TXL)
2830 tex_opcode = TGSI_OPCODE_TXL2;
2831 }
2832
2833 if (instr->op == nir_texop_txd) {
2834 /* Derivs appear in their own src args */
2835 int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
2836 int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2837 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
2838 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
2839 }
2840
2841 if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2842 if (c->screen->get_param(c->screen,
2843 PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
2844 sampler = ureg_scalar(sampler, instr->component);
2845 s.srcs[s.i++] = ureg_src_undef();
2846 } else {
2847 s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
2848 }
2849 }
2850
2851 s.srcs[s.i++] = sampler;
2852
2853 enum tgsi_return_type tex_type;
2854 switch (instr->dest_type) {
2855 case nir_type_float32:
2856 tex_type = TGSI_RETURN_TYPE_FLOAT;
2857 break;
2858 case nir_type_int32:
2859 tex_type = TGSI_RETURN_TYPE_SINT;
2860 break;
2861 case nir_type_uint32:
2862 tex_type = TGSI_RETURN_TYPE_UINT;
2863 break;
2864 default:
2865 unreachable("unknown texture type");
2866 }
2867
2868 struct ureg_dst tex_dst;
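   /* query_levels reads the level count from .w of the TXQ result, so route
    * it through a temp and broadcast that channel at the end.
    */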
2869 if (instr->op == nir_texop_query_levels)
2870 tex_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2871 else
2872 tex_dst = dst;
2873
2874 while (s.i < 4)
2875 s.srcs[s.i++] = ureg_src_undef();
2876
2877 struct ntt_insn *insn = ntt_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
2878 insn->tex_target = target;
2879 insn->tex_return_type = tex_type;
2880 insn->is_tex = true;
2881
2882 int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2883 if (tex_offset_src >= 0) {
2884 struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
2885
2886 insn->tex_offset[0].File = offset.File;
2887 insn->tex_offset[0].Index = offset.Index;
2888 insn->tex_offset[0].SwizzleX = offset.SwizzleX;
2889 insn->tex_offset[0].SwizzleY = offset.SwizzleY;
2890 insn->tex_offset[0].SwizzleZ = offset.SwizzleZ;
2891 insn->tex_offset[0].Padding = 0;
2892 }
2893
2894 if (nir_tex_instr_has_explicit_tg4_offsets(instr)) {
2895 for (uint8_t i = 0; i < 4; ++i) {
2896 struct ureg_src imm = ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]);
2897 insn->tex_offset[i].File = imm.File;
2898 insn->tex_offset[i].Index = imm.Index;
2899 insn->tex_offset[i].SwizzleX = imm.SwizzleX;
2900 insn->tex_offset[i].SwizzleY = imm.SwizzleY;
2901 insn->tex_offset[i].SwizzleZ = imm.SwizzleZ;
2902 }
2903 }
2904
2905 if (instr->op == nir_texop_query_levels)
2906 ntt_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
2907 }
2908
2909 static void
ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
2911 {
2912 switch (jump->type) {
2913 case nir_jump_break:
2914 ntt_BRK(c);
2915 break;
2916
2917 case nir_jump_continue:
2918 ntt_CONT(c);
2919 break;
2920
2921 default:
2922 fprintf(stderr, "Unknown jump instruction: ");
2923 nir_print_instr(&jump->instr, stderr);
2924 fprintf(stderr, "\n");
2925 abort();
2926 }
2927 }
2928
2929 static void
ntt_emit_ssa_undef(struct ntt_compile *c, nir_undef_instr *instr)
2931 {
2932 /* Nothing to do but make sure that we have some storage to deref. */
2933 (void)ntt_get_ssa_def_decl(c, &instr->def);
2934 }
2935
2936 static void
ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2938 {
2939 switch (instr->type) {
2940 case nir_instr_type_deref:
2941 /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2942 break;
2943
2944 case nir_instr_type_alu:
2945 ntt_emit_alu(c, nir_instr_as_alu(instr));
2946 break;
2947
2948 case nir_instr_type_intrinsic:
2949 ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2950 break;
2951
2952 case nir_instr_type_load_const:
2953 /* Nothing to do here, as load consts are done directly from
2954 * ntt_get_src() (since many constant NIR srcs will often get folded
2955 * directly into a register file index instead of as a TGSI src).
2956 */
2957 break;
2958
2959 case nir_instr_type_tex:
2960 ntt_emit_texture(c, nir_instr_as_tex(instr));
2961 break;
2962
2963 case nir_instr_type_jump:
2964 ntt_emit_jump(c, nir_instr_as_jump(instr));
2965 break;
2966
2967 case nir_instr_type_undef:
2968 ntt_emit_ssa_undef(c, nir_instr_as_undef(instr));
2969 break;
2970
2971 default:
2972 fprintf(stderr, "Unknown NIR instr type: ");
2973 nir_print_instr(instr, stderr);
2974 fprintf(stderr, "\n");
2975 abort();
2976 }
2977 }
2978
2979 static void
ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2981 {
2982 if (c->native_integers)
2983 ntt_UIF(c, c->if_cond);
2984 else
2985 ntt_IF(c, c->if_cond);
2986
2987 ntt_emit_cf_list(c, &if_stmt->then_list);
2988
2989 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
2990 ntt_ELSE(c);
2991 ntt_emit_cf_list(c, &if_stmt->else_list);
2992 }
2993
2994 ntt_ENDIF(c);
2995 }
2996
2997 static void
ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2999 {
3000 assert(!nir_loop_has_continue_construct(loop));
3001 ntt_BGNLOOP(c);
3002 ntt_emit_cf_list(c, &loop->body);
3003 ntt_ENDLOOP(c);
3004 }
3005
3006 static void
ntt_emit_block(struct ntt_compile *c, nir_block *block)
3008 {
3009 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
3010 c->cur_block = ntt_block;
3011
3012 nir_foreach_instr(instr, block) {
3013 ntt_emit_instr(c, instr);
3014
3015 /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntt_OPCODE(). */
3016 if (ureg_get_instruction_number(c->ureg) != 0) {
3017 fprintf(stderr, "Emitted ureg insn during: ");
3018 nir_print_instr(instr, stderr);
3019 fprintf(stderr, "\n");
3020 unreachable("emitted ureg insn");
3021 }
3022 }
3023
3024 /* Set up the if condition for ntt_emit_if(), which we have to do before
3025 * freeing up the temps (the "if" is treated as inside the block for liveness
3026 * purposes, despite not being an instruction)
3027 *
3028 * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
3029 * looks at all of .xyzw. No harm in working around the bug.
3030 */
3031 nir_if *nif = nir_block_get_following_if(block);
3032 if (nif)
3033 c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);
3034 }
3035
3036 static void
ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
3038 {
3039 foreach_list_typed(nir_cf_node, node, node, list) {
3040 switch (node->type) {
3041 case nir_cf_node_block:
3042 ntt_emit_block(c, nir_cf_node_as_block(node));
3043 break;
3044
3045 case nir_cf_node_if:
3046 ntt_emit_if(c, nir_cf_node_as_if(node));
3047 break;
3048
3049 case nir_cf_node_loop:
3050 ntt_emit_loop(c, nir_cf_node_as_loop(node));
3051 break;
3052
3053 default:
3054 unreachable("unknown CF type");
3055 }
3056 }
3057 }
3058
3059 static void
ntt_emit_block_ureg(struct ntt_compile *c, struct nir_block *block)
3061 {
3062 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
3063
3064 /* Emit the ntt insns to tgsi_ureg. */
3065 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
3066 const struct tgsi_opcode_info *opcode_info =
3067 tgsi_get_opcode_info(insn->opcode);
3068
3069 switch (insn->opcode) {
3070 case TGSI_OPCODE_UIF:
3071 ureg_UIF(c->ureg, insn->src[0], &c->cf_label);
3072 break;
3073
3074 case TGSI_OPCODE_IF:
3075 ureg_IF(c->ureg, insn->src[0], &c->cf_label);
3076 break;
3077
3078 case TGSI_OPCODE_ELSE:
3079 ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
3080 ureg_ELSE(c->ureg, &c->cf_label);
3081 c->current_if_else = c->cf_label;
3082 break;
3083
3084 case TGSI_OPCODE_ENDIF:
3085 ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
3086 ureg_ENDIF(c->ureg);
3087 break;
3088
3089 case TGSI_OPCODE_BGNLOOP:
3090 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3091 * does reference BGNLOOP's. Follow the former behavior unless something comes up
3092 * with a need.
3093 */
3094 ureg_BGNLOOP(c->ureg, &c->cf_label);
3095 break;
3096
3097 case TGSI_OPCODE_ENDLOOP:
3098 ureg_ENDLOOP(c->ureg, &c->cf_label);
3099 break;
3100
3101 default:
3102 if (insn->is_tex) {
3103 int num_offsets = 0;
3104 for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
3105 if (insn->tex_offset[i].File != TGSI_FILE_NULL)
3106 num_offsets = i + 1;
3107 }
3108 ureg_tex_insn(c->ureg, insn->opcode,
3109 insn->dst, opcode_info->num_dst,
3110 insn->tex_target, insn->tex_return_type,
3111 insn->tex_offset,
3112 num_offsets,
3113 insn->src, opcode_info->num_src);
3114 } else if (insn->is_mem) {
3115 ureg_memory_insn(c->ureg, insn->opcode,
3116 insn->dst, opcode_info->num_dst,
3117 insn->src, opcode_info->num_src,
3118 insn->mem_qualifier,
3119 insn->tex_target,
3120 insn->mem_format);
3121 } else {
3122 ureg_insn(c->ureg, insn->opcode,
3123 insn->dst, opcode_info->num_dst,
3124 insn->src, opcode_info->num_src,
3125 insn->precise);
3126 }
3127 }
3128 }
3129 }
3130
3131 static void
ntt_emit_if_ureg(struct ntt_compile *c, nir_if *if_stmt)
3133 {
3134 /* Note: the last block emitted our IF opcode. */
3135
3136 int if_stack = c->current_if_else;
3137 c->current_if_else = c->cf_label;
3138
3139 /* Either the then or else block includes the ENDIF, which will fix up the
3140 * IF(/ELSE)'s label for jumping
3141 */
3142 ntt_emit_cf_list_ureg(c, &if_stmt->then_list);
3143 ntt_emit_cf_list_ureg(c, &if_stmt->else_list);
3144
3145 c->current_if_else = if_stack;
3146 }
3147
3148 static void
ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list)
3150 {
3151 foreach_list_typed(nir_cf_node, node, node, list) {
3152 switch (node->type) {
3153 case nir_cf_node_block:
3154 ntt_emit_block_ureg(c, nir_cf_node_as_block(node));
3155 break;
3156
3157 case nir_cf_node_if:
3158 ntt_emit_if_ureg(c, nir_cf_node_as_if(node));
3159 break;
3160
3161 case nir_cf_node_loop:
3162 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3163 * does reference BGNLOOP's. Follow the former behavior unless something comes up
3164 * with a need.
3165 */
3166 ntt_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
3167 break;
3168
3169 default:
3170 unreachable("unknown CF type");
3171 }
3172 }
3173 }
3174
3175 static void
ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
3177 {
3178 c->impl = impl;
3179
3180 c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
3181 c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
3182
3183 /* Set up the struct ntt_blocks to put insns in */
3184 c->blocks = _mesa_pointer_hash_table_create(c);
3185 nir_foreach_block(block, impl) {
3186 struct ntt_block *ntt_block = rzalloc(c->blocks, struct ntt_block);
3187 util_dynarray_init(&ntt_block->insns, ntt_block);
3188 _mesa_hash_table_insert(c->blocks, block, ntt_block);
3189 }
3190
3191
3192 ntt_setup_registers(c);
3193
3194 c->cur_block = ntt_block_from_nir(c, nir_start_block(impl));
3195 ntt_setup_inputs(c);
3196 ntt_setup_outputs(c);
3197 ntt_setup_uniforms(c);
3198
3199 /* Emit the ntt insns */
3200 ntt_emit_cf_list(c, &impl->body);
3201
3202 /* Don't do optimized RA if the driver requests it, unless the number of
3203 * temps is too large to be covered by the 16 bit signed int that TGSI
3204 * allocates for the register index */
3205 if (!c->options->unoptimized_ra || c->num_temps > 0x7fff)
3206 ntt_allocate_regs(c, impl);
3207 else
3208 ntt_allocate_regs_unoptimized(c, impl);
3209
3210 /* Turn the ntt insns into actual TGSI tokens */
3211 ntt_emit_cf_list_ureg(c, &impl->body);
3212
3213 ralloc_free(c->liveness);
3214 c->liveness = NULL;
3215
3216 }
3217
3218 static int
type_size(const struct glsl_type *type, bool bindless)
3220 {
3221 return glsl_count_attribute_slots(type, false);
3222 }
3223
3224 /* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
3225 * can handle for 64-bit values in TGSI.
3226 */
3227 static uint8_t
ntt_should_vectorize_instr(const nir_instr *instr, const void *data)
3229 {
3230 if (instr->type != nir_instr_type_alu)
3231 return 0;
3232
3233 nir_alu_instr *alu = nir_instr_as_alu(instr);
3234
3235 switch (alu->op) {
3236 case nir_op_ibitfield_extract:
3237 case nir_op_ubitfield_extract:
3238 case nir_op_bitfield_insert:
3239 /* virglrenderer only looks at the .x channel of the offset/bits operands
3240 * when translating to GLSL. tgsi.rst doesn't seem to require scalar
3241 * offset/bits operands.
3242 *
3243 * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
3244 */
3245 return 1;
3246
3247 default:
3248 break;
3249 }
3250
3251 int src_bit_size = nir_src_bit_size(alu->src[0].src);
3252 int dst_bit_size = alu->def.bit_size;
3253
3254 if (src_bit_size == 64 || dst_bit_size == 64) {
3255 /* Avoid vectorizing 64-bit instructions at all. Despite tgsi.rst
3256 * claiming support, virglrenderer generates bad shaders on the host when
3257 * presented with them. Maybe we can make virgl avoid tickling the
3258 * virglrenderer bugs, but given that glsl-to-TGSI didn't generate vector
3259 * 64-bit instrs in the first place, I don't see much reason to care about
3260 * this.
3261 */
3262 return 1;
3263 }
3264
3265 return 4;
3266 }
3267
3268 static bool
ntt_should_vectorize_io(unsigned align, unsigned bit_size,
                        unsigned num_components, unsigned high_offset,
                        nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                        void *data)
3273 {
3274 if (bit_size != 32)
3275 return false;
3276
   /* Our offset alignment should always be at least 4 bytes */
3278 if (align < 4)
3279 return false;
3280
3281 /* No wrapping off the end of a TGSI reg. We could do a bit better by
3282 * looking at low's actual offset. XXX: With LOAD_CONSTBUF maybe we don't
3283 * need this restriction.
3284 */
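   /* With the minimum 4-byte alignment the combined load could start as far
    * in as .w, so only a single component is safe to merge; with 8-byte
    * alignment the worst-case start is .z, allowing two components.
    */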
3285 unsigned worst_start_component = align == 4 ? 3 : align / 4;
3286 if (worst_start_component + num_components > 4)
3287 return false;
3288
3289 return true;
3290 }
3291
3292 static nir_variable_mode
ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
3294 {
3295 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3296 unsigned indirect_mask = 0;
3297
3298 if (!screen->get_shader_param(screen, pipe_stage,
3299 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
3300 indirect_mask |= nir_var_shader_in;
3301 }
3302
3303 if (!screen->get_shader_param(screen, pipe_stage,
3304 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
3305 indirect_mask |= nir_var_shader_out;
3306 }
3307
3308 if (!screen->get_shader_param(screen, pipe_stage,
3309 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
3310 indirect_mask |= nir_var_function_temp;
3311 }
3312
3313 return indirect_mask;
3314 }
3315
3316 static void
ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen,
                 const struct nir_to_tgsi_options *options)
3319 {
3320 bool progress;
3321 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3322 unsigned control_flow_depth =
3323 screen->get_shader_param(screen, pipe_stage,
3324 PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
3325 do {
3326 progress = false;
3327
3328 NIR_PASS_V(s, nir_lower_vars_to_ssa);
3329 NIR_PASS_V(s, nir_split_64bit_vec3_and_vec4);
3330
3331 NIR_PASS(progress, s, nir_copy_prop);
3332 NIR_PASS(progress, s, nir_opt_algebraic);
3333 NIR_PASS(progress, s, nir_opt_constant_folding);
3334 NIR_PASS(progress, s, nir_opt_remove_phis);
3335 NIR_PASS(progress, s, nir_opt_conditional_discard);
3336 NIR_PASS(progress, s, nir_opt_dce);
3337 NIR_PASS(progress, s, nir_opt_dead_cf);
3338 NIR_PASS(progress, s, nir_opt_cse);
3339 NIR_PASS(progress, s, nir_opt_find_array_copies);
3340 NIR_PASS(progress, s, nir_opt_copy_prop_vars);
3341 NIR_PASS(progress, s, nir_opt_dead_write_vars);
3342
3343 NIR_PASS(progress, s, nir_opt_if, nir_opt_if_optimize_phi_true_false);
3344 NIR_PASS(progress, s, nir_opt_peephole_select,
3345 control_flow_depth == 0 ? ~0 : 8, true, true);
3346 NIR_PASS(progress, s, nir_opt_algebraic);
3347 NIR_PASS(progress, s, nir_opt_constant_folding);
3348 nir_load_store_vectorize_options vectorize_opts = {
3349 .modes = nir_var_mem_ubo,
3350 .callback = ntt_should_vectorize_io,
3351 .robust_modes = 0,
3352 };
3353 NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
3354 NIR_PASS(progress, s, nir_opt_shrink_stores, true);
3355 NIR_PASS(progress, s, nir_opt_shrink_vectors, false);
3356 NIR_PASS(progress, s, nir_opt_loop);
3357 NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
3358 NIR_PASS(progress, s, nir_opt_undef);
3359 NIR_PASS(progress, s, nir_opt_loop_unroll);
3360
3361 /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
3362 * and ALU ops for it.
3363 */
3364 nir_opt_offsets_options offset_options = {
3365 .ubo_vec4_max = ~0,
3366
3367 /* No const offset in TGSI for shared accesses. */
3368 .shared_max = 0,
3369
3370 /* unused intrinsics */
3371 .uniform_max = 0,
3372 .buffer_max = 0,
3373 };
3374
3375 if (options->ubo_vec4_max)
3376 offset_options.ubo_vec4_max = options->ubo_vec4_max;
3377
3378 NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
3379 } while (progress);
3380
3381 NIR_PASS_V(s, nir_lower_var_copies);
3382 }
3383
/* Scalarizes all 64-bit ALU ops. Note that we only actually need to
 * scalarize vec3/vec4s, should probably fix that.
 */
static bool
scalarize_64bit(const nir_instr *instr, const void *data)
{
   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   return (alu->def.bit_size == 64 ||
           nir_src_bit_size(alu->src[0].src) == 64);
}

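/* Splits a 64-bit vec3/vec4 load or store intrinsic into a vec2 half plus a
 * remainder half, since a TGSI register slot only holds up to a dvec2: the
 * intrinsic is cloned, the second copy's IO location/base (or its byte/vec4
 * offset) is advanced, and the result is re-assembled (or the stored value
 * split) around the pair.
 */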
static bool
nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
{
   b->cursor = nir_after_instr(&instr->instr);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
   case nir_intrinsic_store_ssbo:
      break;
   default:
      return false;
   }

   if (instr->num_components <= 2)
      return false;

   bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
   if (has_dest) {
      if (instr->def.bit_size != 64)
         return false;
   } else {
      if (nir_src_bit_size(instr->src[0]) != 64)
         return false;
   }

   nir_intrinsic_instr *first =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
   nir_intrinsic_instr *second =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
      break;

   default: {
      nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
      semantics.location++;
      semantics.num_slots--;
      nir_intrinsic_set_io_semantics(second, semantics);

      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
      break;
   }
   }

   first->num_components = 2;
   second->num_components -= 2;
   if (has_dest) {
      first->def.num_components = 2;
      second->def.num_components -= 2;
   }

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   if (has_dest) {
      /* Merge the two loads' results back into a vector. */
      nir_scalar channels[4] = {
         nir_get_scalar(&first->def, 0),
         nir_get_scalar(&first->def, 1),
         nir_get_scalar(&second->def, 0),
         nir_get_scalar(&second->def, second->num_components > 1 ? 1 : 0),
      };
      nir_def *new = nir_vec_scalars(b, channels, instr->num_components);
      nir_def_rewrite_uses(&instr->def, new);
   } else {
      /* Split the src value across the two stores. */
      b->cursor = nir_before_instr(&instr->instr);

      nir_def *src0 = instr->src[0].ssa;
      nir_scalar channels[4] = { 0 };
      for (int i = 0; i < instr->num_components; i++)
         channels[i] = nir_get_scalar(src0, i);

      nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
      nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);

      nir_src_rewrite(&first->src[0], nir_vec_scalars(b, channels, 2));
      nir_src_rewrite(&second->src[0],
                      nir_vec_scalars(b, &channels[2], second->num_components));
   }

   int offset_src = -1;
   uint32_t offset_amount = 16;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ubo:
      offset_src = 1;
      break;
   case nir_intrinsic_load_ubo_vec4:
      offset_src = 1;
      offset_amount = 1;
      break;
   case nir_intrinsic_store_ssbo:
      offset_src = 2;
      break;
   default:
      break;
   }
   if (offset_src != -1) {
      b->cursor = nir_before_instr(&second->instr);
      nir_def *second_offset =
         nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
      nir_src_rewrite(&second->src[offset_src], second_offset);
   }

   /* DCE stores we generated with no writemask (nothing else does this
    * currently).
    */
   if (!has_dest) {
      if (nir_intrinsic_write_mask(first) == 0)
         nir_instr_remove(&first->instr);
      if (nir_intrinsic_write_mask(second) == 0)
         nir_instr_remove(&second->instr);
   }

   nir_instr_remove(&instr->instr);

   return true;
}

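/* Same split for 64-bit vec3/vec4 load_const: emit a vec2 constant plus the
 * remaining one or two channels and vec them back together.
 */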
static bool
nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (instr->def.bit_size != 64 || num_components <= 2)
      return false;

   b->cursor = nir_before_instr(&instr->instr);

   nir_load_const_instr *first =
      nir_load_const_instr_create(b->shader, 2, 64);
   nir_load_const_instr *second =
      nir_load_const_instr_create(b->shader, num_components - 2, 64);

   first->value[0] = instr->value[0];
   first->value[1] = instr->value[1];
   second->value[0] = instr->value[2];
   if (num_components == 4)
      second->value[1] = instr->value[3];

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   nir_def *channels[4] = {
      nir_channel(b, &first->def, 0),
      nir_channel(b, &first->def, 1),
      nir_channel(b, &second->def, 0),
      num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
   };
   nir_def *new = nir_vec(b, channels, num_components);
   nir_def_replace(&instr->def, new);

   return true;
}

static bool
nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
                                      void *data)
{
   switch (instr->type) {
   case nir_instr_type_load_const:
      return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));

   case nir_instr_type_intrinsic:
      return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
   default:
      return false;
   }
}

static bool
nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       nir_to_tgsi_lower_64bit_to_vec2_instr,
                                       nir_metadata_control_flow,
                                       NULL);
}

struct ntt_lower_tex_state {
   nir_scalar channels[8];
   unsigned i;
};

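/* Appends the channels of one tex source (if present) to the vector being
 * built up in the state, then removes that source from the instruction.
 */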
static void
nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
                                nir_tex_instr *instr,
                                nir_tex_src_type tex_src_type,
                                struct ntt_lower_tex_state *s)
{
   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
   if (tex_src < 0)
      return;

   nir_def *def = instr->src[tex_src].src.ssa;
   for (int i = 0; i < def->num_components; i++) {
      s->channels[s->i++] = nir_get_scalar(def, i);
   }

   nir_tex_instr_remove_src(instr, tex_src);
}

/**
 * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
 * src. This lets NIR handle the coalescing of the vec4 rather than trying to
 * manage it on our own, and may lead to more vectorization.
 */
static bool
nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);

   if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
      return false;

   b->cursor = nir_before_instr(instr);

   struct ntt_lower_tex_state s = {0};

   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
   /* We always have at least two slots for the coordinate, even on 1D. */
   s.i = MAX2(s.i, 2);

   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
   s.i = MAX2(s.i, 3);

   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);

   /* XXX: LZ */
   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);

   /* No need to pack undefs in unused channels of the tex instr */
   while (!s.channels[s.i - 1].def)
      s.i--;

   /* Instead of putting undefs in the unused slots of the vecs, just put in
    * another used channel. Otherwise, we'll get unnecessary moves into
    * registers.
    */
   assert(s.channels[0].def != NULL);
   for (int i = 1; i < s.i; i++) {
      if (!s.channels[i].def)
         s.channels[i] = s.channels[0];
   }

   nir_tex_instr_add_src(tex, nir_tex_src_backend1,
                         nir_vec_scalars(b, s.channels, MIN2(s.i, 4)));
   if (s.i > 4)
      nir_tex_instr_add_src(tex, nir_tex_src_backend2,
                            nir_vec_scalars(b, &s.channels[4], s.i - 4));

   return true;
}

static bool
nir_to_tgsi_lower_tex(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       nir_to_tgsi_lower_tex_instr,
                                       nir_metadata_control_flow,
                                       NULL);
}

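/* Swaps in a corrected copy of s->options when the driver's compiler options
 * don't already request the lowerings this translation relies on (and makes
 * lower_fsqrt and indirect unrolling match the screen's caps).
 */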
static void
ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
                    const struct nir_to_tgsi_options *ntt_options)
{
   const struct nir_shader_compiler_options *options = s->options;
   bool lower_fsqrt =
      !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);

   bool force_indirect_unrolling_sampler =
      screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL) < 400;

   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);

   if (!options->lower_extract_byte ||
       !options->lower_extract_word ||
       !options->lower_insert_byte ||
       !options->lower_insert_word ||
       !options->lower_fdph ||
       !options->lower_flrp64 ||
       !options->lower_fmod ||
       !options->lower_uadd_carry ||
       !options->lower_usub_borrow ||
       !options->lower_uadd_sat ||
       !options->lower_usub_sat ||
       !options->lower_uniforms_to_ubo ||
       !options->lower_vector_cmp ||
       options->has_rotate8 ||
       options->has_rotate16 ||
       options->has_rotate32 ||
       options->lower_fsqrt != lower_fsqrt ||
       options->force_indirect_unrolling != no_indirects_mask ||
       force_indirect_unrolling_sampler) {
      nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
      *new_options = *s->options;

      new_options->lower_extract_byte = true;
      new_options->lower_extract_word = true;
      new_options->lower_insert_byte = true;
      new_options->lower_insert_word = true;
      new_options->lower_fdph = true;
      new_options->lower_flrp64 = true;
      new_options->lower_fmod = true;
      new_options->lower_uadd_carry = true;
      new_options->lower_usub_borrow = true;
      new_options->lower_uadd_sat = true;
      new_options->lower_usub_sat = true;
      new_options->lower_uniforms_to_ubo = true;
      new_options->lower_vector_cmp = true;
      new_options->lower_fsqrt = lower_fsqrt;
      new_options->has_rotate8 = false;
      new_options->has_rotate16 = false;
      new_options->has_rotate32 = false;
      new_options->force_indirect_unrolling = no_indirects_mask;
      new_options->force_indirect_unrolling_sampler = force_indirect_unrolling_sampler;

      s->options = new_options;
   }
}

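/* TGSI only has a post-decrement atomic counter opcode, so pre-decrement is
 * rewritten as a post-decrement whose result then has 1 subtracted from it.
 */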
static bool
ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
{
   return (instr->type == nir_instr_type_intrinsic &&
           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
}

static nir_def *
ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
{
   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   nir_def *old_result = &intr->def;
   intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;

   return nir_iadd_imm(b, old_result, -1);
}

static bool
ntt_lower_atomic_pre_dec(nir_shader *s)
{
   return nir_shader_lower_instructions(s,
                                        ntt_lower_atomic_pre_dec_filter,
                                        ntt_lower_atomic_pre_dec_lower, NULL);
}

/* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
static void
nir_to_tgsi_lower_txp(nir_shader *s)
{
   nir_lower_tex_options lower_tex_options = {
      .lower_txp = 0,
   };

   nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;
         nir_tex_instr *tex = nir_instr_as_tex(instr);

         if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
            continue;

         bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
         bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
         bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;

         /* We can do TXP for any tex (not txg) where we can fit all the
          * coordinates and comparator and projector in one vec4 without any
          * other modifiers to add on.
          *
          * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
          * if we get any funny projectors then we just blow them all away.
          */
         if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
            lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
      }
   }

   /* nir_lower_tex must be run even if no options are set, because we need the
    * LOD to be set for query_levels and for non-fragment shaders.
    */
   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
}

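/* Rewrites load_primitive_id system-value reads as reads of a PRIMID shader
 * input (used for geometry shaders, see nir_to_tgsi_options() below).
 */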
static bool
nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
{
   return (instr->type == nir_instr_type_intrinsic &&
           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
}

static nir_def *
nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
{
   nir_variable *var = nir_get_variable_with_location(b->shader, nir_var_shader_in,
                                                      VARYING_SLOT_PRIMITIVE_ID, glsl_uint_type());

   nir_io_semantics semantics = {
      .location = var->data.location,
      .num_slots = 1
   };
   return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
                         .base = var->data.driver_location,
                         .io_semantics = semantics);
}

static bool
nir_lower_primid_sysval_to_input(nir_shader *s)
{
   return nir_shader_lower_instructions(s,
                                        nir_lower_primid_sysval_to_input_filter,
                                        nir_lower_primid_sysval_to_input_lower, NULL);
}

const void *
nir_to_tgsi(struct nir_shader *s,
            struct pipe_screen *screen)
{
   static const struct nir_to_tgsi_options default_ntt_options = {0};
   return nir_to_tgsi_options(s, screen, &default_ntt_options);
}

/* Prevent lower_vec_to_mov from coalescing 64-to-32 conversions and comparisons
 * into unsupported channels of registers.
 */
static bool
ntt_vec_to_mov_writemask_cb(const nir_instr *instr, unsigned writemask, UNUSED const void *_data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   int dst_32 = alu->def.bit_size == 32;
   int src_64 = nir_src_bit_size(alu->src[0].src) == 64;

   if (src_64 && dst_32) {
      int num_srcs = nir_op_infos[alu->op].num_inputs;

      if (num_srcs == 2 || nir_op_infos[alu->op].output_type == nir_type_bool32) {
         /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz
          * instead of .xy. Just support scalar compares storing to .x,
          * GLSL-to-TGSI only ever emitted scalar ops anyway.
          */
         if (writemask != TGSI_WRITEMASK_X)
            return false;
      } else {
         /* TGSI's 64-to-32-bit conversions can only store to .xy (since a TGSI
          * register can only store a dvec2). Don't try to coalesce to write to
          * .zw.
          */
         if (writemask & ~(TGSI_WRITEMASK_XY))
            return false;
      }
   }

   return true;
}

/**
 * Translates the NIR shader to TGSI.
 *
 * This requires some lowering of the NIR shader to prepare it for translation.
 * We take ownership of the NIR shader passed, returning a reference to the new
 * TGSI tokens instead. If you need to keep the NIR, then pass us a clone.
 */
const void *nir_to_tgsi_options(struct nir_shader *s,
                                struct pipe_screen *screen,
                                const struct nir_to_tgsi_options *options)
{
   struct ntt_compile *c;
   const void *tgsi_tokens;
   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
   bool native_integers = screen->get_shader_param(screen,
                                                   pipe_shader_type_from_mesa(s->info.stage),
                                                   PIPE_SHADER_CAP_INTEGERS);
   const struct nir_shader_compiler_options *original_options = s->options;

   ntt_fix_nir_options(screen, s, options);

   /* Lower array indexing on FS inputs. Since we don't set
    * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
    * elements by ureg, and so dynamically indexing them would be invalid.
    * Ideally we would set that ureg flag based on
    * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
    * splitting NIR VS outputs to elements even if the FS doesn't get the
    * corresponding splitting, and virgl depends on TGSI across link boundaries
    * having matching declarations.
    */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
      NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
   }

   /* Lower tesslevel indirect derefs for tessellation shaders. The tess
    * levels are now a compact array variable and nir expects a constant
    * array index into the compact array variable.
    */
   if (s->info.stage == MESA_SHADER_TESS_CTRL ||
       s->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS_V(s, nir_lower_indirect_derefs, 0, UINT32_MAX);
   }

   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
              type_size, (nir_lower_io_options)0);

   nir_to_tgsi_lower_txp(s);
   NIR_PASS_V(s, nir_to_tgsi_lower_tex);

   /* While TGSI can represent PRIMID as either an input or a system value,
    * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
    * depend on that.
    */
   if (s->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(s, nir_lower_primid_sysval_to_input);

   if (s->info.num_abos)
      NIR_PASS_V(s, ntt_lower_atomic_pre_dec);

   if (!original_options->lower_uniforms_to_ubo) {
      NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
                 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
                 !native_integers);
   }

   /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
    * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
    * duplication logic we just make it so that we only see vec2s.
    */
   NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
   NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);

   if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
      NIR_PASS_V(s, nir_lower_ubo_vec4);

   ntt_optimize_nir(s, screen, options);

   NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);

   /* Lower demote_if to if (cond) { demote } because TGSI doesn't have a DEMOTE_IF. */
   NIR_PASS_V(s, nir_lower_discard_if, nir_lower_demote_if_to_cf);

   NIR_PASS_V(s, nir_lower_frexp);

   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, s, nir_opt_algebraic_late);
      if (progress) {
         NIR_PASS_V(s, nir_copy_prop);
         NIR_PASS_V(s, nir_opt_dce);
         NIR_PASS_V(s, nir_opt_cse);
      }
   } while (progress);

   NIR_PASS_V(s, nir_opt_combine_barriers, NULL, NULL);

   if (screen->get_shader_param(screen,
                                pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_INTEGERS)) {
      NIR_PASS_V(s, nir_lower_bool_to_int32);
   } else {
      NIR_PASS_V(s, nir_lower_int_to_float);
      NIR_PASS_V(s, nir_lower_bool_to_float,
                 !options->lower_cmp && !options->lower_fabs);
      /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
      NIR_PASS_V(s, nir_copy_prop);
      NIR_PASS_V(s, nir_opt_dce);
   }

   nir_move_options move_all =
      nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
      nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;

   NIR_PASS_V(s, nir_opt_move, move_all);

   NIR_PASS_V(s, nir_convert_from_ssa, true);
   NIR_PASS_V(s, nir_lower_vec_to_regs, ntt_vec_to_mov_writemask_cb, NULL);

   /* locals_to_reg_intrinsics will leave dead derefs that are good to clean up.
    */
   NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
   NIR_PASS_V(s, nir_opt_dce);

   /* See comment in ntt_get_alu_src for supported modifiers */
   NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs);

   if (NIR_DEBUG(TGSI)) {
      fprintf(stderr, "NIR before translation to TGSI:\n");
      nir_print_shader(s, stderr);
   }

   c = rzalloc(NULL, struct ntt_compile);
   c->screen = screen;
   c->options = options;

   c->needs_texcoord_semantic =
      screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
   c->has_txf_lz =
      screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);

   c->s = s;
   c->native_integers = native_integers;
   c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
   ureg_setup_shader_info(c->ureg, &s->info);
   if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
      ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      /* The draw module's polygon stipple layer doesn't respect the chosen
       * coordinate mode, so leave it as unspecified unless we're actually
       * reading the position in the shader already. See
       * gl-2.1-polygon-stipple-fs on softpipe.
       */
      if ((s->info.inputs_read & VARYING_BIT_POS) ||
          BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
                       s->info.fs.origin_upper_left ?
                       TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);

         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
                       s->info.fs.pixel_center_integer ?
                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
                       TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
      }
   }
   /* Emit the main function */
   nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
   ntt_emit_impl(c, impl);
   ureg_END(c->ureg);

   tgsi_tokens = ureg_get_tokens(c->ureg, NULL);

   if (NIR_DEBUG(TGSI)) {
      fprintf(stderr, "TGSI after translation from NIR:\n");
      tgsi_dump(tgsi_tokens, 0);
   }

   ureg_destroy(c->ureg);

   ralloc_free(c);
   ralloc_free(s);

   return tgsi_tokens;
}

static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
   .fdot_replicates = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_fdph = true,
   .lower_flrp64 = true,
   .lower_fmod = true,
   .lower_uniforms_to_ubo = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_uadd_sat = true,
   .lower_usub_sat = true,
   .lower_vector_cmp = true,
   .lower_int64_options = nir_lower_imul_2x32_64,
   .use_interpolated_input_intrinsics = true,

   /* TGSI doesn't have a semantic for local or global index, just local and
    * workgroup id.
    */
   .lower_cs_local_index_to_id = true,
   .has_ddx_intrinsics = true,
};

/* Returns the default compiler options for drivers with only nir-to-tgsi-based
 * NIR support.
 */
const void *
nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
                                 enum pipe_shader_ir ir,
                                 unsigned shader)
{
   assert(ir == PIPE_SHADER_IR_NIR);
   return &nir_to_tgsi_compiler_options;
}

/** Helper for getting TGSI tokens to store for a pipe_shader_state CSO. */
const void *
pipe_shader_state_to_tgsi_tokens(struct pipe_screen *screen,
                                 const struct pipe_shader_state *cso)
{
   if (cso->type == PIPE_SHADER_IR_NIR) {
      return nir_to_tgsi((nir_shader *)cso->ir.nir, screen);
   } else {
      assert(cso->type == PIPE_SHADER_IR_TGSI);
      /* we need to keep a local copy of the tokens */
      return tgsi_dup_tokens(cso->tokens);
   }
}
