/*
 * Copyright © 2018 Jonathan Marek <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "ir2_private.h"

#include "fd2_program.h"
#include "freedreno_util.h"
#include "nir_legacy.h"

static const nir_shader_compiler_options options = {
   .compact_arrays = true,
   .lower_fpow = true,
   .lower_flrp32 = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_fceil = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   /* .fdot_replicates = true, it is replicated, but it makes things worse */
   .lower_all_io_to_temps = true,
   .vertex_id_zero_based = true, /* it's not implemented anyway */
   .lower_bitops = true,
   .lower_vector_cmp = true,
   .lower_fdph = true,
   .has_fsub = true,
   .has_isub = true,
   .no_integers = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .force_indirect_unrolling = nir_var_all,
   .force_indirect_unrolling_sampler = true,
   .max_unroll_iterations = 32,
};

const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
   return &options;
}

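/* Small helpers around NIR_PASS: OPT runs a pass and evaluates to whether it
 * made progress, OPT_V runs a pass unconditionally without tracking progress.
 */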
#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool this_progress = false;                                              \
      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
      this_progress;                                                           \
   })
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

static void
ir2_optimize_loop(nir_shader *s)
{
   bool progress;
   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      /* progress |= OPT(s, nir_opt_gcm, true); */
      progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_opt_constant_folding);
      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_loop)) {
         progress |= true;
         /* If nir_opt_loop makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_loop_unroll);
      progress |= OPT(s, nir_opt_if, nir_opt_if_optimize_phi_true_false);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);

   } while (progress);
}

/* the trig workarounds are the same as in ir3, but we don't want to include ir3 */
bool ir3_nir_apply_trig_workarounds(nir_shader *shader);

int
ir2_optimize_nir(nir_shader *s, bool lower)
{
   struct nir_lower_tex_options tex_options = {
      .lower_txp = ~0u,
      .lower_rect = 0,
      .lower_invalid_implicit_lod = true,
   };

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(s, stdout);
      debug_printf("----------------------\n");
   }

   OPT_V(s, nir_lower_vars_to_ssa);
   OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out,
         UINT32_MAX);

   if (lower) {
      OPT_V(s, ir3_nir_apply_trig_workarounds);
      OPT_V(s, nir_lower_tex, &tex_options);
   }

   ir2_optimize_loop(s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   /* TODO we don't want shaders writing to depth for depth textures */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      nir_foreach_shader_out_variable (var, s) {
         if (var->data.location == FRAG_RESULT_DEPTH)
            return -1;
      }
   }

   return 0;
}

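/* Packs a float constant into the shader's immediate constant table.  It first
 * tries to reuse components of an already-allocated immediate vec4, building a
 * swizzle to select the matching components; otherwise it allocates a new
 * immediate.  The returned src points at first_immediate + idx in the constant
 * file.
 */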
static struct ir2_src
load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned imm_ncomp, swiz, idx, i, j;
   uint32_t *value = (uint32_t *)value_f;

   /* try to merge with existing immediate (TODO: try with neg) */
   for (idx = 0; idx < so->num_immediates; idx++) {
      swiz = 0;
      imm_ncomp = so->immediates[idx].ncomp;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            if (j == 4)
               break;
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      /* matched all components */
      if (i == ncomp)
         break;
   }

   /* need to allocate new immediate */
   if (idx == so->num_immediates) {
      swiz = 0;
      imm_ncomp = 0;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == ctx->so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      so->num_immediates++;
   }
   so->immediates[idx].ncomp = imm_ncomp;

   if (ncomp == 1)
      swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);

   return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
}

struct ir2_src
ir2_zero(struct ir2_context *ctx)
{
   return load_const(ctx, (float[]){0.0f}, 1);
}

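/* Tracks register liveness across loops: a value referenced from a deeper loop
 * nesting than where it was defined must stay live until the end of the
 * enclosing loop, and values defined inside a loop are kept live until the end
 * of that loop.
 */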
static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
   if (!reg->initialized) {
      reg->initialized = true;
      reg->loop_depth = ctx->loop_depth;
   }

   if (ctx->loop_depth > reg->loop_depth) {
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
   } else {
      reg->loop_depth = ctx->loop_depth;
      reg->block_idx_free = -1;
   }

   /* for regs we want to free at the end of the loop in any case
    * XXX don't do this for SSA
    */
   if (reg->loop_depth)
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}

static struct ir2_src
make_legacy_src(struct ir2_context *ctx, nir_legacy_src src)
{
   struct ir2_src res = {};
   struct ir2_reg *reg;

   /* Handle constants specially */
   if (src.is_ssa) {
      nir_const_value *const_value =
         nir_src_as_const_value(nir_src_for_ssa(src.ssa));

      if (const_value) {
         float c[src.ssa->num_components];
         nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
         return load_const(ctx, c, src.ssa->num_components);
      }
   }

   /* Otherwise translate the SSA def or register */
   if (!src.is_ssa) {
      res.num = src.reg.handle->index;
      res.type = IR2_SRC_REG;
      reg = &ctx->reg[res.num];
   } else {
      assert(ctx->ssa_map[src.ssa->index] >= 0);
      res.num = ctx->ssa_map[src.ssa->index];
      res.type = IR2_SRC_SSA;
      reg = &ctx->instr[res.num].ssa;
   }

   update_range(ctx, reg);
   return res;
}

static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
   return make_legacy_src(ctx, nir_legacy_chase_src(&src));
}

static void
set_legacy_index(struct ir2_context *ctx, nir_legacy_dest dst,
                 struct ir2_instr *instr)
{
   struct ir2_reg *reg = &instr->ssa;

   if (dst.is_ssa) {
      ctx->ssa_map[dst.ssa->index] = instr->idx;
   } else {
      reg = &ctx->reg[dst.reg.handle->index];

      instr->is_ssa = false;
      instr->reg = reg;
   }
   update_range(ctx, reg);
}

static void
set_index(struct ir2_context *ctx, nir_def *def, struct ir2_instr *instr)
{
   set_legacy_index(ctx, nir_legacy_chase_dest(def), instr);
}

static struct ir2_instr *
ir2_instr_create(struct ir2_context *ctx, int type)
{
   struct ir2_instr *instr;

   instr = &ctx->instr[ctx->instr_count++];
   instr->idx = ctx->instr_count - 1;
   instr->type = type;
   instr->block_idx = ctx->block_idx;
   instr->pred = ctx->pred;
   instr->is_ssa = true;
   return instr;
}

static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
   /* emit_alu will fixup instrs that don't map directly */
   static const struct ir2_opc {
      int8_t scalar, vector;
   } nir_ir2_opc[nir_num_opcodes + 1] = {
      [0 ... nir_num_opcodes - 1] = {-1, -1},

      [nir_op_mov] = {MAXs, MAXv},
      [nir_op_fneg] = {MAXs, MAXv},
      [nir_op_fabs] = {MAXs, MAXv},
      [nir_op_fsat] = {MAXs, MAXv},
      [nir_op_fsign] = {-1, CNDGTEv},
      [nir_op_fadd] = {ADDs, ADDv},
      [nir_op_fsub] = {ADDs, ADDv},
      [nir_op_fmul] = {MULs, MULv},
      [nir_op_ffma] = {-1, MULADDv},
      [nir_op_fmax] = {MAXs, MAXv},
      [nir_op_fmin] = {MINs, MINv},
      [nir_op_ffloor] = {FLOORs, FLOORv},
      [nir_op_ffract] = {FRACs, FRACv},
      [nir_op_ftrunc] = {TRUNCs, TRUNCv},
      [nir_op_fdot2] = {-1, DOT2ADDv},
      [nir_op_fdot3] = {-1, DOT3v},
      [nir_op_fdot4] = {-1, DOT4v},
      [nir_op_sge] = {-1, SETGTEv},
      [nir_op_slt] = {-1, SETGTv},
      [nir_op_sne] = {-1, SETNEv},
      [nir_op_seq] = {-1, SETEv},
      [nir_op_fcsel] = {-1, CNDEv},
      [nir_op_frsq] = {RECIPSQ_IEEE, -1},
      [nir_op_frcp] = {RECIP_IEEE, -1},
      [nir_op_flog2] = {LOG_IEEE, -1},
      [nir_op_fexp2] = {EXP_IEEE, -1},
      [nir_op_fsqrt] = {SQRT_IEEE, -1},
      [nir_op_fcos] = {COS, -1},
      [nir_op_fsin] = {SIN, -1},
   /* no fsat, fneg, fabs since source mods deal with those */

   /* so we can use this function with non-nir op */
#define ir2_op_cube nir_num_opcodes
      [ir2_op_cube] = {-1, CUBEv},
   };

   struct ir2_opc op = nir_ir2_opc[opcode];
   assert(op.vector >= 0 || op.scalar >= 0);

   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
   instr->alu.vector_opc = op.vector;
   instr->alu.scalar_opc = op.scalar;
   instr->alu.export = -1;
   instr->alu.write_mask = (1 << ncomp) - 1;
   instr->src_count =
      opcode == ir2_op_cube ? 2 : nir_op_infos[opcode].num_inputs;
   instr->ssa.ncomp = ncomp;
   return instr;
}

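/* Like instr_create_alu, but the result is written to a (non-SSA) register
 * with an explicit write mask, optionally sharing the register of a previous
 * instruction so multi-instruction sequences can build up one value.
 */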
static struct ir2_instr *
instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask,
                     struct ir2_instr *share_reg)
{
   struct ir2_instr *instr;
   struct ir2_reg *reg;

   reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
   reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);

   instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
   instr->alu.write_mask = write_mask;
   instr->reg = reg;
   instr->is_ssa = false;
   return instr;
}

static struct ir2_instr *
instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_def *def)
{
   struct ir2_instr *instr;
   instr = instr_create_alu(ctx, opcode, def->num_components);
   set_index(ctx, def, instr);
   return instr;
}

static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context *ctx, nir_def *def,
                       instr_fetch_opc_t opc)
{
   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
   instr->fetch.opc = opc;
   instr->src_count = 1;
   instr->ssa.ncomp = def->num_components;
   set_index(ctx, def, instr);
   return instr;
}

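/* Like make_src, but guarantees the result is not a constant-file src:
 * constant values are first copied through a mov, so callers that can't
 * source constants directly (e.g. fetch instructions) get an SSA src instead.
 */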
static struct ir2_src
make_src_noconst(struct ir2_context *ctx, nir_src src)
{
   struct ir2_instr *instr;

   if (nir_src_as_const_value(src)) {
      instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
      instr->src[0] = make_src(ctx, src);
      return ir2_src(instr->idx, 0, IR2_SRC_SSA);
   }

   return make_src(ctx, src);
}

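/* Emits a NIR ALU instruction: folds fneg/fabs/fsat into source/dest modifiers
 * where nir_legacy says they fold, compresses source swizzles against the dest
 * write mask, then patches up the few ops that don't map 1:1 to a2xx opcodes
 * (operand swaps, fsub via negate, fdot2's third operand, fsign).
 */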
static void
emit_alu(struct ir2_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   nir_def *def = &alu->def;
   struct ir2_instr *instr;
   struct ir2_src tmp;
   unsigned ncomp;

   /* Don't emit modifiers that are totally folded */
   if (((alu->op == nir_op_fneg) || (alu->op == nir_op_fabs)) &&
       nir_legacy_float_mod_folds(alu))
      return;

   if ((alu->op == nir_op_fsat) && nir_legacy_fsat_folds(alu))
      return;

   /* get the number of dst components */
   ncomp = def->num_components;

   instr = instr_create_alu(ctx, alu->op, ncomp);

   nir_legacy_alu_dest legacy_dest =
      nir_legacy_chase_alu_dest(&alu->def);
   set_legacy_index(ctx, legacy_dest.dest, instr);
   instr->alu.saturate = legacy_dest.fsat;
   instr->alu.write_mask = legacy_dest.write_mask;

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *src = &alu->src[i];

      /* compress swizzle with writemask when applicable */
      unsigned swiz = 0, j = 0;
      for (int i = 0; i < 4; i++) {
         if (!(legacy_dest.write_mask & 1 << i) && !info->output_size)
            continue;
         swiz |= swiz_set(src->swizzle[i], j++);
      }

      nir_legacy_alu_src legacy_src =
         nir_legacy_chase_alu_src(src, true /* fuse_abs */);

      instr->src[i] = make_legacy_src(ctx, legacy_src.src);
      instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
      instr->src[i].negate = legacy_src.fneg;
      instr->src[i].abs = legacy_src.fabs;
   }

   /* workarounds for NIR ops that don't map directly to a2xx ops */
   switch (alu->op) {
   case nir_op_fneg:
      instr->src[0].negate = 1;
      break;
   case nir_op_fabs:
      instr->src[0].abs = 1;
      break;
   case nir_op_fsat:
      instr->alu.saturate = 1;
      break;
   case nir_op_slt:
      tmp = instr->src[0];
      instr->src[0] = instr->src[1];
      instr->src[1] = tmp;
      break;
   case nir_op_fcsel:
      tmp = instr->src[1];
      instr->src[1] = instr->src[2];
      instr->src[2] = tmp;
      break;
   case nir_op_fsub:
      instr->src[1].negate = !instr->src[1].negate;
      break;
   case nir_op_fdot2:
      instr->src_count = 3;
      instr->src[2] = ir2_zero(ctx);
      break;
   case nir_op_fsign: {
      /* we need an extra instruction to deal with the zero case */
      struct ir2_instr *tmp;

      /* tmp = x == 0 ? 0 : 1 */
      tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
      tmp->src[0] = instr->src[0];
      tmp->src[1] = ir2_zero(ctx);
      tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1);

      /* result = x >= 0 ? tmp : -tmp */
      instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[2] = instr->src[1];
      instr->src[2].negate = true;
      instr->src_count = 3;
   } break;
   default:
      break;
   }
}

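/* Loads a shader input.  For vertex shaders this becomes a vertex fetch whose
 * fetch constant for attribute idx lives at const 20 + idx / 3, slot idx % 3.
 * For fragment shaders the input register is simply moved into the def, except
 * gl_FragCoord, which is reassembled from the param register (xy) and the
 * fragcoord varying (zw, with w needing a reciprocal).
 */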
static void
load_input(struct ir2_context *ctx, nir_def *def, unsigned idx)
{
   struct ir2_instr *instr;
   int slot = -1;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      instr = ir2_instr_create_fetch(ctx, def, 0);
      instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
      instr->fetch.vtx.const_idx = 20 + (idx / 3);
      instr->fetch.vtx.const_idx_sel = idx % 3;
      return;
   }

   /* get slot from idx */
   nir_foreach_shader_in_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot >= 0);

   switch (slot) {
   case VARYING_SLOT_POS:
      /* need to extract xy with abs and add tile offset on a20x
       * zw from fragcoord input (w inverted in fragment shader)
       * TODO: only components that are required by fragment shader
       */
      instr = instr_create_alu_reg(
         ctx, ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
      instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
      instr->src[0].abs = true;
      /* on a20x, C64 contains the tile offset */
      instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

      instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

      unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
      instr = instr_create_alu_dest(ctx, nir_op_mov, def);
      instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      break;
   default:
      instr = instr_create_alu_dest(ctx, nir_op_mov, def);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
      break;
   }
}

static unsigned
output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   int slot = -1;
   unsigned idx = nir_intrinsic_base(intr);
   nir_foreach_shader_out_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot != -1);
   return slot;
}

static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot,
             unsigned ncomp)
{
   struct ir2_instr *instr;
   unsigned idx = 0;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      switch (slot) {
      case VARYING_SLOT_POS:
         ctx->position = make_src(ctx, src);
         idx = 62;
         break;
      case VARYING_SLOT_PSIZ:
         ctx->so->writes_psize = true;
         idx = 63;
         break;
      default:
         /* find matching slot from fragment shader input */
         for (idx = 0; idx < ctx->f->inputs_count; idx++)
            if (ctx->f->inputs[idx].slot == slot)
               break;
         if (idx == ctx->f->inputs_count)
            return;
      }
   } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
      /* only color output is implemented */
      return;
   }

   instr = instr_create_alu(ctx, nir_op_mov, ncomp);
   instr->src[0] = make_src(ctx, src);
   instr->alu.export = idx;
}

static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir2_instr *instr;
   ASSERTED nir_const_value *const_offset;
   unsigned idx;

   switch (intr->intrinsic) {
   case nir_intrinsic_decl_reg:
   case nir_intrinsic_load_reg:
   case nir_intrinsic_store_reg:
      /* Nothing to do for these */
      break;

   case nir_intrinsic_load_input:
      load_input(ctx, &intr->def, nir_intrinsic_base(intr));
      break;
   case nir_intrinsic_store_output:
      store_output(ctx, intr->src[0], output_slot(ctx, intr),
                   intr->num_components);
      break;
   case nir_intrinsic_load_uniform:
      const_offset = nir_src_as_const_value(intr->src[0]);
      assert(const_offset); /* TODO can be false in ES2? */
      idx = nir_intrinsic_base(intr);
      idx += (uint32_t)const_offset[0].f32;
      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->def);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
      break;
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->alu.vector_opc = VECTOR_NONE;
      if (intr->intrinsic == nir_intrinsic_terminate_if) {
         instr->alu.scalar_opc = KILLNEs;
         instr->src[0] = make_src(ctx, intr->src[0]);
      } else {
         instr->alu.scalar_opc = KILLEs;
         instr->src[0] = ir2_zero(ctx);
      }
      instr->alu.export = -1;
      instr->src_count = 1;
      ctx->so->has_kill = true;
      break;
   case nir_intrinsic_load_front_face:
      /* gl_FrontFacing is in the sign of param.x
       * rcp required because otherwise we can't differentiate -0.0 and +0.0
       */
      ctx->so->need_param = true;

      struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
      tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->def);
      instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[1] = ir2_zero(ctx);
      break;
   case nir_intrinsic_load_point_coord:
      /* param.zw (note: abs might be needed like fragcoord in param.xy?) */
      ctx->so->need_param = true;

      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->def);
      instr->src[0] =
         ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
      break;
   default:
      compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
      break;
   }
}

static void
emit_tex(struct ir2_context *ctx, nir_tex_instr *tex)
{
   bool is_rect = false, is_cube = false;
   struct ir2_instr *instr;
   nir_src *coord, *lod_bias;

   coord = lod_bias = NULL;

   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = &tex->src[i].src;
         break;
      case nir_tex_src_bias:
      case nir_tex_src_lod:
         assert(!lod_bias);
         lod_bias = &tex->src[i].src;
         break;
      default:
         compile_error(ctx, "Unhandled NIR tex src type: %d\n",
                       tex->src[i].src_type);
         return;
      }
   }

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txl:
      break;
   default:
      compile_error(ctx, "unimplemented texop %d\n", tex->op);
      return;
   }

   switch (tex->sampler_dim) {
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      break;
   case GLSL_SAMPLER_DIM_RECT:
      is_rect = true;
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      is_cube = true;
      break;
   default:
      compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
      return;
   }

   struct ir2_src src_coord = make_src_noconst(ctx, *coord);

   /* for cube maps
    * tmp = cube(coord)
    * tmp.xy = tmp.xy / |tmp.z| + 1.5
    * coord = tmp.xyw
    */
   if (is_cube) {
      struct ir2_instr *rcp, *coord_xy;
      unsigned reg_idx;

      instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
      instr->src[0] = src_coord;
      instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
      instr->src[1] = src_coord;
      instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

      reg_idx = instr->reg - ctx->reg; /* hacky */

      rcp = instr_create_alu(ctx, nir_op_frcp, 1);
      rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
      rcp->src[0].abs = true;

      coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
      coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1);

      src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
      /* TODO: lod/bias transformed by src_coord.z ? */
   }

   instr = ir2_instr_create_fetch(ctx, &tex->def, TEX_FETCH);
   instr->src[0] = src_coord;
   instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
   instr->fetch.tex.is_cube = is_cube;
   instr->fetch.tex.is_rect = is_rect;
   instr->fetch.tex.samp_id = tex->sampler_index;

   /* for lod/bias, we insert an extra src for the backend to deal with */
   if (lod_bias) {
      instr->src[1] = make_src_noconst(ctx, *lod_bias);
      /* backend will use 2-3 components so apply swizzle */
      swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
      instr->src_count = 2;
   }
}

static void
setup_input(struct ir2_context *ctx, nir_variable *in)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned n = in->data.driver_location;
   unsigned slot = in->data.location;

   assert(glsl_type_is_vector_or_scalar(in->type) ||
          glsl_type_is_unsized_array(in->type));

   /* handle later */
   if (ctx->so->type == MESA_SHADER_VERTEX)
      return;

   if (ctx->so->type != MESA_SHADER_FRAGMENT)
      compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

   n = ctx->f->inputs_count++;

   /* half of fragcoord from param reg, half from a varying */
   if (slot == VARYING_SLOT_POS) {
      ctx->f->fragcoord = n;
      so->need_param = true;
   }

   ctx->f->inputs[n].slot = slot;
   ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

   /* in->data.interpolation?
    * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
    */
}

static void
emit_undef(struct ir2_context *ctx, nir_undef_instr *undef)
{
   /* TODO we don't want to emit anything for undefs */

   struct ir2_instr *instr;

   instr = instr_create_alu_dest(ctx, nir_op_mov, &undef->def);
   instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
}

static void
emit_instr(struct ir2_context *ctx, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(ctx, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_deref:
      /* ignored, handled as part of the intrinsic they are src to */
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_load_const:
      /* dealt with when using nir_src */
      break;
   case nir_instr_type_tex:
      emit_tex(ctx, nir_instr_as_tex(instr));
      break;
   case nir_instr_type_jump:
      ctx->block_has_jump[ctx->block_idx] = true;
      break;
   case nir_instr_type_undef:
      emit_undef(ctx, nir_instr_as_undef(instr));
      break;
   default:
      break;
   }
}

/* fragcoord.zw and a20x hw binning outputs */
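/* The clip-space position is scaled by 1 / max(w, 0) and run through an ffma
 * with consts 66/65 to produce window coordinates; z and w are exported to the
 * fragcoord input when the fragment shader reads it.  For the binning variant,
 * extra exports 32/33 are emitted from the per-bin constants for a20x hardware
 * binning (unneeded instances are patched out later).
 */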
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
   struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

   if (ctx->f->fragcoord < 0 && !binning)
      return;

   instr = instr_create_alu(ctx, nir_op_fmax, 1);
   instr->src[0] = ctx->position;
   instr->src[0].swizzle = IR2_SWIZZLE_W;
   instr->src[1] = ir2_zero(ctx);

   rcp = instr_create_alu(ctx, nir_op_frcp, 1);
   rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

   sc = instr_create_alu(ctx, nir_op_fmul, 4);
   sc->src[0] = ctx->position;
   sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

   wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
   wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
   wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
   wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

   /* fragcoord z/w */
   if (ctx->f->fragcoord >= 0 && !binning) {
      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
      instr->alu.export = ctx->f->fragcoord;

      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ctx->position;
      instr->src[0].swizzle = IR2_SWIZZLE_W;
      instr->alu.export = ctx->f->fragcoord;
      instr->alu.write_mask = 2;
   }

   if (!binning)
      return;

   off = instr_create_alu(ctx, nir_op_fadd, 1);
   off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
   off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

   /* max of 8 set in freedreno_screen; unneeded instrs are patched out */
   for (int i = 0; i < 8; i++) {
      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
      instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
      instr->alu.export = 32;

      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
      instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
      instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
      instr->alu.export = 33;
   }
}

static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);

static bool
emit_block(struct ir2_context *ctx, nir_block *block)
{
   struct ir2_instr *instr;
   nir_block *succs = block->successors[0];

   ctx->block_idx = block->index;

   nir_foreach_instr (instr, block)
      emit_instr(ctx, instr);

   if (!succs || !succs->index)
      return false;

   /* ideally we would always jump and let the backend clean up,
    * but we don't, so there are two cases where a jump is needed:
    *  loops (successor index is lower)
    *  jumps (a jump instruction was seen in the block)
    */
   if (succs->index > block->index && !ctx->block_has_jump[block->index])
      return false;

   assert(block->successors[1] == NULL);

   instr = ir2_instr_create(ctx, IR2_CF);
   instr->cf.block_idx = succs->index;
   /* XXX can't jump to a block with different predicate */
   return true;
}

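/* If/else is handled with predication rather than jumps: the condition sets
 * the predicate (PRED_SETNEs, or PRED_SETNE_PUSHv when nested inside another
 * if), the predicate is inverted with PRED_SET_INVs before the else list, and
 * PRED_SET_POPs restores the outer predicate for nested ifs.
 */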
static void
emit_if(struct ir2_context *ctx, nir_if *nif)
{
   unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
   struct ir2_instr *instr;

   /* XXX: blob seems to always use same register for condition */

   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = make_src(ctx, nif->condition);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = SCALAR_NONE;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;

   /* if nested, use PRED_SETNE_PUSHv */
   if (pred) {
      instr->alu.vector_opc = PRED_SETNE_PUSHv;
      instr->src[1] = instr->src[0];
      instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
      instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
      instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
      instr->src_count = 2;
   } else {
      instr->alu.scalar_opc = PRED_SETNEs;
   }

   ctx->pred_idx = instr->idx;
   ctx->pred = 3;

   emit_cf_list(ctx, &nif->then_list);

   /* TODO: if there is no else branch we don't need this,
    * and if the else branch is simple, we could just flip ctx->pred instead
    */
   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = PRED_SET_INVs;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;
   ctx->pred_idx = instr->idx;

   emit_cf_list(ctx, &nif->else_list);

   /* restore predicate for nested predicates */
   if (pred) {
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
      instr->src_count = 1;
      instr->ssa.ncomp = 1;
      instr->alu.vector_opc = VECTOR_NONE;
      instr->alu.scalar_opc = PRED_SET_POPs;
      instr->alu.export = -1;
      instr->alu.write_mask = 1;
      instr->pred = 0;
      ctx->pred_idx = instr->idx;
   }

   /* restore ctx->pred */
   ctx->pred = pred;
}

/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
   nir_cf_node *node =
      exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
   switch (node->type) {
   case nir_cf_node_block:
      return nir_cf_node_as_block(node)->index;
   case nir_cf_node_if:
      assert(0); /* XXX could this ever happen? */
      return 0;
   case nir_cf_node_loop:
      return loop_last_block(&nir_cf_node_as_loop(node)->body);
   default:
      compile_error(ctx, "Not supported\n");
      return 0;
   }
}

static void
emit_loop(struct ir2_context *ctx, nir_loop *nloop)
{
   assert(!nir_loop_has_continue_construct(nloop));
   ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
   emit_cf_list(ctx, &nloop->body);
   ctx->loop_depth--;
}

static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
{
   bool ret = false;
   foreach_list_typed (nir_cf_node, node, node, list) {
      ret = false;
      switch (node->type) {
      case nir_cf_node_block:
         ret = emit_block(ctx, nir_cf_node_as_block(node));
         break;
      case nir_cf_node_if:
         emit_if(ctx, nir_cf_node_as_if(node));
         break;
      case nir_cf_node_loop:
         emit_loop(ctx, nir_cf_node_as_loop(node));
         break;
      case nir_cf_node_function:
         compile_error(ctx, "Not supported\n");
         break;
      }
   }
   return ret;
}

static void
cleanup_binning(struct ir2_context *ctx)
{
   assert(ctx->so->type == MESA_SHADER_VERTEX);

   /* kill non-position outputs for binning variant */
   nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         if (output_slot(ctx, intr) != VARYING_SLOT_POS)
            nir_instr_remove(instr);
      }
   }

   ir2_optimize_nir(ctx->nir, false);
}

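/* These ops only exist as scalar instructions on a2xx, so they are the only
 * ones split out of vectors by nir_lower_alu_to_scalar.
 */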
static bool
ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   switch (alu->op) {
   case nir_op_frsq:
   case nir_op_frcp:
   case nir_op_flog2:
   case nir_op_fexp2:
   case nir_op_fsqrt:
   case nir_op_fcos:
   case nir_op_fsin:
      return true;
   default:
      break;
   }

   return false;
}

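/* Main entry point: clones the shader (optionally stripping non-position
 * outputs for the binning variant), lowers ints and bools to floats,
 * scalarizes the scalar-only ALU ops, converts out of SSA and lowers vecN ops
 * to registers, then walks the control-flow list emitting ir2 instructions.
 */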
void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
   struct fd2_shader_stateobj *so = ctx->so;

   memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

   ctx->nir = nir_shader_clone(NULL, so->nir);

   if (binning)
      cleanup_binning(ctx);

   OPT_V(ctx->nir, nir_copy_prop);
   OPT_V(ctx->nir, nir_opt_dce);
   OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);

   OPT_V(ctx->nir, nir_lower_int_to_float);
   OPT_V(ctx->nir, nir_lower_bool_to_float, true);
   while (OPT(ctx->nir, nir_opt_algebraic))
      ;
   OPT_V(ctx->nir, nir_opt_algebraic_late);
   OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL);

   OPT_V(ctx->nir, nir_convert_from_ssa, true);

   OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest, false);
   OPT_V(ctx->nir, nir_lower_vec_to_regs, NULL, NULL);

   OPT_V(ctx->nir, nir_legacy_trivialize, true);

   OPT_V(ctx->nir, nir_opt_dce);

   nir_sweep(ctx->nir);

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(ctx->nir, stdout);
      debug_printf("----------------------\n");
   }

   /* fd2_shader_stateobj init */
   if (so->type == MESA_SHADER_FRAGMENT) {
      ctx->f->fragcoord = -1;
      ctx->f->inputs_count = 0;
      memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
   }

   /* Setup inputs: */
   nir_foreach_shader_in_variable (in, ctx->nir)
      setup_input(ctx, in);

   if (so->type == MESA_SHADER_FRAGMENT) {
      unsigned idx;
      for (idx = 0; idx < ctx->f->inputs_count; idx++) {
         ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
         update_range(ctx, &ctx->input[idx]);
      }
      /* assume we have param input and kill it later if not */
      ctx->input[idx].ncomp = 4;
      update_range(ctx, &ctx->input[idx]);
   } else {
      ctx->input[0].ncomp = 1;
      ctx->input[2].ncomp = 1;
      update_range(ctx, &ctx->input[0]);
      update_range(ctx, &ctx->input[2]);
   }

   /* And emit the body: */
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

   nir_foreach_reg_decl (decl, fxn) {
      assert(decl->def.index < ARRAY_SIZE(ctx->reg));
      ctx->reg[decl->def.index].ncomp = nir_intrinsic_num_components(decl);
      ctx->reg_count = MAX2(ctx->reg_count, decl->def.index + 1);
   }

   nir_metadata_require(fxn, nir_metadata_block_index);
   emit_cf_list(ctx, &fxn->body);
   /* TODO emit_block(ctx, fxn->end_block); */

   if (so->type == MESA_SHADER_VERTEX)
      extra_position_exports(ctx, binning);

   ralloc_free(ctx->nir);

   /* kill unused param input */
   if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
      ctx->input[ctx->f->inputs_count].initialized = false;
}