/*
 * Copyright © 2018 Jonathan Marek <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "ir2_private.h"

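/* whether this instruction can be issued on the scalar ALU pipe:
 * it needs a scalar opcode and may only read a single component
 */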
static bool
scalar_possible(struct ir2_instr *instr)
{
   if (instr->alu.scalar_opc == SCALAR_NONE)
      return false;

   return src_ncomp(instr) == 1;
}

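/* whether instruction b can be co-issued as the scalar part of a slot that
 * already has a as its vector part (a == NULL: no vector instruction yet)
 */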
static bool
is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
   if (!a)
      return true;

   /* don't use the same instruction twice */
   if (a == b)
      return false;

   /* PRED_SET must be alone */
   if (b->alu.scalar_opc >= PRED_SETEs &&
       b->alu.scalar_opc <= PRED_SET_RESTOREs)
      return false;

   /* must write to same export (issues otherwise?) */
   return a->alu.export == b->alu.export;
}

/* priority of vector instruction for scheduling (lower=higher prio) */
static unsigned
alu_vector_prio(struct ir2_instr *instr)
{
   if (instr->alu.vector_opc == VECTOR_NONE)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* TODO check src type and ncomps */
   if (instr->src_count == 3)
      return 0;

   if (!scalar_possible(instr))
      return 1;

   return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower=higher prio) */
static unsigned
alu_scalar_prio(struct ir2_instr *instr)
{
   if (!scalar_possible(instr))
      return ~0u;

   /* this case is dealt with later */
   if (instr->src_count > 1)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* PRED to end of block */
   if (instr->alu.scalar_opc >= PRED_SETEs &&
       instr->alu.scalar_opc <= PRED_SET_RESTOREs)
      return 5;

   /* scalar-only instructions get the highest priority */
   return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where we can insert a scalar MOV alongside
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr *
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
       struct ir2_src src1, unsigned *comp)
{
   struct ir2_sched_instr *sched = NULL, *s;
   unsigned i, mask = 0xf;

   /* walk backwards to the earliest point where the mov can be inserted */
   for (i = ctx->instr_sched_count - 1; i > 0; i--) {
      s = &ctx->instr_sched[i - 1];

      if (s->instr && s->instr->block_idx != block_idx)
         break;
      if (s->instr_s && s->instr_s->block_idx != block_idx)
         break;

      if (src1.type == IR2_SRC_SSA) {
         if ((s->instr && s->instr->idx == src1.num) ||
             (s->instr_s && s->instr_s->idx == src1.num))
            break;
      }

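      /* reg_state packs a 4-component write mask per register: register
       * reg_idx uses bits (reg_idx % 8) * 4 .. +3 of reg_state[reg_idx / 8],
       * so mr below is the set of components of this register that appear
       * to still be free at this slot
       */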
      unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf);
      if ((mask & mr) == 0)
         break;

      mask &= mr;
      if (s->instr_s || s->instr->src_count == 3)
         continue;

      if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
         continue;

      sched = s;
   }
   *comp = ffs(mask) - 1;

   if (sched) {
      for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
         s->reg_state[reg_idx / 8] |= 1 << (*comp + reg_idx % 8 * 4);
   }

   return sched;
}

/* case1:
 * in this case, insert a mov to place the 2nd src into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is input/const,
 * but for instrs which have 2 ssa/reg srcs it's not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
   struct ir2_src src0 = instr->src[order];
   struct ir2_src src1 = instr->src[!order];
   struct ir2_sched_instr *sched;
   struct ir2_instr *ins;
   struct ir2_reg *reg;
   unsigned idx, comp;

   switch (src0.type) {
   case IR2_SRC_CONST:
   case IR2_SRC_INPUT:
      return false;
   default:
      break;
   }

   /* TODO, insert needs logic for this */
   if (src1.type == IR2_SRC_REG)
      return false;

   /* we could do something if they match src1.. */
   if (src0.negate || src0.abs)
      return false;

   reg = get_reg_src(ctx, &src0);

   /* the result must not be used elsewhere, since we will overwrite it */
   for (int i = 0; i < 4; i++)
      if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
         return false;

   /* find a place to insert the mov */
   sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
   if (!sched)
      return false;

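   /* the scalar mov itself: implemented with MAXs (used here as a copy),
    * writing src1 into the free component found by insert()
    */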
   ins = &ctx->instr[idx = ctx->instr_count++];
   ins->idx = idx;
   ins->type = IR2_ALU;
   ins->src[0] = src1;
   ins->src_count = 1;
   ins->is_ssa = true;
   ins->ssa.idx = reg->idx;
   ins->ssa.ncomp = 1;
   ins->ssa.comp[0].c = comp;
   ins->alu.scalar_opc = MAXs;
   ins->alu.export = -1;
   ins->alu.write_mask = 1;
   ins->pred = instr->pred;
   ins->block_idx = instr->block_idx;

   instr->src[0] = src0;
   instr->alu.src1_swizzle = comp;

   sched->instr_s = ins;
   return true;
}

/* fill sched with next fetch or (vector and/or scalar) alu instruction */
static int
sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
   struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
   unsigned avail_count = 0;

   instr_alloc_type_t export = ~0u;
   int block_idx = -1;

   /* XXX merge this loop with the other one somehow? */
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;
      if (is_export(instr))
         export = MIN2(export, export_buf(instr->alu.export));
   }

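   /* collect the instructions that could legally be scheduled next:
    * same block, same export buffer, and all source dependencies emitted
    */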
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;

      /* don't mix exports */
      if (is_export(instr) && export_buf(instr->alu.export) != export)
         continue;

      if (block_idx < 0)
         block_idx = instr->block_idx;
      else if (block_idx != instr->block_idx || /* must be same block */
               instr->type == IR2_CF ||         /* CF/MEM must be alone */
               (is_export(instr) && export == SQ_MEMORY))
         break;
      /* this works because IR2_CF is always at the end of a block,
       * and it is somewhat the same idea with MEM exports, which might not
       * be alone but will at least end up in-order
       */

      /* check if dependencies are satisfied */
      bool is_ok = true;
      ir2_foreach_src (src, instr) {
         if (src->type == IR2_SRC_REG) {
            /* need to check if all previous instructions in the block
             * which write the reg have been emitted
             * slow..
             * XXX: check components instead of whole register
             */
            struct ir2_reg *reg = get_reg_src(ctx, src);
            ir2_foreach_instr (p, ctx) {
               if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
                  is_ok &= !p->need_emit;
            }
         } else if (src->type == IR2_SRC_SSA) {
            /* in this case it's easy, just check need_emit */
            is_ok &= !ctx->instr[src->num].need_emit;
         }
      }
      /* don't reorder non-ssa write before read */
      if (!instr->is_ssa) {
         ir2_foreach_instr (p, ctx) {
            if (!p->need_emit || p->idx >= instr->idx)
               continue;

            ir2_foreach_src (src, p) {
               if (get_reg_src(ctx, src) == instr->reg)
                  is_ok = false;
            }
         }
      }
      /* don't reorder across predicates */
      if (avail_count && instr->pred != avail[0]->pred)
         is_ok = false;

      if (!is_ok)
         continue;

      avail[avail_count++] = instr;
   }

   if (!avail_count) {
      assert(block_idx == -1);
      return -1;
   }

   /* priority to FETCH instructions */
   ir2_foreach_avail (instr) {
      if (instr->type == IR2_ALU)
         continue;

      ra_src_free(ctx, instr);
      ra_reg(ctx, get_reg(instr), -1, false, 0);

      instr->need_emit = false;
      sched->instr = instr;
      sched->instr_s = NULL;
      return block_idx;
   }

   /* TODO precompute priorities */

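   /* pick the best vector candidate first, then look for a scalar candidate
    * to pair with it; a scalar-only instruction with better priority than
    * the chosen vector op may displace it (instr_v is dropped when the
    * scalar candidate is not compatible with it)
    */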
   unsigned prio_v = ~0u, prio_s = ~0u, prio;
   ir2_foreach_avail (instr) {
      prio = alu_vector_prio(instr);
      if (prio < prio_v) {
         instr_v = instr;
         prio_v = prio;
      }
   }

   /* TODO can still insert scalar if src_count=3, if smart about it */
   if (!instr_v || instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         bool compat = is_alu_compatible(instr_v, instr);

         prio = alu_scalar_prio(instr);
         if (prio >= prio_v && !compat)
            continue;

         if (prio < prio_s) {
            instr_s = instr;
            prio_s = prio;
            if (!compat)
               instr_v = NULL;
         }
      }
   }

   assert(instr_v || instr_s);

   /* now, we try more complex insertion of vector instruction as scalar
    * TODO: if we are smart we can still insert if instr_v->src_count==3
    */
   if (!instr_s && instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
            continue;

         /* at this point, src_count should always be 2 */
         assert(instr->src_count == 2);

         if (scalarize_case1(ctx, instr, 0)) {
            instr_s = instr;
            break;
         }
         if (scalarize_case1(ctx, instr, 1)) {
            instr_s = instr;
            break;
         }
      }
   }

   /* free src registers */
   if (instr_v) {
      instr_v->need_emit = false;
      ra_src_free(ctx, instr_v);
   }

   if (instr_s) {
      instr_s->need_emit = false;
      ra_src_free(ctx, instr_s);
   }

   /* allocate dst registers */
   if (instr_v)
      ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v),
             instr_v->alu.write_mask);

   if (instr_s)
      ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s),
             instr_s->alu.write_mask);

   sched->instr = instr_v;
   sched->instr_s = instr_s;
   return block_idx;
}

/* scheduling: determine order of instructions */
static void
schedule_instrs(struct ir2_context *ctx)
{
   struct ir2_sched_instr *sched;
   int block_idx;

   /* allocate input registers */
   for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
      if (ctx->input[idx].initialized)
         ra_reg(ctx, &ctx->input[idx], idx, false, 0);

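   /* keep pulling fetch/vector/scalar slots from sched_next() until no
    * schedulable instruction remains
    */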
   for (;;) {
      sched = &ctx->instr_sched[ctx->instr_sched_count++];
      block_idx = sched_next(ctx, sched);
      if (block_idx < 0)
         break;
      memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

      /* catch texture fetch after scheduling and insert the
       * SET_TEX_LOD right before it if necessary
       * TODO clean this up
       */
      struct ir2_instr *instr = sched->instr, *tex_lod;
      if (instr && instr->type == IR2_FETCH && instr->fetch.opc == TEX_FETCH &&
          instr->src_count == 2) {
         /* generate the SET_LOD instruction */
         tex_lod = &ctx->instr[ctx->instr_count++];
         tex_lod->type = IR2_FETCH;
         tex_lod->block_idx = instr->block_idx;
         tex_lod->pred = instr->pred;
         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
         tex_lod->src[0] = instr->src[1];
         tex_lod->src_count = 1;

         sched[1] = sched[0];
         sched->instr = tex_lod;
         ctx->instr_sched_count++;
      }

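      /* if no unemitted instruction from this block remains, its registers
       * can be released
       */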
      bool free_block = true;
      ir2_foreach_instr (instr, ctx)
         free_block &= instr->block_idx != block_idx;
      if (free_block)
         ra_block_free(ctx, block_idx);
   };
   ctx->instr_sched_count--;
}

void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
            struct fd2_shader_stateobj *fp)
{
   struct ir2_context ctx = {};
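   /* a vertex shader compiled without a linked fragment shader is treated
    * as the binning pass variant
    */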
   bool binning = !fp && so->type == MESA_SHADER_VERTEX;

   if (fp)
      so->variant[variant].f = fp->variant[0].f;

   ctx.so = so;
   ctx.info = &so->variant[variant].info;
   ctx.f = &so->variant[variant].f;
   ctx.info->max_reg = -1;

   /* convert nir to internal representation */
   ir2_nir_compile(&ctx, binning);

   /* copy propagate srcs */
   cp_src(&ctx);

   /* get ref_counts and kill non-needed instructions */
   ra_count_refs(&ctx);

   /* remove movs used to write outputs */
   cp_export(&ctx);

   /* instruction order.. and vector->scalar conversions */
   schedule_instrs(&ctx);

   /* finally, assemble to bitcode */
   assemble(&ctx, binning);
}
455