/*
 * Copyright © 2018 Jonathan Marek <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "ir2_private.h"

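/* For SSA and REG sources the logical swizzle has to be remapped through the
 * register allocator's per-component placement; other source types keep
 * their swizzle unchanged.
 */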
static unsigned
src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   struct ir2_reg_component *comps;
   unsigned swiz = 0;

   switch (src->type) {
   case IR2_SRC_SSA:
   case IR2_SRC_REG:
      break;
   default:
      return src->swizzle;
   }
   /* we need to take into account where the components were allocated */
   comps = get_reg_src(ctx, src)->comp;
   for (int i = 0; i < ncomp; i++) {
      swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
   }
   return swiz;
}

/* alu instrs need to take into account how the output components are allocated */

/* scalar doesn't need to take into account dest swizzle */

static unsigned
alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
{
   /* hardware seems to take from W, but swizzle everywhere just in case */
   return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
}

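/* vector ops: compose the source swizzle with the destination component
 * allocation, so each physically written component reads the value intended
 * for it; comp[].c == 7 appears to mean the component was not allocated.
 */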
static unsigned
alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr,
            struct ir2_src *src)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
   unsigned swiz = 0;

   /* non-per-component special cases */
   switch (instr->alu.vector_opc) {
   case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
      return alu_swizzle_scalar(ctx, src);
   case DOT2ADDv:
   case DOT3v:
   case DOT4v:
   case CUBEv:
      return swiz0;
   default:
      break;
   }

   for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
      if (instr->alu.write_mask & 1 << j) {
         if (comp[j].c != 7)
            swiz |= swiz_set(i, comp[j].c);
         i++;
      }
   }
   return swiz_merge(swiz0, swiz);
}

static unsigned
alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
{
   /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
   unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
   return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
}

/* write_mask needs to be transformed by allocation information */

static unsigned
alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned write_mask = 0;

   for (int i = 0; i < 4; i++) {
      if (instr->alu.write_mask & 1 << i)
         write_mask |= 1 << comp[i].c;
   }

   return write_mask;
}

/* fetch instructions can swizzle dest, but src swizzle needs conversion */

static unsigned
fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
   unsigned swiz = 0;
   for (int i = 0; i < ncomp; i++)
      swiz |= swiz_get(alu_swiz, i) << i * 2;
   return swiz;
}

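/* build the fetch destination swizzle: 3 bits per physical component, each
 * selecting which result component lands there; fields left at 7 appear to
 * mean the component is not written.
 */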
static unsigned
fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned dst_swiz = 0xfff;
   for (int i = 0; i < dst_ncomp(instr); i++) {
      dst_swiz &= ~(7 << comp[i].c * 3);
      dst_swiz |= i << comp[i].c * 3;
   }
   return dst_swiz;
}

/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
   if (is_export(instr))
      return instr->alu.export;

   return get_reg(instr)->idx;
}

/* register # for src */
static unsigned
src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
   return get_reg_src(ctx, src)->idx;
}

static unsigned
src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
   if (src->type == IR2_SRC_CONST) {
      assert(!src->abs); /* no abs bit for const */
      return src->num;
   }
   return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}

/* produce the 12 byte binary instruction for a given sched_instr */
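/* a sched slot holds either a single fetch instruction or a co-issued
 * vector/scalar ALU pair (either half may be absent); the scalar op is
 * encoded in the src3 fields of the ALU instruction word.
 */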
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, instr_t *bc,
           bool *is_fetch)
{
   struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

   *bc = (instr_t){};

   if (instr && instr->type == IR2_FETCH) {
      *is_fetch = true;

      bc->fetch.opc = instr->fetch.opc;
      bc->fetch.pred_select = !!instr->pred;
      bc->fetch.pred_condition = instr->pred & 1;

      struct ir2_src *src = instr->src;

      if (instr->fetch.opc == VTX_FETCH) {
         instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

         assert(instr->fetch.vtx.const_idx <= 0x1f);
         assert(instr->fetch.vtx.const_idx_sel <= 0x3);

         vtx->src_reg = src_to_reg(ctx, src);
         vtx->src_swiz = fetch_swizzle(ctx, src, 1);
         vtx->dst_reg = dst_to_reg(ctx, instr);
         vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

         vtx->must_be_one = 1;
         vtx->const_index = instr->fetch.vtx.const_idx;
         vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

         /* other fields will be patched */

         /* XXX seems like every FETCH but the first has
          * this bit set:
          */
         vtx->reserved3 = instr->idx ? 0x1 : 0x0;
         vtx->reserved0 = instr->idx ? 0x2 : 0x3;
      } else if (instr->fetch.opc == TEX_FETCH) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 3);
         tex->dst_reg = dst_to_reg(ctx, instr);
         tex->dst_swiz = fetch_dst_swiz(ctx, instr);
         /* tex->const_idx = patch_fetches */
         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
         tex->use_reg_lod = instr->src_count == 2;
         tex->sample_location = SAMPLE_CENTER;
         tex->tx_coord_denorm = instr->fetch.tex.is_rect;
      } else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 1);
         tex->dst_reg = 0;
         tex->dst_swiz = 0xfff;

         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = 1;
         tex->use_reg_lod = 0;
         tex->sample_location = SAMPLE_CENTER;
      } else {
         assert(0);
      }
      return;
   }

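   /* ALU case: a slot can pair a vector op (instr_v) with a scalar op
    * (instr_s), either of which may be NULL; a single-source vector op
    * simply repeats src[0] as src2 (replaced with 0.0f for SETxx below)
    */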
   instr_v = sched->instr;
   instr_s = sched->instr_s;

   if (instr_v) {
      struct ir2_src src1, src2, *src3;

      src1 = instr_v->src[0];
      src2 = instr_v->src[instr_v->src_count > 1];
      src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;

      bc->alu.vector_opc = instr_v->alu.vector_opc;
      bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
      bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
      bc->alu.vector_clamp = instr_v->alu.saturate;
      bc->alu.export_data = instr_v->alu.export >= 0;

      /* single operand SETEv, use 0.0f as src2 */
      if (instr_v->src_count == 1 &&
          (bc->alu.vector_opc == SETEv || bc->alu.vector_opc == SETNEv ||
           bc->alu.vector_opc == SETGTv || bc->alu.vector_opc == SETGTEv))
         src2 = ir2_zero(ctx);

      /* export32 instr for a20x hw binning has this bit set..
       * it seems to do more than change the base address of constants
       * XXX this is a hack
       */
      bc->alu.relative_addr =
         (bc->alu.export_data && bc->alu.vector_dest == 32);

      bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
      bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
      bc->alu.src1_reg_negate = src1.negate;
      bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

      bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
      bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
      bc->alu.src2_reg_negate = src2.negate;
      bc->alu.src2_sel = src2.type != IR2_SRC_CONST;

      if (src3) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
         bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
         bc->alu.src3_reg_negate = src3->negate;
         bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
      }

      bc->alu.pred_select = instr_v->pred;
   }

   if (instr_s) {
      struct ir2_src *src = instr_s->src;

      bc->alu.scalar_opc = instr_s->alu.scalar_opc;
      bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
      bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
      bc->alu.scalar_clamp = instr_s->alu.saturate;
      bc->alu.export_data = instr_s->alu.export >= 0;

      if (instr_s->src_count == 1) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      } else {
         assert(instr_s->src_count == 2);

         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz =
            alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      }

      if (instr_v)
         assert(instr_s->pred == instr_v->pred);
      bc->alu.pred_select = instr_s->pred;
   }

   *is_fetch = false;
   return;
}

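/* emit a pending ALLOC (if any) followed by the accumulated EXEC CF, then
 * reset the exec state for the next group of instructions */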
static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
          instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
   assert(exec->count);

   if (alloc)
      cfs[cf_idx++].alloc = *alloc;

   /* record the offset of the memory export alloc for later patching */
   if (alloc && alloc->buffer_select == SQ_MEMORY &&
       ctx->info->mem_export_ptr == -1)
      ctx->info->mem_export_ptr = cf_idx / 2 * 3;

   cfs[cf_idx++].exec = *exec;
   exec->address += exec->count;
   exec->serialize = 0;
   exec->count = 0;

   return cf_idx;
}

/* assemble the final shader */
void
assemble(struct ir2_context *ctx, bool binning)
{
   /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
    * address is 9 bits, so could it be 512?
    */
   instr_cf_t cfs[384];
   instr_t bytecode[384], bc;
   unsigned block_addr[128];
   unsigned num_cf = 0;

   /* CF instr state */
   instr_cf_exec_t exec = {.opc = EXEC};
   instr_cf_alloc_t alloc = {.opc = ALLOC};

   int sync_id, sync_id_prev = -1;
   bool is_fetch = false;
   bool need_sync = true;
   bool need_alloc = false;
   unsigned block_idx = 0;

   ctx->info->mem_export_ptr = -1;
   ctx->info->num_fetch_instrs = 0;

   /* a vertex shader always needs to allocate at least one parameter;
    * if that will never happen otherwise (no fragment shader inputs),
    * emit the alloc here
    */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
      alloc.buffer_select = SQ_PARAMETER_PIXEL;
      cfs[num_cf++].alloc = alloc;
   }

   block_addr[0] = 0;

   for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
      struct ir2_instr *instr = ctx->instr_sched[j].instr;

      /* catch IR2_CF since it isn't a regular instruction */
      if (instr && instr->type == IR2_CF) {
         assert(!need_alloc); /* XXX */

         /* flush any exec cf before inserting jmp */
         if (exec.count)
            num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

         cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t){
            .opc = COND_JMP,
            .address = instr->cf.block_idx, /* will be fixed later */
            .force_call = !instr->pred,
            .predicated_jmp = 1,
            .direction = instr->cf.block_idx > instr->block_idx,
            .condition = instr->pred & 1,
         };
         continue;
      }

      /* fill the 3 dwords for the instruction */
      fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

      /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
      sync_id = 0;
      if (is_fetch)
         sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

      need_sync = sync_id != sync_id_prev;
      sync_id_prev = sync_id;

      unsigned block;
      {
         if (ctx->instr_sched[j].instr)
            block = ctx->instr_sched[j].instr->block_idx;
         else
            block = ctx->instr_sched[j].instr_s->block_idx;

         assert(block_idx <= block);
      }

      /* info for patching */
      if (is_fetch) {
         struct ir2_fetch_info *info =
            &ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
         info->offset = i * 3; /* add cf offset later */

         if (bc.fetch.opc == VTX_FETCH) {
            info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
         } else if (bc.fetch.opc == TEX_FETCH) {
            info->tex.samp_id = instr->fetch.tex.samp_id;
            info->tex.src_swiz = bc.fetch.tex.src_swiz;
         } else {
            ctx->info->num_fetch_instrs--;
         }
      }

      /* flush the exec cf after 6 instrs, on a sync (fetch/alu switch) or at
       * a block boundary */
      if (exec.count == 6 ||
          (exec.count && (need_sync || block != block_idx))) {
         num_cf =
            write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
         need_alloc = false;
      }

      /* update block_addrs for jmp patching */
      while (block_idx < block)
         block_addr[++block_idx] = num_cf;

      /* export - fill alloc cf */
      if (!is_fetch && bc.alu.export_data) {
         /* get the export buffer from either vector/scalar dest */
         instr_alloc_type_t buffer = export_buf(bc.alu.vector_dest);
         if (bc.alu.scalar_write_mask) {
            if (bc.alu.vector_write_mask)
               assert(buffer == export_buf(bc.alu.scalar_dest));
            buffer = export_buf(bc.alu.scalar_dest);
         }

         /* flush previous alloc if the buffer changes */
         bool need_new_alloc = buffer != alloc.buffer_select;

         /* memory export always in 32/33 pair, new alloc on 32 */
         if (bc.alu.vector_dest == 32)
            need_new_alloc = true;

         if (need_new_alloc && exec.count) {
            num_cf =
               write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
            need_alloc = false;
         }

         need_alloc |= need_new_alloc;

         alloc.size = 0;
         alloc.buffer_select = buffer;

         if (buffer == SQ_PARAMETER_PIXEL &&
             ctx->so->type == MESA_SHADER_VERTEX)
            alloc.size = ctx->f->inputs_count - 1;

         if (buffer == SQ_POSITION)
            alloc.size = ctx->so->writes_psize;
      }

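      /* two serialize bits per exec slot: the low bit marks a fetch
       * instruction, the high bit requests a sync before it */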
      if (is_fetch)
         exec.serialize |= 0x1 << exec.count * 2;
      if (need_sync)
         exec.serialize |= 0x2 << exec.count * 2;

      need_sync = false;
      exec.count += 1;
      bytecode[i++] = bc;
   }

   /* final exec cf */
   exec.opc = EXEC_END;
   num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);

   /* insert nop to get an even # of CFs */
   if (num_cf % 2)
      cfs[num_cf++] = (instr_cf_t){.opc = NOP};

   /* patch cf addrs */
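   /* EXEC addresses were counted in instruction slots relative to the start
    * of the ALU/fetch section; the CF block in front of it occupies num_cf/2
    * such slots, so shift them by that amount.  COND_JMP targets are still
    * block indices and are resolved through block_addr[]. */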
   for (int idx = 0; idx < num_cf; idx++) {
      switch (cfs[idx].opc) {
      case NOP:
      case ALLOC:
         break;
      case EXEC:
      case EXEC_END:
         cfs[idx].exec.address += num_cf / 2;
         break;
      case COND_JMP:
         cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
         break;
      default:
         assert(0);
      }
   }

   /* concatenate cfs and alu/fetch */
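   /* two 48-bit CF instructions pack into three dwords, and each ALU/fetch
    * instruction is three dwords, so the totals below are in dwords */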
   uint32_t cfdwords = num_cf / 2 * 3;
   uint32_t alufetchdwords = exec.address * 3;
   uint32_t sizedwords = cfdwords + alufetchdwords;
   uint32_t *dwords = malloc(sizedwords * 4);
   assert(dwords);
   memcpy(dwords, cfs, cfdwords * 4);
   memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);

   /* finalize ir2_shader_info */
   ctx->info->dwords = dwords;
   ctx->info->sizedwords = sizedwords;
   for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
      ctx->info->fetch_info[i].offset += cfdwords;

   if (FD_DBG(DISASM)) {
      DBG("disassemble: type=%d", ctx->so->type);
      disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
   }
}