/*
 * Copyright © 2018 Jonathan Marek <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "ir2_private.h"

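/* Final assembly pass for the a2xx ir2 backend: fill_instr() packs each
 * scheduled slot into a 3-dword ALU or fetch instruction, and assemble()
 * emits the control-flow (CF) stream, patches addresses and concatenates
 * everything into the dword array exposed through ir2_shader_info.
 */
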
static unsigned
src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   struct ir2_reg_component *comps;
   unsigned swiz = 0;

   switch (src->type) {
   case IR2_SRC_SSA:
   case IR2_SRC_REG:
      break;
   default:
      return src->swizzle;
   }
   /* we need to take into account where the components were allocated */
   comps = get_reg_src(ctx, src)->comp;
   for (int i = 0; i < ncomp; i++) {
      swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
   }
   return swiz;
}

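/* A sketch of what the remap above does (hypothetical allocation): if an SSA
 * value's two components were allocated to the .z and .w channels of its
 * hardware register, a source swizzle selecting components 0 and 1 must be
 * rewritten to select .z and .w; comps[k].c gives the allocated channel for
 * component k.
 */
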
/* ALU instructions need to take into account how the output components
 * are allocated */

/* scalar doesn't need to take into account dest swizzle */

static unsigned
alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
{
   /* hardware seems to take from W, but swizzle everywhere just in case */
   return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
}

static unsigned
alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr,
            struct ir2_src *src)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
   unsigned swiz = 0;

   /* non per component special cases */
   switch (instr->alu.vector_opc) {
   case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
      return alu_swizzle_scalar(ctx, src);
   case DOT2ADDv:
   case DOT3v:
   case DOT4v:
   case CUBEv:
      return swiz0;
   default:
      break;
   }

   for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
      if (instr->alu.write_mask & 1 << j) {
         if (comp[j].c != 7)
            swiz |= swiz_set(i, comp[j].c);
         i++;
      }
   }
   return swiz_merge(swiz0, swiz);
}

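/* The remap loop above routes source slot i (the i-th enabled bit of the
 * virtual write mask) to the hardware channel comp[j].c that the destination
 * component was allocated to; swiz_merge() then composes this with the
 * source's own swizzle. Hypothetical example: if dest component 0 landed in
 * .y and component 1 in .x, channel .y reads slot 0 and channel .x reads
 * slot 1.
 */
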
static unsigned
alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
{
   /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
   unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
   return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
}

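/* Used for two-source scalar ops, which encode both operands in the src3
 * slot: the first operand's channel goes in slot 0 and the caller-provided
 * second channel (s1, taken from alu.src1_swizzle in fill_instr()) in
 * slot 1, then broadcast ABAB.
 */
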
/* write_mask needs to be transformed by allocation information */

static unsigned
alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned write_mask = 0;

   for (int i = 0; i < 4; i++) {
      if (instr->alu.write_mask & 1 << i)
         write_mask |= 1 << comp[i].c;
   }

   return write_mask;
}

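/* Hypothetical example: a two-component value with virtual write_mask 0b0011
 * whose components were allocated to .z and .w yields a hardware write_mask
 * of 0b1100.
 */
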
/* fetch instructions can swizzle dest, but src swizzle needs conversion */

static unsigned
fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
   unsigned swiz = 0;
   for (int i = 0; i < ncomp; i++)
      swiz |= swiz_get(alu_swiz, i) << i * 2;
   return swiz;
}

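/* The ALU-style swizzle from src_swizzle() is unpacked with swiz_get(),
 * which yields the absolute channel read by slot i, and repacked here as the
 * plain 2-bit-per-component selects that fetch instructions appear to use.
 */
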
static unsigned
fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned dst_swiz = 0xfff;
   for (int i = 0; i < dst_ncomp(instr); i++) {
      dst_swiz &= ~(7 << comp[i].c * 3);
      dst_swiz |= i << comp[i].c * 3;
   }
   return dst_swiz;
}

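/* The fetch dest swizzle holds a 3-bit select per destination channel; the
 * 0xfff default (all 7s) appears to leave channels unwritten, and each
 * allocated channel comp[i].c gets the index of the fetch result component
 * routed to it.
 */
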
/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
   if (is_export(instr))
      return instr->alu.export;

   return get_reg(instr)->idx;
}

/* register # for src */
static unsigned
src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
   return get_reg_src(ctx, src)->idx;
}

static unsigned
src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
   if (src->type == IR2_SRC_CONST) {
      assert(!src->abs); /* no abs bit for const */
      return src->num;
   }
   return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}

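/* For register operands the high bit (0x80) of the "reg byte" carries the
 * abs modifier; constant operands use the whole byte as the constant index,
 * which is why they cannot encode abs.
 */
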
/* produce the 12 byte binary instruction for a given sched_instr */
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, instr_t *bc,
           bool *is_fetch)
{
   struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

   *bc = (instr_t){};

   if (instr && instr->type == IR2_FETCH) {
      *is_fetch = true;

      bc->fetch.opc = instr->fetch.opc;
      bc->fetch.pred_select = !!instr->pred;
      bc->fetch.pred_condition = instr->pred & 1;

      struct ir2_src *src = instr->src;

      if (instr->fetch.opc == VTX_FETCH) {
         instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

         assert(instr->fetch.vtx.const_idx <= 0x1f);
         assert(instr->fetch.vtx.const_idx_sel <= 0x3);

         vtx->src_reg = src_to_reg(ctx, src);
         vtx->src_swiz = fetch_swizzle(ctx, src, 1);
         vtx->dst_reg = dst_to_reg(ctx, instr);
         vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

         vtx->must_be_one = 1;
         vtx->const_index = instr->fetch.vtx.const_idx;
         vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

         /* other fields will be patched */

         /* XXX seems like every FETCH but the first has
          * this bit set:
          */
         vtx->reserved3 = instr->idx ? 0x1 : 0x0;
         vtx->reserved0 = instr->idx ? 0x2 : 0x3;
      } else if (instr->fetch.opc == TEX_FETCH) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 3);
         tex->dst_reg = dst_to_reg(ctx, instr);
         tex->dst_swiz = fetch_dst_swiz(ctx, instr);
         /* tex->const_idx = patch_fetches */
         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
         tex->use_reg_lod = instr->src_count == 2;
         tex->sample_location = SAMPLE_CENTER;
         tex->tx_coord_denorm = instr->fetch.tex.is_rect;
      } else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 1);
         tex->dst_reg = 0;
         tex->dst_swiz = 0xfff;

         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = 1;
         tex->use_reg_lod = 0;
         tex->sample_location = SAMPLE_CENTER;
      } else {
         assert(0);
      }
      return;
   }

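   /* not a fetch: a single ALU instruction word holds a vector op plus an
    * optional co-issued scalar op; the scalar op's operands and destination
    * go in the src3/scalar_* fields filled in below
    */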
   instr_v = sched->instr;
   instr_s = sched->instr_s;

   if (instr_v) {
      struct ir2_src src1, src2, *src3;

      src1 = instr_v->src[0];
      /* when there is only one source, src2 reuses src[0] */
      src2 = instr_v->src[instr_v->src_count > 1];
      src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;

      bc->alu.vector_opc = instr_v->alu.vector_opc;
      bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
      bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
      bc->alu.vector_clamp = instr_v->alu.saturate;
      bc->alu.export_data = instr_v->alu.export >= 0;

      /* single operand SETEv, use 0.0f as src2 */
      if (instr_v->src_count == 1 &&
          (bc->alu.vector_opc == SETEv || bc->alu.vector_opc == SETNEv ||
           bc->alu.vector_opc == SETGTv || bc->alu.vector_opc == SETGTEv))
         src2 = ir2_zero(ctx);

      /* export32 instr for a20x hw binning has this bit set..
       * it seems to do more than change the base address of constants
       * XXX this is a hack
       */
      bc->alu.relative_addr =
         (bc->alu.export_data && bc->alu.vector_dest == 32);

      bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
      bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
      bc->alu.src1_reg_negate = src1.negate;
      bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

      bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
      bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
      bc->alu.src2_reg_negate = src2.negate;
      bc->alu.src2_sel = src2.type != IR2_SRC_CONST;

      if (src3) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
         bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
         bc->alu.src3_reg_negate = src3->negate;
         bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
      }

      bc->alu.pred_select = instr_v->pred;
   }

   if (instr_s) {
      struct ir2_src *src = instr_s->src;

      bc->alu.scalar_opc = instr_s->alu.scalar_opc;
      bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
      bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
      bc->alu.scalar_clamp = instr_s->alu.saturate;
      bc->alu.export_data = instr_s->alu.export >= 0;

      if (instr_s->src_count == 1) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      } else {
         assert(instr_s->src_count == 2);

         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz =
            alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      }

      if (instr_v)
         assert(instr_s->pred == instr_v->pred);
      bc->alu.pred_select = instr_s->pred;
   }

   *is_fetch = false;
   return;
}

static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
          instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
   assert(exec->count);

   if (alloc)
      cfs[cf_idx++].alloc = *alloc;

   /* record the memory export alloc offset for later patching */
   if (alloc && alloc->buffer_select == SQ_MEMORY &&
       ctx->info->mem_export_ptr == -1)
      ctx->info->mem_export_ptr = cf_idx / 2 * 3;

   cfs[cf_idx++].exec = *exec;
   exec->address += exec->count;
   exec->serialize = 0;
   exec->count = 0;

   return cf_idx;
}

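/* CF instructions are packed two per 3-dword group at the start of the
 * binary (see assemble() below), which is why dword offsets use
 * cf_idx / 2 * 3 and num_cf is padded to an even count.
 */
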
/* assemble the final shader */
void
assemble(struct ir2_context *ctx, bool binning)
{
   /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
    * address is 9 bits so could it be 512 ?
    */
   instr_cf_t cfs[384];
   instr_t bytecode[384], bc;
   unsigned block_addr[128];
   unsigned num_cf = 0;

   /* CF instr state */
   instr_cf_exec_t exec = {.opc = EXEC};
   instr_cf_alloc_t alloc = {.opc = ALLOC};

   int sync_id, sync_id_prev = -1;
   bool is_fetch = false;
   bool need_sync = true;
   bool need_alloc = false;
   unsigned block_idx = 0;

   ctx->info->mem_export_ptr = -1;
   ctx->info->num_fetch_instrs = 0;

   /* a vertex shader always needs to allocate at least one parameter,
    * even if it will never export any
    */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
      alloc.buffer_select = SQ_PARAMETER_PIXEL;
      cfs[num_cf++].alloc = alloc;
   }

   block_addr[0] = 0;

   for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
      struct ir2_instr *instr = ctx->instr_sched[j].instr;

      /* catch IR2_CF since it isn't a regular instruction */
      if (instr && instr->type == IR2_CF) {
         assert(!need_alloc); /* XXX */

         /* flush any exec cf before inserting jmp */
         if (exec.count)
            num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

         cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t){
            .opc = COND_JMP,
            .address = instr->cf.block_idx, /* will be fixed later */
            .force_call = !instr->pred,
            .predicated_jmp = 1,
            .direction = instr->cf.block_idx > instr->block_idx,
            .condition = instr->pred & 1,
         };
         continue;
      }

      /* fill the 3 dwords for the instruction */
      fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

      /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
      sync_id = 0;
      if (is_fetch)
         sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

      need_sync = sync_id != sync_id_prev;
      sync_id_prev = sync_id;

      unsigned block;
      if (ctx->instr_sched[j].instr)
         block = ctx->instr_sched[j].instr->block_idx;
      else
         block = ctx->instr_sched[j].instr_s->block_idx;

      assert(block_idx <= block);

      /* info for patching */
      if (is_fetch) {
         struct ir2_fetch_info *info =
            &ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
         info->offset = i * 3; /* add cf offset later */

         if (bc.fetch.opc == VTX_FETCH) {
            info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
         } else if (bc.fetch.opc == TEX_FETCH) {
            info->tex.samp_id = instr->fetch.tex.samp_id;
            info->tex.src_swiz = bc.fetch.tex.src_swiz;
         } else {
            ctx->info->num_fetch_instrs--;
         }
      }

      /* exec cf after 6 instr or when switching between fetch / alu */
      if (exec.count == 6 ||
          (exec.count && (need_sync || block != block_idx))) {
         num_cf =
            write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
         need_alloc = false;
      }

      /* update block_addrs for jmp patching */
      while (block_idx < block)
         block_addr[++block_idx] = num_cf;

      /* export - fill alloc cf */
      if (!is_fetch && bc.alu.export_data) {
         /* get the export buffer from either vector/scalar dest */
         instr_alloc_type_t buffer = export_buf(bc.alu.vector_dest);
         if (bc.alu.scalar_write_mask) {
            if (bc.alu.vector_write_mask)
               assert(buffer == export_buf(bc.alu.scalar_dest));
            buffer = export_buf(bc.alu.scalar_dest);
         }

         /* flush previous alloc if the buffer changes */
         bool need_new_alloc = buffer != alloc.buffer_select;

         /* memory export always in 32/33 pair, new alloc on 32 */
         if (bc.alu.vector_dest == 32)
            need_new_alloc = true;

         if (need_new_alloc && exec.count) {
            num_cf =
               write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
            need_alloc = false;
         }

         need_alloc |= need_new_alloc;

         alloc.size = 0;
         alloc.buffer_select = buffer;

         if (buffer == SQ_PARAMETER_PIXEL &&
             ctx->so->type == MESA_SHADER_VERTEX)
            alloc.size = ctx->f->inputs_count - 1;

         if (buffer == SQ_POSITION)
            alloc.size = ctx->so->writes_psize;
      }

      if (is_fetch)
         exec.serialize |= 0x1 << exec.count * 2;
      if (need_sync)
         exec.serialize |= 0x2 << exec.count * 2;

      need_sync = false;
      exec.count += 1;
      bytecode[i++] = bc;
   }

   /* final exec cf */
   exec.opc = EXEC_END;
   num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);

   /* insert nop to get an even # of CFs */
   if (num_cf % 2)
      cfs[num_cf++] = (instr_cf_t){.opc = NOP};

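   /* layout of the final binary: the CF stream comes first (two CFs per
    * 3-dword group), followed by the ALU/fetch instructions (3 dwords each);
    * exec.address counted instructions only, so EXEC addresses get biased by
    * num_cf / 2 here and fetch_info offsets by cfdwords below
    */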
   /* patch cf addrs */
   for (int idx = 0; idx < num_cf; idx++) {
      switch (cfs[idx].opc) {
      case NOP:
      case ALLOC:
         break;
      case EXEC:
      case EXEC_END:
         cfs[idx].exec.address += num_cf / 2;
         break;
      case COND_JMP:
         cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
         break;
      default:
         assert(0);
      }
   }

   /* concatenate cfs and alu/fetch */
   uint32_t cfdwords = num_cf / 2 * 3;
   uint32_t alufetchdwords = exec.address * 3;
   uint32_t sizedwords = cfdwords + alufetchdwords;
   uint32_t *dwords = malloc(sizedwords * 4);
   assert(dwords);
   memcpy(dwords, cfs, cfdwords * 4);
   memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);

   /* finalize ir2_shader_info */
   ctx->info->dwords = dwords;
   ctx->info->sizedwords = sizedwords;
   for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
      ctx->info->fetch_info[i].offset += cfdwords;

   if (FD_DBG(DISASM)) {
      DBG("disassemble: type=%d", ctx->so->type);
      disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
   }
}