/*
 * Copyright © 2018 Jonathan Marek <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "ir2_private.h"

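/* instruction scheduling for ir2 (a2xx): each scheduling slot holds either a
 * single fetch instruction or a vector ALU instruction optionally co-issued
 * with a scalar ALU instruction; the passes below pick compatible pairs and
 * drive register allocation.
 */
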
static bool
scalar_possible(struct ir2_instr *instr)
{
   if (instr->alu.scalar_opc == SCALAR_NONE)
      return false;

   return src_ncomp(instr) == 1;
}

static bool
is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
   if (!a)
      return true;

   /* don't use the same instruction twice */
   if (a == b)
      return false;

   /* PRED_SET must be alone */
   if (b->alu.scalar_opc >= PRED_SETEs &&
       b->alu.scalar_opc <= PRED_SET_RESTOREs)
      return false;

   /* must write to the same export (issues otherwise?) */
   return a->alu.export == b->alu.export;
}

/* priority of vector instruction for scheduling (lower=higher prio) */
static unsigned
alu_vector_prio(struct ir2_instr *instr)
{
   if (instr->alu.vector_opc == VECTOR_NONE)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* TODO check src type and ncomps */
   if (instr->src_count == 3)
      return 0;

   if (!scalar_possible(instr))
      return 1;

   return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower=higher prio) */
static unsigned
alu_scalar_prio(struct ir2_instr *instr)
{
   if (!scalar_possible(instr))
      return ~0u;

   /* this case is dealt with later */
   if (instr->src_count > 1)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* PRED goes to the end of the block */
   if (instr->alu.scalar_opc >= PRED_SETEs &&
       instr->alu.scalar_opc <= PRED_SET_RESTOREs)
      return 5;

   /* scalar-only instructions have the highest priority */
   return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where a scalar MOV can be inserted alongside
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr *
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
       struct ir2_src src1, unsigned *comp)
{
   struct ir2_sched_instr *sched = NULL, *s;
   unsigned i, mask = 0xf;

   /* find the earliest point where the mov can be inserted */
   for (i = ctx->instr_sched_count - 1; i > 0; i--) {
      s = &ctx->instr_sched[i - 1];

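      /* stop at a block boundary, at the instruction that produces src1,
       * or once no component of the register is free across the range
       */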
      if (s->instr && s->instr->block_idx != block_idx)
         break;
      if (s->instr_s && s->instr_s->block_idx != block_idx)
         break;

      if (src1.type == IR2_SRC_SSA) {
         if ((s->instr && s->instr->idx == src1.num) ||
             (s->instr_s && s->instr_s->idx == src1.num))
            break;
      }

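      /* reg_state packs a 4-bit component mask per register, 8 registers per
       * word; mr is the set of components not yet used at this slot
       */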
      unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf);
      if ((mask & mr) == 0)
         break;

      mask &= mr;
      if (s->instr_s || s->instr->src_count == 3)
         continue;

      if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
         continue;

      sched = s;
   }
   *comp = ffs(mask) - 1;

   if (sched) {
      /* mark the chosen component as used in every slot from the
       * insertion point to the end
       */
      for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
         s->reg_state[reg_idx / 8] |= 1 << (*comp + reg_idx % 8 * 4);
   }

   return sched;
}

/* case 1:
 * insert a mov to place the 2nd src into the same reg as the 1st
 * (scalar sources must come from the same register)
 *
 * this is a common case which works when one of the srcs is an input/const,
 * but for instrs which have two ssa/reg srcs it is not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
   struct ir2_src src0 = instr->src[order];
   struct ir2_src src1 = instr->src[!order];
   struct ir2_sched_instr *sched;
   struct ir2_instr *ins;
   struct ir2_reg *reg;
   unsigned idx, comp;

   switch (src0.type) {
   case IR2_SRC_CONST:
   case IR2_SRC_INPUT:
      return false;
   default:
      break;
   }

   /* TODO: insert() needs logic for this */
   if (src1.type == IR2_SRC_REG)
      return false;

   /* we could do something if they match src1.. */
   if (src0.negate || src0.abs)
      return false;

   reg = get_reg_src(ctx, &src0);

   /* check that the result is not used elsewhere, since we will overwrite it */
   for (int i = 0; i < 4; i++)
      if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
         return false;

   /* find a place to insert the mov */
   sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
   if (!sched)
      return false;

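   /* build the scalar mov (MAXs is used as the mov opcode here) that copies
    * src1 into the free component of src0's register
    */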
   ins = &ctx->instr[idx = ctx->instr_count++];
   ins->idx = idx;
   ins->type = IR2_ALU;
   ins->src[0] = src1;
   ins->src_count = 1;
   ins->is_ssa = true;
   ins->ssa.idx = reg->idx;
   ins->ssa.ncomp = 1;
   ins->ssa.comp[0].c = comp;
   ins->alu.scalar_opc = MAXs;
   ins->alu.export = -1;
   ins->alu.write_mask = 1;
   ins->pred = instr->pred;
   ins->block_idx = instr->block_idx;

   instr->src[0] = src0;
   instr->alu.src1_swizzle = comp;

   sched->instr_s = ins;
   return true;
}

/* fill sched with next fetch or (vector and/or scalar) alu instruction */
static int
sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
   struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
   unsigned avail_count = 0;

   instr_alloc_type_t export = ~0u;
   int block_idx = -1;

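   /* exports to different buffers cannot be mixed in one pass, so find the
    * lowest export buffer that still has instructions pending
    */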
   /* XXX merge this loop with the other one somehow? */
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;
      if (is_export(instr))
         export = MIN2(export, export_buf(instr->alu.export));
   }

   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;

      /* don't mix exports */
      if (is_export(instr) && export_buf(instr->alu.export) != export)
         continue;

      if (block_idx < 0)
         block_idx = instr->block_idx;
      else if (block_idx != instr->block_idx || /* must be same block */
               instr->type == IR2_CF ||         /* CF/MEM must be alone */
               (is_export(instr) && export == SQ_MEMORY))
         break;
      /* this works because IR2_CF is always at the end of a block, and the
       * same idea mostly holds for MEM exports, which might not be alone
       * but will at least end up in order
       */

      /* check if dependencies are satisfied */
      bool is_ok = true;
      ir2_foreach_src (src, instr) {
         if (src->type == IR2_SRC_REG) {
            /* need to check that all previous instructions in the block
             * which write the reg have been emitted; this is slow..
             * XXX: check components instead of the whole register
             */
            struct ir2_reg *reg = get_reg_src(ctx, src);
            ir2_foreach_instr (p, ctx) {
               if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
                  is_ok &= !p->need_emit;
            }
         } else if (src->type == IR2_SRC_SSA) {
            /* in this case it's easy, just check need_emit */
            is_ok &= !ctx->instr[src->num].need_emit;
         }
      }
      /* don't reorder non-ssa write before read */
      if (!instr->is_ssa) {
         ir2_foreach_instr (p, ctx) {
            if (!p->need_emit || p->idx >= instr->idx)
               continue;

            ir2_foreach_src (src, p) {
               if (get_reg_src(ctx, src) == instr->reg)
                  is_ok = false;
            }
         }
      }
      /* don't reorder across predicates */
      if (avail_count && instr->pred != avail[0]->pred)
         is_ok = false;

      if (!is_ok)
         continue;

      avail[avail_count++] = instr;
   }

   if (!avail_count) {
      assert(block_idx == -1);
      return -1;
   }

   /* priority to FETCH instructions */
   ir2_foreach_avail (instr) {
      if (instr->type == IR2_ALU)
         continue;

      ra_src_free(ctx, instr);
      ra_reg(ctx, get_reg(instr), -1, false, 0);

      instr->need_emit = false;
      sched->instr = instr;
      sched->instr_s = NULL;
      return block_idx;
   }

   /* TODO precompute priorities */

   unsigned prio_v = ~0u, prio_s = ~0u, prio;
   ir2_foreach_avail (instr) {
      prio = alu_vector_prio(instr);
      if (prio < prio_v) {
         instr_v = instr;
         prio_v = prio;
      }
   }

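   /* pick the best scalar candidate to co-issue with the chosen vector
    * instruction; an incompatible instruction is only taken if its scalar
    * priority beats the vector pick, in which case the vector pick is dropped
    */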
   /* TODO: can still insert a scalar if src_count == 3, if smart about it */
   if (!instr_v || instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         bool compat = is_alu_compatible(instr_v, instr);

         prio = alu_scalar_prio(instr);
         if (prio >= prio_v && !compat)
            continue;

         if (prio < prio_s) {
            instr_s = instr;
            prio_s = prio;
            if (!compat)
               instr_v = NULL;
         }
      }
   }

   assert(instr_v || instr_s);

   /* now try the more complex insertion of a vector instruction as a scalar
    * TODO: if we are smart we can still insert if instr_v->src_count == 3
    */
   if (!instr_s && instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
            continue;

         /* at this point, src_count should always be 2 */
         assert(instr->src_count == 2);

         if (scalarize_case1(ctx, instr, 0)) {
            instr_s = instr;
            break;
         }
         if (scalarize_case1(ctx, instr, 1)) {
            instr_s = instr;
            break;
         }
      }
   }

   /* free src registers */
   if (instr_v) {
      instr_v->need_emit = false;
      ra_src_free(ctx, instr_v);
   }

   if (instr_s) {
      instr_s->need_emit = false;
      ra_src_free(ctx, instr_s);
   }

   /* allocate dst registers */
   if (instr_v)
      ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v),
             instr_v->alu.write_mask);

   if (instr_s)
      ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s),
             instr_s->alu.write_mask);

   sched->instr = instr_v;
   sched->instr_s = instr_s;
   return block_idx;
}

/* scheduling: determine order of instructions */
static void
schedule_instrs(struct ir2_context *ctx)
{
   struct ir2_sched_instr *sched;
   int block_idx;

   /* allocate input registers */
   for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
      if (ctx->input[idx].initialized)
         ra_reg(ctx, &ctx->input[idx], idx, false, 0);

   for (;;) {
      sched = &ctx->instr_sched[ctx->instr_sched_count++];
      block_idx = sched_next(ctx, sched);
      if (block_idx < 0)
         break;
      /* snapshot the register state for this slot; insert() looks at it when
       * searching for a free component
       */
      memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

      /* catch texture fetch after scheduling and insert the
       * SET_TEX_LOD right before it if necessary
       * TODO: clean this up
       */
      struct ir2_instr *instr = sched->instr, *tex_lod;
      if (instr && instr->type == IR2_FETCH && instr->fetch.opc == TEX_FETCH &&
          instr->src_count == 2) {
         /* generate the SET_LOD instruction */
         tex_lod = &ctx->instr[ctx->instr_count++];
         tex_lod->type = IR2_FETCH;
         tex_lod->block_idx = instr->block_idx;
         tex_lod->pred = instr->pred;
         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
         tex_lod->src[0] = instr->src[1];
         tex_lod->src_count = 1;

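         /* move the TEX_FETCH one slot later and put SET_TEX_LOD in its
          * place, so the LOD is set right before the fetch
          */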
         sched[1] = sched[0];
         sched->instr = tex_lod;
         ctx->instr_sched_count++;
      }

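      /* once a block has been fully scheduled, its registers can be freed */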
      bool free_block = true;
      ir2_foreach_instr (instr, ctx)
         free_block &= instr->block_idx != block_idx;
      if (free_block)
         ra_block_free(ctx, block_idx);
   }
   ctx->instr_sched_count--;
}

void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
            struct fd2_shader_stateobj *fp)
{
   struct ir2_context ctx = {};
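   /* a vertex shader compiled without a linked fragment shader is treated as
    * the binning pass variant
    */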
   bool binning = !fp && so->type == MESA_SHADER_VERTEX;

   if (fp)
      so->variant[variant].f = fp->variant[0].f;

   ctx.so = so;
   ctx.info = &so->variant[variant].info;
   ctx.f = &so->variant[variant].f;
   ctx.info->max_reg = -1;

   /* convert nir to the internal representation */
   ir2_nir_compile(&ctx, binning);

   /* copy propagate srcs */
   cp_src(&ctx);

   /* get ref_counts and kill unneeded instructions */
   ra_count_refs(&ctx);

   /* remove movs used to write outputs */
   cp_export(&ctx);

   /* instruction ordering.. and vector->scalar conversions */
   schedule_instrs(&ctx);

   /* finally, assemble to bitcode */
   assemble(&ctx, binning);
}