/*
 * Copyright © 2016 Rob Clark <[email protected]>
 * Copyright © 2018 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#define FD_BO_NO_HARDPIN 1

#include <initializer_list>

#include "pipe/p_state.h"
#include "util/bitset.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "freedreno_program.h"

#include "fd6_const.h"
#include "fd6_emit.h"
#include "fd6_pack.h"
#include "fd6_program.h"
#include "fd6_texture.h"

/**
 * Temporary program building state.
 */
struct program_builder {
   struct fd6_program_state *state;
   struct fd_context *ctx;
   const struct ir3_cache_key *key;
   const struct ir3_shader_variant *vs;
   const struct ir3_shader_variant *hs;
   const struct ir3_shader_variant *ds;
   const struct ir3_shader_variant *gs;
   const struct ir3_shader_variant *fs;
   const struct ir3_shader_variant *last_shader;
   bool binning_pass;
};

template <chip CHIP>
struct xs_config {
   uint16_t reg_sp_xs_instrlen;
   uint16_t reg_hlsq_xs_ctrl;
   uint16_t reg_sp_xs_first_exec_offset;
   uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
   uint16_t reg_sp_xs_vgpr_config;
};

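/* Table of per-stage register addresses, indexed by gl_shader_stage.  These
 * registers share a layout across stages; only their offsets differ (and,
 * for the HLSQ_xS_CNTL and the a7xx-only VGPR_CONFIG entries, the chip
 * generation).
 */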
template <chip CHIP>
static const struct xs_config<CHIP> xs_configs[] = {
   [MESA_SHADER_VERTEX] = {
      REG_A6XX_SP_VS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_VS_CNTL : REG_A7XX_HLSQ_VS_CNTL,
      REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_VS_VGPR_CONFIG,
   },
   [MESA_SHADER_TESS_CTRL] = {
      REG_A6XX_SP_HS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_HS_CNTL : REG_A7XX_HLSQ_HS_CNTL,
      REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_HS_VGPR_CONFIG,
   },
   [MESA_SHADER_TESS_EVAL] = {
      REG_A6XX_SP_DS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_DS_CNTL : REG_A7XX_HLSQ_DS_CNTL,
      REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_DS_VGPR_CONFIG,
   },
   [MESA_SHADER_GEOMETRY] = {
      REG_A6XX_SP_GS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_GS_CNTL : REG_A7XX_HLSQ_GS_CNTL,
      REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_GS_VGPR_CONFIG,
   },
   [MESA_SHADER_FRAGMENT] = {
      REG_A6XX_SP_FS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_FS_CNTL : REG_A7XX_HLSQ_FS_CNTL,
      REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_FS_VGPR_CONFIG,
   },
   [MESA_SHADER_COMPUTE] = {
      REG_A6XX_SP_CS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_CS_CNTL : REG_A7XX_HLSQ_CS_CNTL,
      REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_CS_VGPR_CONFIG,
   },
};

template <chip CHIP>
void
fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
                const struct ir3_shader_variant *so)
{
   if (!so) {
      /* shader stage disabled */
      return;
   }

#if MESA_DEBUG
   /* Name should generally match what you get with MESA_SHADER_CAPTURE_PATH: */
   const char *name = so->name;
   if (name)
      fd_emit_string5(ring, name, strlen(name));
#endif

   gl_shader_stage type = so->type;
   if (type == MESA_SHADER_KERNEL)
      type = MESA_SHADER_COMPUTE;

   enum a6xx_threadsize thrsz =
      so->info.double_threadsize ? THREAD128 : THREAD64;

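   /* Each stage's SP_xS_CTRL_REG0 describes the variant's register
    * footprint (highest used register index + 1, for both half and full
    * registers), its branch-stack depth, and flags like merged register
    * files and early preamble execution.
    */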
   switch (type) {
   case MESA_SHADER_VERTEX:
      OUT_REG(ring, A6XX_SP_VS_CTRL_REG0(
            .halfregfootprint = so->info.max_half_reg + 1,
            .fullregfootprint = so->info.max_reg + 1,
            .branchstack = ir3_shader_branchstack_hw(so),
            .mergedregs = so->mergedregs,
            .earlypreamble = so->early_preamble,
      ));
      break;
   case MESA_SHADER_TESS_CTRL:
      OUT_REG(ring, A6XX_SP_HS_CTRL_REG0(
            .halfregfootprint = so->info.max_half_reg + 1,
            .fullregfootprint = so->info.max_reg + 1,
            .branchstack = ir3_shader_branchstack_hw(so),
            .earlypreamble = so->early_preamble,
      ));
      break;
   case MESA_SHADER_TESS_EVAL:
      OUT_REG(ring, A6XX_SP_DS_CTRL_REG0(
            .halfregfootprint = so->info.max_half_reg + 1,
            .fullregfootprint = so->info.max_reg + 1,
            .branchstack = ir3_shader_branchstack_hw(so),
            .earlypreamble = so->early_preamble,
      ));
      break;
   case MESA_SHADER_GEOMETRY:
      OUT_REG(ring, A6XX_SP_GS_CTRL_REG0(
            .halfregfootprint = so->info.max_half_reg + 1,
            .fullregfootprint = so->info.max_reg + 1,
            .branchstack = ir3_shader_branchstack_hw(so),
            .earlypreamble = so->early_preamble,
      ));
      break;
   case MESA_SHADER_FRAGMENT:
      OUT_REG(ring, A6XX_SP_FS_CTRL_REG0(
            .halfregfootprint = so->info.max_half_reg + 1,
            .fullregfootprint = so->info.max_reg + 1,
            .branchstack = ir3_shader_branchstack_hw(so),
            .threadsize = thrsz,
            .varying = so->total_in != 0,
            .lodpixmask = so->need_full_quad,
            /* unknown bit, seems unnecessary */
            .unk24 = true,
            .pixlodenable = so->need_pixlod,
            .earlypreamble = so->early_preamble,
            .mergedregs = so->mergedregs,
      ));
      break;
   case MESA_SHADER_COMPUTE:
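      /* If the HW can't select the wave size, compute apparently always
       * runs at the doubled (128 fiber) threadsize, so override whatever
       * the compiler picked.
       */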
      thrsz = ctx->screen->info->a6xx.supports_double_threadsize ? thrsz
                                                                 : THREAD128;
      OUT_REG(ring, A6XX_SP_CS_CTRL_REG0(
            .halfregfootprint = so->info.max_half_reg + 1,
            .fullregfootprint = so->info.max_reg + 1,
            .branchstack = ir3_shader_branchstack_hw(so),
            .threadsize = thrsz,
            .earlypreamble = so->early_preamble,
            .mergedregs = so->mergedregs,
      ));
      break;
   default:
      unreachable("bad shader stage");
   }

   const struct xs_config<CHIP> *cfg = &xs_configs<CHIP>[type];

   OUT_PKT4(ring, cfg->reg_sp_xs_instrlen, 1);
   OUT_RING(ring, so->instrlen);

   /* emit program binary & private memory layout
    */

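   /* Private memory provides per-fiber scratch (e.g. for register spills).
    * It can be laid out per-wave or per-fiber; the context keeps one BO per
    * layout, sized by ir3_get_private_mem() to fit the largest shader seen
    * so far.
    */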
   ir3_get_private_mem(ctx, so);

   uint32_t per_sp_size = ctx->pvtmem[so->pvtmem_per_wave].per_sp_size;

   fd_ringbuffer_attach_bo(ring, so->bo);

   OUT_PKT4(ring, cfg->reg_sp_xs_first_exec_offset, 7);
   OUT_RING(ring, 0);                /* SP_xS_OBJ_FIRST_EXEC_OFFSET */
   OUT_RELOC(ring, so->bo, 0, 0, 0); /* SP_xS_OBJ_START_LO */
   OUT_RING(ring, A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(
                     ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size));
   if (so->pvtmem_size > 0) { /* SP_xS_PVT_MEM_ADDR */
      fd_ringbuffer_attach_bo(ring, ctx->pvtmem[so->pvtmem_per_wave].bo);
      OUT_RELOC(ring, ctx->pvtmem[so->pvtmem_per_wave].bo, 0, 0, 0);
   } else {
      OUT_RING(ring, 0);
      OUT_RING(ring, 0);
   }
   OUT_RING(ring, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) |
                  COND(so->pvtmem_per_wave,
                       A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));

   OUT_PKT4(ring, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
   OUT_RING(ring, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(per_sp_size));

   if (CHIP >= A7XX) {
      OUT_PKT4(ring, cfg->reg_sp_xs_vgpr_config, 1);
      OUT_RING(ring, 0);
   }

   if (CHIP == A6XX) {
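      /* Pre-load the program into the shader instruction cache, up to the
       * cache size.  On a7xx this CP_LOAD_STATE6 preload appears to be
       * unneeded (or handled differently), so it is a6xx-only.
       */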
      uint32_t shader_preload_size =
         MIN2(so->instrlen, ctx->screen->info->a6xx.instr_cache_size);

      enum a6xx_state_block sb = fd6_stage2shadersb(so->type);
      OUT_PKT7(ring, fd6_stage2opcode(so->type), 3);
      OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                     CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                     CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
                     CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
      OUT_RELOC(ring, so->bo, 0, 0, 0);
   }

   fd6_emit_immediates(so, ring);
}
FD_GENX(fd6_emit_shader);

/**
 * Build a pre-baked state-obj to disable SO, so that we aren't dynamically
 * building this at draw time whenever we transition from SO enabled->disabled
 */
static void
setup_stream_out_disable(struct fd_context *ctx)
{
   unsigned sizedw = 4;

   if (ctx->screen->info->a6xx.tess_use_shared)
      sizedw += 2;

   struct fd_ringbuffer *ring =
      fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4);

   OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw);
   OUT_RING(ring, REG_A6XX_VPC_SO_CNTL);
   OUT_RING(ring, 0);
   OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL);
   OUT_RING(ring, 0);

   if (ctx->screen->info->a6xx.tess_use_shared) {
      OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL);
      OUT_RING(ring, 0);
   }

   fd6_context(ctx)->streamout_disable_stateobj = ring;
}

static void
setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state,
                 const struct ir3_shader_variant *v,
                 struct ir3_shader_linkage *l)
{
   const struct ir3_stream_output_info *strmout = &v->stream_output;

   /* Note: 64 here comes from the HW layout of the program RAM.  The
    * program for stream N is at DWORD 64 * N.
    */
#define A6XX_SO_PROG_DWORDS 64
   uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
   BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};

   memset(prog, 0, sizeof(prog));

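   /* Each VPC_SO_PROG dword encodes up to two component copies: the A half
    * for even varying locations and the B half for odd ones.  Each copy
    * names the SO buffer to write and a byte offset (hence off * 4).
    */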
   for (unsigned i = 0; i < strmout->num_outputs; i++) {
      const struct ir3_stream_output *out = &strmout->output[i];
      unsigned k = out->register_index;
      unsigned idx;

      /* The linkage map is sorted in the order the frag shader wants
       * things, so finding the matching slot takes a linear search,
       * which is a bit less than ideal:
       */
      for (idx = 0; idx < l->cnt; idx++)
         if (l->var[idx].slot == v->outputs[k].slot)
            break;

      assert(idx < l->cnt);

      for (unsigned j = 0; j < out->num_components; j++) {
         unsigned c = j + out->start_component;
         unsigned loc = l->var[idx].loc + c;
         unsigned off = j + out->dst_offset; /* in dwords */

         unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc / 2;
         if (loc & 1) {
            prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
                           A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
                           A6XX_VPC_SO_PROG_B_OFF(off * 4);
         } else {
            prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
                           A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
                           A6XX_VPC_SO_PROG_A_OFF(off * 4);
         }
         BITSET_SET(valid_dwords, dword);
      }
   }

   unsigned prog_count = 0;
   unsigned start, end;
   BITSET_FOREACH_RANGE (start, end, valid_dwords,
                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
      prog_count += end - start + 1;
   }

   const bool emit_pc_so_stream_cntl =
      ctx->screen->info->a6xx.tess_use_shared &&
      v->type == MESA_SHADER_TESS_EVAL;

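   /* Size the reg bunch: 10 fixed dwords (VPC_SO_STREAM_CNTL plus the four
    * buffer strides, at two dwords per reg write), then two dwords per
    * prog_count entry.  Each valid range needs one VPC_SO_CNTL write to set
    * the upload address plus one VPC_SO_PROG write per dword, which is why
    * prog_count added one per range above.
    */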
   unsigned sizedw = 10 + (2 * prog_count);
   if (emit_pc_so_stream_cntl)
      sizedw += 2;

   struct fd_ringbuffer *ring =
      fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4);

   OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw);
   OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL);
   OUT_RING(ring,
            A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(strmout->streams_written) |
            COND(strmout->stride[0] > 0,
                 A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + strmout->output[0].stream)) |
            COND(strmout->stride[1] > 0,
                 A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + strmout->output[1].stream)) |
            COND(strmout->stride[2] > 0,
                 A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + strmout->output[2].stream)) |
            COND(strmout->stride[3] > 0,
                 A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + strmout->output[3].stream)));
   OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(0));
   OUT_RING(ring, strmout->stride[0]);
   OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(1));
   OUT_RING(ring, strmout->stride[1]);
   OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(2));
   OUT_RING(ring, strmout->stride[2]);
   OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(3));
   OUT_RING(ring, strmout->stride[3]);

   bool first = true;
   BITSET_FOREACH_RANGE (start, end, valid_dwords,
                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
      OUT_RING(ring, REG_A6XX_VPC_SO_CNTL);
      OUT_RING(ring, COND(first, A6XX_VPC_SO_CNTL_RESET) |
                     A6XX_VPC_SO_CNTL_ADDR(start));
      for (unsigned i = start; i < end; i++) {
         OUT_RING(ring, REG_A6XX_VPC_SO_PROG);
         OUT_RING(ring, prog[i]);
      }
      first = false;
   }

   if (emit_pc_so_stream_cntl) {
      /* Possibly not tess_use_shared related, but the combination of
       * tess + xfb fails some tests if we don't emit this.
       */
      OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL);
      OUT_RING(ring, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE(0x1));
   }

   state->streamout_stateobj = ring;
}

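/* The SP_xS_CONFIG registers share a layout across stages, so the VS field
 * names are used for every stage here.  NTEX and NSAMP are both programmed
 * from num_samp, presumably because textures and samplers are paired in
 * this driver.
 */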
static uint32_t
sp_xs_config(const struct ir3_shader_variant *v)
{
   if (!v)
      return 0;

   return A6XX_SP_VS_CONFIG_ENABLED |
          COND(v->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
          COND(v->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
          COND(v->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
          COND(v->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
          A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(v)) |
          A6XX_SP_VS_CONFIG_NTEX(v->num_samp) |
          A6XX_SP_VS_CONFIG_NSAMP(v->num_samp);
}

template <chip CHIP>
static void
setup_config_stateobj(struct fd_context *ctx, struct fd6_program_state *state)
{
   struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 100 * 4);

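   /* Invalidate any cached shader state and IBO bindings up front, since
    * this stateobj reprograms each stage's config from scratch.
    */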
   OUT_REG(ring, HLSQ_INVALIDATE_CMD(CHIP, .vs_state = true, .hs_state = true,
                                     .ds_state = true, .gs_state = true,
                                     .fs_state = true, .cs_state = true,
                                     .cs_ibo = true, .gfx_ibo = true, ));

   assert(state->vs->constlen >= state->bs->constlen);

   OUT_REG(ring, HLSQ_VS_CNTL(
         CHIP,
         .constlen = state->vs->constlen,
         .enabled = true,
   ));
   OUT_REG(ring, HLSQ_HS_CNTL(
         CHIP,
         .constlen = COND(state->hs, state->hs->constlen),
         .enabled = COND(state->hs, true),
   ));
   OUT_REG(ring, HLSQ_DS_CNTL(
         CHIP,
         .constlen = COND(state->ds, state->ds->constlen),
         .enabled = COND(state->ds, true),
   ));
   OUT_REG(ring, HLSQ_GS_CNTL(
         CHIP,
         .constlen = COND(state->gs, state->gs->constlen),
         .enabled = COND(state->gs, true),
   ));
   OUT_REG(ring, HLSQ_FS_CNTL(
         CHIP,
         .constlen = state->fs->constlen,
         .enabled = true,
   ));

   OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->vs));

   OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->hs));

   OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->ds));

   OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->gs));

   OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->fs));

   OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1);
   OUT_RING(ring, ir3_shader_nibo(state->fs));

   state->config_stateobj = ring;
}

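/* Advance a register id by "increment" components, propagating the invalid
 * register encoding (r63.x) if the input is already invalid.
 */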
static inline uint32_t
next_regid(uint32_t reg, uint32_t increment)
{
   if (VALIDREG(reg))
      return reg + increment;
   else
      return regid(63, 0);
}

static void
fd6_emit_tess_bos(struct fd_screen *screen, struct fd_ringbuffer *ring,
                  const struct ir3_shader_variant *s) assert_dt
{
   const struct ir3_const_state *const_state = ir3_const_state(s);
   const unsigned regid = const_state->offsets.primitive_param + 1;
   uint32_t dwords = 8;

   if (regid >= s->constlen)
      return;

   fd_ringbuffer_attach_bo(ring, screen->tess_bo);

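   /* Load two addresses into the shared tess BO as constants: the tess
    * param area (which starts FD6_TESS_FACTOR_SIZE bytes in) and the tess
    * factor base itself.
    */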
   OUT_PKT7(ring, fd6_stage2opcode(s->type), 7);
   OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(s->type)) |
                  CP_LOAD_STATE6_0_NUM_UNIT(dwords / 4));
   OUT_RING(ring, 0);
   OUT_RING(ring, 0);
   OUT_RELOC(ring, screen->tess_bo, FD6_TESS_FACTOR_SIZE, 0, 0);
   OUT_RELOC(ring, screen->tess_bo, 0, 0, 0);
}

static enum a6xx_tess_output
primitive_to_tess(enum mesa_prim primitive)
{
   switch (primitive) {
   case MESA_PRIM_POINTS:
      return TESS_POINTS;
   case MESA_PRIM_LINE_STRIP:
      return TESS_LINES;
   case MESA_PRIM_TRIANGLE_STRIP:
      return TESS_CW_TRIS;
   default:
      unreachable("");
   }
}

#define MAX_VERTEX_ATTRIBS 32

static void
emit_vfd_dest(struct fd_ringbuffer *ring, const struct ir3_shader_variant *vs)
{
   uint32_t attr_count = 0;

   for (uint32_t i = 0; i < vs->inputs_count; i++)
      if (!vs->inputs[i].sysval)
         attr_count++;

   OUT_REG(ring, A6XX_VFD_CONTROL_0(
         .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
         .decode_cnt = attr_count));

   if (attr_count)
      OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);

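   /* Non-sysval inputs are packed at the front of the inputs array, so the
    * first attr_count entries are exactly the vertex fetches (the assert
    * below double-checks this).
    */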
   for (uint32_t i = 0; i < attr_count; i++) {
      assert(!vs->inputs[i].sysval);
      OUT_RING(ring,
               A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) |
               A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid));
   }
}

static void
emit_vs_system_values(struct fd_ringbuffer *ring,
                      const struct program_builder *b)
{
   const uint32_t vertexid_regid =
      ir3_find_sysval_regid(b->vs, SYSTEM_VALUE_VERTEX_ID);
   const uint32_t instanceid_regid =
      ir3_find_sysval_regid(b->vs, SYSTEM_VALUE_INSTANCE_ID);
   const uint32_t tess_coord_x_regid =
      ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_TESS_COORD);
   const uint32_t tess_coord_y_regid = next_regid(tess_coord_x_regid, 1);
   const uint32_t hs_rel_patch_regid =
      ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_REL_PATCH_ID_IR3);
   const uint32_t ds_rel_patch_regid =
      ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_REL_PATCH_ID_IR3);
   const uint32_t hs_invocation_regid =
      ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_TCS_HEADER_IR3);
   const uint32_t gs_primitiveid_regid =
      ir3_find_sysval_regid(b->gs, SYSTEM_VALUE_PRIMITIVE_ID);
   const uint32_t vs_primitiveid_regid = b->hs ?
      ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_PRIMITIVE_ID) :
      gs_primitiveid_regid;
   const uint32_t ds_primitiveid_regid =
      ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_PRIMITIVE_ID);
   const uint32_t gsheader_regid =
      ir3_find_sysval_regid(b->gs, SYSTEM_VALUE_GS_HEADER_IR3);

   /* Note: we currently don't support multiview.
    */
   const uint32_t viewid_regid = INVALID_REG;

   OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6);
   OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
   OUT_RING(ring, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
                  A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
   OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
                  A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
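   /* 0xfc is the packed invalid regid (r63.x); the unused regid fields in
    * VFD_CONTROL_4/5 are filled with it.
    */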
   OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */
   OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
                  0xfc00); /* VFD_CONTROL_5 */
   OUT_RING(ring, COND(b->fs->reads_primid,
                       A6XX_VFD_CONTROL_6_PRIMID4PSEN)); /* VFD_CONTROL_6 */
}

template <chip CHIP>
static void
emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b)
{
   const struct ir3_shader_variant *last_shader = b->last_shader;

   /* note: doesn't compile as static because of the array regs.. */
   const struct reg_config {
      uint16_t reg_sp_xs_out_reg;
      uint16_t reg_sp_xs_vpc_dst_reg;
      uint16_t reg_vpc_xs_pack;
      uint16_t reg_vpc_xs_clip_cntl;
      uint16_t reg_gras_xs_cl_cntl;
      uint16_t reg_pc_xs_out_cntl;
      uint16_t reg_sp_xs_primitive_cntl;
      uint16_t reg_vpc_xs_layer_cntl;
      uint16_t reg_gras_xs_layer_cntl;
   } reg_config[] = {
      [MESA_SHADER_VERTEX] = {
         REG_A6XX_SP_VS_OUT_REG(0),
         REG_A6XX_SP_VS_VPC_DST_REG(0),
         REG_A6XX_VPC_VS_PACK,
         REG_A6XX_VPC_VS_CLIP_CNTL,
         REG_A6XX_GRAS_VS_CL_CNTL,
         REG_A6XX_PC_VS_OUT_CNTL,
         REG_A6XX_SP_VS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_VS_LAYER_CNTL,
         REG_A6XX_GRAS_VS_LAYER_CNTL
      },
      [MESA_SHADER_TESS_CTRL] = {
         0,
         0,
         0,
         0,
         0,
         REG_A6XX_PC_HS_OUT_CNTL,
         0,
         0,
         0
      },
      [MESA_SHADER_TESS_EVAL] = {
         REG_A6XX_SP_DS_OUT_REG(0),
         REG_A6XX_SP_DS_VPC_DST_REG(0),
         REG_A6XX_VPC_DS_PACK,
         REG_A6XX_VPC_DS_CLIP_CNTL,
         REG_A6XX_GRAS_DS_CL_CNTL,
         REG_A6XX_PC_DS_OUT_CNTL,
         REG_A6XX_SP_DS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_DS_LAYER_CNTL,
         REG_A6XX_GRAS_DS_LAYER_CNTL
      },
      [MESA_SHADER_GEOMETRY] = {
         REG_A6XX_SP_GS_OUT_REG(0),
         REG_A6XX_SP_GS_VPC_DST_REG(0),
         REG_A6XX_VPC_GS_PACK,
         REG_A6XX_VPC_GS_CLIP_CNTL,
         REG_A6XX_GRAS_GS_CL_CNTL,
         REG_A6XX_PC_GS_OUT_CNTL,
         REG_A6XX_SP_GS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_GS_LAYER_CNTL,
         REG_A6XX_GRAS_GS_LAYER_CNTL
      },
   };
   const struct reg_config *cfg = &reg_config[b->last_shader->type];

   struct ir3_shader_linkage linkage = {
      .primid_loc = 0xff,
      .clip0_loc = 0xff,
      .clip1_loc = 0xff,
   };

   /* If we have streamout, link against the real FS, rather than the
    * dummy FS used for binning pass state, to ensure the OUTLOC's
    * match.  Depending on whether we end up doing sysmem or gmem,
    * the actual streamout could happen with either the binning pass
    * or draw pass program, but the same streamout stateobj is used
    * in either case:
    */
   bool do_streamout = (b->last_shader->stream_output.num_outputs > 0);
   ir3_link_shaders(&linkage, b->last_shader,
                    do_streamout ? b->state->fs : b->fs,
                    true);

   if (do_streamout)
      ir3_link_stream_out(&linkage, b->last_shader);

   emit_vs_system_values(ring, b);

   OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4);
   OUT_RING(ring, ~linkage.varmask[0]);
   OUT_RING(ring, ~linkage.varmask[1]);
   OUT_RING(ring, ~linkage.varmask[2]);
   OUT_RING(ring, ~linkage.varmask[3]);

   /* a6xx finds position/pointsize at the end */
   const uint32_t position_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_POS);
   const uint32_t pointsize_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
   const uint32_t layer_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
   const uint32_t view_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
   const uint32_t clip0_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
   const uint32_t clip1_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
   uint32_t flags_regid = b->gs ?
      ir3_find_output_regid(b->gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;

   uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff,
            view_loc = 0xff;

   /* XXX replace regid(63,0) with INVALID_REG */
   if (layer_regid != regid(63, 0)) {
      layer_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1,
                   linkage.max_loc);
   }

   if (view_regid != regid(63, 0)) {
      view_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1,
                   linkage.max_loc);
   }

   if (position_regid != regid(63, 0)) {
      position_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_POS, position_regid, 0xf,
                   linkage.max_loc);
   }

   if (pointsize_regid != regid(63, 0)) {
      pointsize_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1,
                   linkage.max_loc);
   }

   uint8_t clip_mask = last_shader->clip_mask,
           cull_mask = last_shader->cull_mask;
   uint8_t clip_cull_mask = clip_mask | cull_mask;

   clip_mask &= b->key->clip_plane_enable;

   /* Handle the case where clip/cull distances aren't read by the FS */
   uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
   if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
      clip0_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
                   clip_cull_mask & 0xf, linkage.max_loc);
   }
   if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
      clip1_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
                   clip_cull_mask >> 4, linkage.max_loc);
   }

   /* If we have stream-out, we use the full shader for binning
    * pass, rather than the optimized binning pass one, so that we
    * have all the varying outputs available for xfb.  So streamout
    * state should always be derived from the non-binning pass
    * program:
    */
   if (do_streamout && !b->binning_pass) {
      setup_stream_out(b->ctx, b->state, b->last_shader, &linkage);

      if (!fd6_context(b->ctx)->streamout_disable_stateobj)
         setup_stream_out_disable(b->ctx);
   }

   /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
    * at least when a DS is the last stage, so add a dummy output to keep it
    * happy if there aren't any.  We do this late in order to avoid emitting
    * any unused code and make sure that optimizations don't remove it.
    */
   if (linkage.cnt == 0)
      ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);

   /* map outputs of the last shader to VPC */
   assert(linkage.cnt <= 32);
   const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
   const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
   uint16_t sp_out[32] = {0};
   uint8_t sp_vpc_dst[32] = {0};
   for (uint32_t i = 0; i < linkage.cnt; i++) {
      sp_out[i] =
         A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
         A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
      sp_vpc_dst[i] =
         A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
   }

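   /* sp_out[] entries are 16 bits and sp_vpc_dst[] entries 8 bits, matching
    * the packed A/B (resp. OUTLOC0..3) fields of the hw registers, so
    * emitting the arrays as dwords packs two (resp. four) entries per reg.
    */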
   OUT_PKT4(ring, cfg->reg_sp_xs_out_reg, sp_out_count);
   OUT_BUF(ring, sp_out, sp_out_count);

   OUT_PKT4(ring, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
   OUT_BUF(ring, sp_vpc_dst, sp_vpc_dst_count);

   OUT_PKT4(ring, cfg->reg_vpc_xs_pack, 1);
   OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
                  A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
                  A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc));

   OUT_PKT4(ring, cfg->reg_vpc_xs_clip_cntl, 1);
   OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
                  A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
                  A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));

   OUT_PKT4(ring, cfg->reg_gras_xs_cl_cntl, 1);
   OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) |
                  A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask));

   const struct ir3_shader_variant *geom_stages[] = { b->vs, b->hs, b->ds,
                                                      b->gs };

   for (unsigned i = 0; i < ARRAY_SIZE(geom_stages); i++) {
      const struct ir3_shader_variant *shader = geom_stages[i];
      if (!shader)
         continue;

      bool primid = shader->type != MESA_SHADER_VERTEX &&
         VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));

      OUT_PKT4(ring, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
      if (shader == last_shader) {
         OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
                        CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
                        CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
                        CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
                        COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
                        COND(primid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) |
                        A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
      } else {
         OUT_RING(ring, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
      }
   }

   /* if vertex_flags somehow gets optimized out, you're going to have a
    * bad time: */
   assert(flags_regid != INVALID_REG);

   OUT_PKT4(ring, cfg->reg_sp_xs_primitive_cntl, 1);
   OUT_RING(ring, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
                  A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));

   OUT_PKT4(ring, cfg->reg_vpc_xs_layer_cntl, 1);
   OUT_RING(ring, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
                  A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc));

   OUT_PKT4(ring, cfg->reg_gras_xs_layer_cntl, 1);
   OUT_RING(ring, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
                  CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));

   OUT_REG(ring, A6XX_PC_PS_CNTL(b->fs->reads_primid));

   if (CHIP >= A7XX) {
      OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2));
      OUT_REG(ring, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
   }

   OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1);
   OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(b->fs->total_in) |
                  COND(b->fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
                  A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
                  A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));

   if (b->hs) {
      OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
      OUT_RING(ring, b->hs->tess.tcs_vertices_out);

      fd6_emit_link_map(b->vs, b->hs, ring);
      fd6_emit_link_map(b->hs, b->ds, ring);
   }

   if (b->gs) {
      uint32_t vertices_out, invocations, vec4_size;
      uint32_t prev_stage_output_size =
         b->ds ? b->ds->output_size : b->vs->output_size;

      if (b->hs) {
         fd6_emit_link_map(b->ds, b->gs, ring);
      } else {
         fd6_emit_link_map(b->vs, b->gs, ring);
      }
      vertices_out = MAX2(1, b->gs->gs.vertices_out) - 1;
      enum a6xx_tess_output output =
         primitive_to_tess((enum mesa_prim)b->gs->gs.output_primitive);
      invocations = b->gs->gs.invocations - 1;
      /* Size of the per-primitive allocation in ldlw memory, in vec4s. */
      vec4_size = b->gs->gs.vertices_in *
         DIV_ROUND_UP(prev_stage_output_size, 4);

      OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
      OUT_RING(ring,
               A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
               A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
               A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));

      if (CHIP >= A7XX) {
         OUT_REG(ring,
                 A7XX_VPC_PRIMITIVE_CNTL_5(
                       .gs_vertices_out = vertices_out,
                       .gs_invocations = invocations,
                       .gs_output = output,
                 )
         );
      } else {
         OUT_PKT4(ring, REG_A6XX_VPC_GS_PARAM, 1);
         OUT_RING(ring, 0xff);
      }

      OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
      OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));

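      /* SP_GS_PRIM_SIZE encoding is odd around the 64 boundary: sizes above
       * 64 are clamped to 64 while exactly 64 is written as 63, presumably
       * due to how the hw field saturates.
       */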
      uint32_t prim_size = prev_stage_output_size;
      if (prim_size > 64)
         prim_size = 64;
      else if (prim_size == 64)
         prim_size = 63;

      OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1);
      OUT_RING(ring, prim_size);
   }
}

static enum a6xx_tex_prefetch_cmd
tex_opc_to_prefetch_cmd(opc_t tex_opc)
{
   switch (tex_opc) {
   case OPC_SAM:
      return TEX_PREFETCH_SAM;
   default:
      unreachable("Unknown tex opc for prefetch cmd");
   }
}

template <chip CHIP>
static void
emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b)
{
   const struct ir3_shader_variant *fs = b->fs;
   uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
   uint32_t ij_regid[IJ_COUNT];
   uint32_t smask_in_regid;

   bool sample_shading = fs->per_samp | fs->key.sample_shading;
   bool enable_varyings = fs->total_in > 0;

   samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
   smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
   face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
   coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
   zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
   for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
      ij_regid[i] =
         ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);

   if (fs->num_sampler_prefetch > 0) {
      /* It seems like ij_pix is *required* to be r0.x */
      assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
             ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
   }

   OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
   OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
                  COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID(0x1ff)) |
                  COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID4COORD(0x1ff)) |
                  COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]),
                       A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) |
                  COND(fs->prefetch_end_of_quad,
                       A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD));
   for (int i = 0; i < fs->num_sampler_prefetch; i++) {
      const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
      OUT_RING(ring, SP_FS_PREFETCH_CMD(
            CHIP, i,
            .src = prefetch->src,
            /* For a7xx, samp_id/tex_id is always in SP_FS_BINDLESS_PREFETCH_CMD[n]
             * even in the non-bindless case (which probably makes the reg name
             * wrong)
             */
            .samp_id = (CHIP == A6XX) ? prefetch->samp_id : 0,
            .tex_id = (CHIP == A6XX) ? prefetch->tex_id : 0,
            .dst = prefetch->dst,
            .wrmask = prefetch->wrmask,
            .half = prefetch->half_precision,
            .bindless = prefetch->bindless,
            .cmd = tex_opc_to_prefetch_cmd(prefetch->tex_opc),
      ).value
      );
   }

   if (CHIP == A7XX) {
      for (int i = 0; i < fs->num_sampler_prefetch; i++) {
         const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
         OUT_REG(ring,
                 A6XX_SP_FS_BINDLESS_PREFETCH_CMD(i,
                       .samp_id = prefetch->samp_id,
                       .tex_id = prefetch->tex_id,
                 )
         );
      }
   }

   OUT_REG(ring,
           HLSQ_CONTROL_1_REG(CHIP,
                              b->ctx->screen->info->a6xx.prim_alloc_threshold),
           HLSQ_CONTROL_2_REG(
                 CHIP,
                 .faceregid = face_regid,
                 .sampleid = samp_id_regid,
                 .samplemask = smask_in_regid,
                 .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW],
           ),
           HLSQ_CONTROL_3_REG(
                 CHIP,
                 .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL],
                 .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL],
                 .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID],
                 .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID],
           ),
           HLSQ_CONTROL_4_REG(
                 CHIP,
                 .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE],
                 .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE],
                 .xycoordregid = coord_regid,
                 .zwcoordregid = zwcoord_regid,
           ),
           HLSQ_CONTROL_5_REG(
                 CHIP,
                 .linelengthregid = INVALID_REG,
                 .foveationqualityregid = INVALID_REG,
           ),
   );

   if (CHIP >= A7XX) {
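      /* A7XX wants a count of the registers consumed by FS sysvals: each
       * barycentric pair takes two regs (center-rhw just one), fragcoord
       * xy and zw take two each, and face/sample-id/sample-mask one each.
       */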
      uint32_t sysval_regs = 0;
      for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
         if (VALIDREG(ij_regid[i])) {
            if (i == IJ_PERSP_CENTER_RHW)
               sysval_regs += 1;
            else
               sysval_regs += 2;
         }
      }

      for (uint32_t sysval : { face_regid, samp_id_regid, smask_in_regid }) {
         if (VALIDREG(sysval))
            sysval_regs += 1;
      }

      for (uint32_t sysval : { coord_regid, zwcoord_regid }) {
         if (VALIDREG(sysval))
            sysval_regs += 2;
      }

      OUT_REG(ring,
              A7XX_HLSQ_UNKNOWN_A9AE(
                    .sysval_regs_count = sysval_regs,
                    .unk8 = 1,
                    .unk9 = 1,
              )
      );
   }

   enum a6xx_threadsize thrsz =
      fs->info.double_threadsize ? THREAD128 : THREAD64;
   OUT_REG(ring,
           HLSQ_FS_CNTL_0(
                 CHIP,
                 .threadsize = thrsz,
                 .varyings = enable_varyings,
           ),
   );

   bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
   bool need_size_persamp = false;
   if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
      if (sample_shading)
         need_size_persamp = true;
      else
         need_size = true;
   }
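
   /* Reading faceness, fragcoord, or center-rhw apparently depends on the
    * linear-pixel (or, with sample shading, linear-sample) barycentric
    * setup, so need_size/need_size_persamp force those bits below even when
    * the shader doesn't read that barycentric itself.
    */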

   OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1);
   OUT_RING(ring,
            CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
            CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
            CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
            CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
            CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
            CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
            COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
            COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
            COND(fs->fragcoord_compmask != 0,
                 A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));

   OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2);
   OUT_RING(ring,
            CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
            CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
            CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
            CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
            CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
            CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
            COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
            COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
            COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
            COND(fs->fragcoord_compmask != 0,
                 A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
   OUT_RING(ring,
            A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
               sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
            CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
            CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
            CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
            COND(fs->post_depth_coverage, A6XX_RB_RENDER_CONTROL1_POSTDEPTHCOVERAGE) |
            COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1);
   OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));

   OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
   OUT_RING(ring,
            CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
            A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
               sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));

   OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
   OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
}

static void
emit_fs_outputs(struct fd_ringbuffer *ring, const struct program_builder *b)
{
   const struct ir3_shader_variant *fs = b->fs;
   uint32_t smask_regid, posz_regid, stencilref_regid;

   posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
   smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
   stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);

   /* we can't write gl_SampleMask for !msaa..  if b0 is zero then we
    * end up masking the single sample!!
    */
   if (!b->key->key.msaa)
      smask_regid = regid(63, 0);

   int output_reg_count = 0;
   uint32_t fragdata_regid[8];

   for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
      unsigned slot = fs->color0_mrt ? FRAG_RESULT_COLOR : FRAG_RESULT_DATA0 + i;
      fragdata_regid[i] = ir3_find_output_regid(fs, slot);
      if (VALIDREG(fragdata_regid[i]))
         output_reg_count = i + 1;
   }

   OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
   OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
                  A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
                  COND(fs->dual_src_blend,
                       A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));

   OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count);
   for (uint32_t i = 0; i < output_reg_count; i++) {
      OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
                     COND(fragdata_regid[i] & HALF_REG_ID,
                          A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));

      if (VALIDREG(fragdata_regid[i])) {
         b->state->mrt_components |= 0xf << (i * 4);
      }
   }
}

template <chip CHIP>
static void
setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b)
   assert_dt
{
   fd6_emit_shader<CHIP>(b->ctx, ring, b->vs);
   fd6_emit_shader<CHIP>(b->ctx, ring, b->hs);
   fd6_emit_shader<CHIP>(b->ctx, ring, b->ds);
   fd6_emit_shader<CHIP>(b->ctx, ring, b->gs);
   if (!b->binning_pass)
      fd6_emit_shader<CHIP>(b->ctx, ring, b->fs);

   OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
   OUT_RING(ring, 0);

   emit_vfd_dest(ring, b->vs);

   emit_vpc<CHIP>(ring, b);

   emit_fs_inputs<CHIP>(ring, b);
   emit_fs_outputs(ring, b);

   if (b->hs) {
      fd6_emit_tess_bos(b->ctx->screen, ring, b->hs);
      fd6_emit_tess_bos(b->ctx->screen, ring, b->ds);
   }

   if (b->hs) {
      uint32_t patch_control_points = b->key->patch_vertices;

      uint32_t patch_local_mem_size_16b =
         patch_control_points * b->vs->output_size / 4;

      /* Total attribute slots in HS incoming patch. */
      OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1);
      OUT_RING(ring, patch_local_mem_size_16b);

      const uint32_t wavesize = 64;
      const uint32_t vs_hs_local_mem_size = 16384;

      uint32_t max_patches_per_wave;
      if (b->ctx->screen->info->a6xx.tess_use_shared) {
         /* HS invocations for a patch are always within the same wave,
          * making barriers less expensive.  VS can't have barriers so we
          * don't care about VS invocations being in the same wave.
          */
         max_patches_per_wave = wavesize / b->hs->tess.tcs_vertices_out;
      } else {
         /* VS is also in the same wave */
         max_patches_per_wave =
            wavesize / MAX2(patch_control_points,
                            b->hs->tess.tcs_vertices_out);
      }

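      /* patches_per_wave is limited both by the 16 KB VS/HS local memory
       * (patch_local_mem_size_16b is in 16-byte units) and by how many
       * patches' worth of invocations fit in one wave.
       */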
      uint32_t patches_per_wave =
         MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
              max_patches_per_wave);

      uint32_t wave_input_size = DIV_ROUND_UP(
         patches_per_wave * patch_local_mem_size_16b * 16, 256);

      OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
      OUT_RING(ring, wave_input_size);

      enum a6xx_tess_output output;
      if (b->ds->tess.point_mode)
         output = TESS_POINTS;
      else if (b->ds->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES)
         output = TESS_LINES;
      else if (b->ds->tess.ccw)
         output = TESS_CCW_TRIS;
      else
         output = TESS_CW_TRIS;

      OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1);
      OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING(
                        fd6_gl2spacing(b->ds->tess.spacing)) |
                     A6XX_PC_TESS_CNTL_OUTPUT(output));
   }
}

static void emit_interp_state(struct fd_ringbuffer *ring,
                              const struct fd6_program_state *state,
                              bool rasterflat,
                              bool sprite_coord_mode,
                              uint32_t sprite_coord_enable);

static struct fd_ringbuffer *
create_interp_stateobj(struct fd_context *ctx, struct fd6_program_state *state)
{
   struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4);

   emit_interp_state(ring, state, false, false, 0);

   return ring;
}

/* build the program streaming state which is not part of the pre-
 * baked stateobj because of dependency on other gl state (rasterflat
 * or sprite-coord-replacement)
 */
struct fd_ringbuffer *
fd6_program_interp_state(struct fd6_emit *emit)
{
   const struct fd6_program_state *state = fd6_emit_get_prog(emit);

   if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) {
      /* fastpath: */
      return fd_ringbuffer_ref(state->interp_stateobj);
   } else {
      struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
         emit->ctx->batch->submit, 18 * 4, FD_RINGBUFFER_STREAMING);

      emit_interp_state(ring, state, emit->rasterflat,
                        emit->sprite_coord_mode, emit->sprite_coord_enable);

      return ring;
   }
}

static void
emit_interp_state(struct fd_ringbuffer *ring,
                  const struct fd6_program_state *state, bool rasterflat,
                  bool sprite_coord_mode, uint32_t sprite_coord_enable)
{
   const struct ir3_shader_variant *fs = state->fs;
   uint32_t vinterp[8], vpsrepl[8];

   memset(vinterp, 0, sizeof(vinterp));
   memset(vpsrepl, 0, sizeof(vpsrepl));

   for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count;) {

      /* NOTE: varyings are packed, so if compmask is 0xb
       * then first, third, and fourth component occupy
       * three consecutive varying slots:
       */
      unsigned compmask = fs->inputs[j].compmask;

      uint32_t inloc = fs->inputs[j].inloc;

      bool coord_mode = sprite_coord_mode;
      if (ir3_point_sprite(fs, j, sprite_coord_enable, &coord_mode)) {
         /* mask is two 2-bit fields, where:
          *   '01' -> S
          *   '10' -> T
          *   '11' -> 1 - T  (flip mode)
          */
         unsigned mask = coord_mode ? 0b1101 : 0b1001;
         uint32_t loc = inloc;
         if (compmask & 0x1) {
            vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x2) {
            vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x4) {
            /* .z <- 0.0f */
            vinterp[loc / 16] |= INTERP_ZERO << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x8) {
            /* .w <- 1.0f */
            vinterp[loc / 16] |= INTERP_ONE << ((loc % 16) * 2);
            loc++;
         }
      } else if (fs->inputs[j].slot == VARYING_SLOT_LAYER ||
                 fs->inputs[j].slot == VARYING_SLOT_VIEWPORT) {
         const struct ir3_shader_variant *last_shader = fd6_last_shader(state);
         uint32_t loc = inloc;

         /* If the last geometry shader doesn't statically write these,
          * they're implicitly zero and the FS is supposed to read zero.
          */
         if (ir3_find_output(last_shader, (gl_varying_slot)fs->inputs[j].slot) < 0 &&
             (compmask & 0x1)) {
            vinterp[loc / 16] |= INTERP_ZERO << ((loc % 16) * 2);
         } else {
            vinterp[loc / 16] |= INTERP_FLAT << ((loc % 16) * 2);
         }
      } else if (fs->inputs[j].flat ||
                 (fs->inputs[j].rasterflat && rasterflat)) {
         uint32_t loc = inloc;

         for (int i = 0; i < 4; i++) {
            if (compmask & (1 << i)) {
               vinterp[loc / 16] |= INTERP_FLAT << ((loc % 16) * 2);
               loc++;
            }
         }
      }
   }

   OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
   for (int i = 0; i < 8; i++)
      OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */

   OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
   for (int i = 0; i < 8; i++)
      OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */
}

template <chip CHIP>
static struct ir3_program_state *
fd6_program_create(void *data, const struct ir3_shader_variant *bs,
                   const struct ir3_shader_variant *vs,
                   const struct ir3_shader_variant *hs,
                   const struct ir3_shader_variant *ds,
                   const struct ir3_shader_variant *gs,
                   const struct ir3_shader_variant *fs,
                   const struct ir3_cache_key *key) in_dt
{
   struct fd_context *ctx = fd_context((struct pipe_context *)data);
   struct fd_screen *screen = ctx->screen;
   struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state);

   tc_assert_driver_thread(ctx->tc);

   /* if we have streamout, use full VS in binning pass, as the
    * binning pass VS will have outputs on other than position/psize
    * stripped out:
    */
   state->bs = vs->stream_output.num_outputs ? vs : bs;
   state->vs = vs;
   state->hs = hs;
   state->ds = ds;
   state->gs = gs;
   state->fs = fs;
   state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
   state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);

   if (hs) {
      /* Allocate the fixed-size tess factor BO globally on the screen.  This
       * lets the program (which ideally we would have shared across contexts,
       * though the current ir3_cache impl doesn't do that) bake in the
       * addresses.
       */
      fd_screen_lock(screen);
      if (!screen->tess_bo)
         screen->tess_bo =
            fd_bo_new(screen->dev, FD6_TESS_BO_SIZE, FD_BO_NOMAP, "tessfactor");
      fd_screen_unlock(screen);
   }

   /* Dummy frag shader used for binning pass: */
   static const struct ir3_shader_variant dummy_fs = {
      .info = {
         .max_reg = -1,
         .max_half_reg = -1,
         .max_const = -1,
      },
   };
   /* The last geometry stage in use: */
   const struct ir3_shader_variant *last_shader = fd6_last_shader(state);

   setup_config_stateobj<CHIP>(ctx, state);

   struct program_builder b = {
      .state = state,
      .ctx = ctx,
      .key = key,
      .hs = state->hs,
      .ds = state->ds,
      .gs = state->gs,
   };

   /*
    * Setup binning pass program state:
    */

   /* binning VS is wrong when GS is present, so use nonbinning VS
    * TODO: compile both binning VS/GS variants correctly
    *
    * If we have stream-out, we use the full shader for binning
    * pass, rather than the optimized binning pass one, so that we
    * have all the varying outputs available for xfb.  So streamout
    * state should always be derived from the non-binning pass
    * program.
    */
   b.vs = state->gs || last_shader->stream_output.num_outputs ?
      state->vs : state->bs;
   b.fs = &dummy_fs;
   b.last_shader = last_shader->type != MESA_SHADER_VERTEX ?
      last_shader : state->bs;
   b.binning_pass = true;

   setup_stateobj<CHIP>(state->binning_stateobj, &b);

   /*
    * Setup draw pass program state:
    */
   b.vs = state->vs;
   b.fs = state->fs;
   b.last_shader = last_shader;
   b.binning_pass = false;

   setup_stateobj<CHIP>(state->stateobj, &b);

   state->interp_stateobj = create_interp_stateobj(ctx, state);

   const struct ir3_stream_output_info *stream_output =
      &last_shader->stream_output;
   if (stream_output->num_outputs > 0)
      state->stream_output = stream_output;

   bool has_viewport =
      VALIDREG(ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT));
   state->num_viewports = has_viewport ? PIPE_MAX_VIEWPORTS : 1;

   /* Note that binning pass uses same const state as draw pass: */
   state->user_consts_cmdstream_size =
      fd6_user_consts_cmdstream_size(state->vs) +
      fd6_user_consts_cmdstream_size(state->hs) +
      fd6_user_consts_cmdstream_size(state->ds) +
      fd6_user_consts_cmdstream_size(state->gs) +
      fd6_user_consts_cmdstream_size(state->fs);

   unsigned num_dp = 0;
   if (vs->need_driver_params)
      num_dp++;
   if (gs && gs->need_driver_params)
      num_dp++;
   if (hs && hs->need_driver_params)
      num_dp++;
   if (ds && ds->need_driver_params)
      num_dp++;

   state->num_driver_params = num_dp;

   /* dual source blending has an extra fs output in the 2nd slot */
   if (fs->fs.color_is_dual_source) {
      state->mrt_components |= 0xf << 4;
   }

   state->lrz_mask.val = ~0;

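   /* Discard (kill) means LRZ can't be written: a fragment that passes the
    * early Z test may still be discarded, so its depth must not land in
    * the LRZ buffer.
    */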
   if (fs->has_kill) {
      state->lrz_mask.write = false;
   }

   if (fs->no_earlyz || fs->writes_pos) {
      state->lrz_mask.enable = false;
      state->lrz_mask.write = false;
      state->lrz_mask.test = false;
   }

   if (fs->fs.early_fragment_tests) {
      state->lrz_mask.z_mode = A6XX_EARLY_Z;
   } else if (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref) {
      state->lrz_mask.z_mode = A6XX_LATE_Z;
   } else {
      /* Wildcard indicates that we need to figure out at draw time: */
      state->lrz_mask.z_mode = A6XX_INVALID_ZTEST;
   }

   return &state->base;
}

static void
fd6_program_destroy(void *data, struct ir3_program_state *state)
{
   struct fd6_program_state *so = fd6_program_state(state);
   fd_ringbuffer_del(so->stateobj);
   fd_ringbuffer_del(so->binning_stateobj);
   fd_ringbuffer_del(so->config_stateobj);
   fd_ringbuffer_del(so->interp_stateobj);
   if (so->streamout_stateobj)
      fd_ringbuffer_del(so->streamout_stateobj);
   free(so);
}

template <chip CHIP>
static const struct ir3_cache_funcs cache_funcs = {
   .create_state = fd6_program_create<CHIP>,
   .destroy_state = fd6_program_destroy,
};

template <chip CHIP>
void
fd6_prog_init(struct pipe_context *pctx)
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->shader_cache = ir3_cache_create(&cache_funcs<CHIP>, ctx);

   ir3_prog_init(pctx);

   fd_prog_init(pctx);
}
FD_GENX(fd6_prog_init);