1 /*
2 * Copyright © 2016 Rob Clark <[email protected]>
3 * Copyright © 2018 Google, Inc.
4 * SPDX-License-Identifier: MIT
5 *
6 * Authors:
7 * Rob Clark <[email protected]>
8 */
9
10 #define FD_BO_NO_HARDPIN 1
11
12 #include "pipe/p_state.h"
13 #include "util/format/u_format.h"
14 #include "util/u_helpers.h"
15 #include "util/u_memory.h"
16 #include "util/u_string.h"
17 #include "util/u_viewport.h"
18
19 #include "freedreno_query_hw.h"
20 #include "freedreno_resource.h"
21 #include "freedreno_state.h"
22 #include "freedreno_stompable_regs.h"
23 #include "freedreno_tracepoints.h"
24
25 #include "fd6_blend.h"
26 #include "fd6_const.h"
27 #include "fd6_context.h"
28 #include "fd6_compute.h"
29 #include "fd6_emit.h"
30 #include "fd6_image.h"
31 #include "fd6_pack.h"
32 #include "fd6_program.h"
33 #include "fd6_rasterizer.h"
34 #include "fd6_texture.h"
35 #include "fd6_zsa.h"
36
37 /* Helper to get tex stateobj.
38 */
39 static struct fd_ringbuffer *
40 tex_state(struct fd_context *ctx, enum pipe_shader_type type)
41 assert_dt
42 {
43 if (ctx->tex[type].num_textures == 0)
44 return NULL;
45
46 return fd_ringbuffer_ref(fd6_texture_state(ctx, type)->stateobj);
47 }
48
49 static struct fd_ringbuffer *
50 build_vbo_state(struct fd6_emit *emit) assert_dt
51 {
52 const struct fd_vertex_state *vtx = &emit->ctx->vtx;
53
54 const unsigned cnt = vtx->vertexbuf.count;
55 const unsigned dwords = cnt * 4; /* per vbo: reg64 + one reg32 + pkt hdr */
56
57 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
58 emit->ctx->batch->submit, 4 * dwords, FD_RINGBUFFER_STREAMING);
59
60 for (int32_t j = 0; j < cnt; j++) {
61 OUT_PKT4(ring, REG_A6XX_VFD_FETCH(j), 3);
62 const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[j];
63 struct fd_resource *rsc = fd_resource(vb->buffer.resource);
64 if (rsc == NULL) {
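         /* No buffer bound: emit zero BASE/SIZE, a zero VFD_FETCH[j].SIZE
          * means there is nothing for the HW to fetch.
          */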
65 OUT_RING(ring, 0);
66 OUT_RING(ring, 0);
67 OUT_RING(ring, 0);
68 } else {
69 uint32_t off = vb->buffer_offset;
70 uint32_t size = vb->buffer.resource->width0 - off;
71
72 OUT_RELOC(ring, rsc->bo, off, 0, 0);
73 OUT_RING(ring, size); /* VFD_FETCH[j].SIZE */
74 }
75 }
76
77 return ring;
78 }
79
80 static enum a6xx_ztest_mode
81 compute_ztest_mode(struct fd6_emit *emit, bool lrz_valid) assert_dt
82 {
83 if (emit->prog->lrz_mask.z_mode != A6XX_INVALID_ZTEST)
84 return emit->prog->lrz_mask.z_mode;
85
86 struct fd_context *ctx = emit->ctx;
87 struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
88 struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
89 const struct ir3_shader_variant *fs = emit->fs;
90
91 if (!zsa->base.depth_enabled) {
92 return A6XX_LATE_Z;
93 } else if ((fs->has_kill || zsa->alpha_test) &&
94 (zsa->writes_zs || !pfb->zsbuf)) {
95 /* Slightly odd, but seems like the hw wants us to select
96 * LATE_Z mode if there is no depth buffer + discard. Either
97 * that, or when occlusion query is enabled. See:
98 *
99 * dEQP-GLES31.functional.fbo.no_attachments.*
100 */
101 return lrz_valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;
102 } else {
103 return A6XX_EARLY_Z;
104 }
105 }
106
107 /**
108 * Calculate normalized LRZ state based on zsa/prog/blend state, updating
109 * the zsbuf's lrz state as necessary to detect the cases where we need
110 * to invalidate lrz.
111 */
112 static struct fd6_lrz_state
113 compute_lrz_state(struct fd6_emit *emit) assert_dt
114 {
115 struct fd_context *ctx = emit->ctx;
116 struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
117 struct fd6_lrz_state lrz;
118
119 if (!pfb->zsbuf) {
120 memset(&lrz, 0, sizeof(lrz));
121 lrz.z_mode = compute_ztest_mode(emit, false);
122 return lrz;
123 }
124
125 struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
126 struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
127 struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
128 bool reads_dest = blend->reads_dest;
129
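   /* Start from the zsa CSO's LRZ state, masked down to what the current
    * program allows:
    */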
130 lrz = zsa->lrz;
131
132 lrz.val &= emit->prog->lrz_mask.val;
133
134 /* normalize lrz state: */
135 if (reads_dest || blend->base.alpha_to_coverage) {
136 lrz.write = false;
137 }
138
139 /* Unwritten channels *that actually exist* are a form of blending
140 * reading the dest from the PoV of LRZ, but the valid dst channels
141 * aren't known when the blend CSO is constructed, so we need to
142 * handle that here.
143 */
144 if (ctx->all_mrt_channel_mask & ~blend->all_mrt_write_mask) {
145 lrz.write = false;
146 reads_dest = true;
147 }
148
149 /* Writing depth with blend enabled means we need to invalidate LRZ,
150 * because the written depth value could mean that a later draw with
151 * depth enabled (where we would otherwise write LRZ) could have
152 * fragments which don't pass the depth test due to this draw. For
153 * example, consider this sequence of draws, with depth mode GREATER:
154 *
155 * draw A:
156 * z=0.1, fragments pass
157 * draw B:
158 * z=0.4, fragments pass
159 * blend enabled (LRZ write disabled)
160 * depth write enabled
161 * draw C:
162 * z=0.2, fragments don't pass
163 * blend disabled
164 * depth write enabled
165 *
166 * Normally looking at the state in draw C, we'd assume we could
167 * enable LRZ write. But this would cause early-z/lrz to discard
168 * fragments from draw A which should be visible due to draw B.
169 */
170 if (reads_dest && zsa->writes_z && ctx->screen->driconf.conservative_lrz) {
171 if (!zsa->perf_warn_blend && rsc->lrz_valid) {
172 perf_debug_ctx(ctx, "Invalidating LRZ due to blend+depthwrite");
173 zsa->perf_warn_blend = true;
174 }
175 rsc->lrz_valid = false;
176 }
177
178 /* if we change depthfunc direction, bail out on using LRZ. The
179 * LRZ buffer encodes a min/max depth value per block, but if
180 * we switch from GT/GE <-> LT/LE, those values cannot be
181 * interpreted properly.
182 */
183 if (zsa->base.depth_enabled && (rsc->lrz_direction != FD_LRZ_UNKNOWN) &&
184 (rsc->lrz_direction != lrz.direction)) {
185 if (!zsa->perf_warn_zdir && rsc->lrz_valid) {
186 perf_debug_ctx(ctx, "Invalidating LRZ due to depth test direction change");
187 zsa->perf_warn_zdir = true;
188 }
189 rsc->lrz_valid = false;
190 }
191
192 if (zsa->invalidate_lrz || !rsc->lrz_valid) {
193 rsc->lrz_valid = false;
194 memset(&lrz, 0, sizeof(lrz));
195 }
196
197 lrz.z_mode = compute_ztest_mode(emit, rsc->lrz_valid);
198
199 /* Once we start writing to the real depth buffer, we lock in the
200 * direction for LRZ. If we have to skip an LRZ write for any
201 * reason, it is still safe to use LRZ until there is a direction
202 * reversal. Prior to the reversal, since we disabled LRZ writes
203 * in the "unsafe" cases, this just means that the LRZ test may
204 * not early-discard some things that end up not passing a later
205 * test (ie. be overly conservative). But once you have a reversal
206 * of direction, it is possible to increase/decrease the z value
207 * to the point where the overly-conservative test is incorrect.
208 */
209 if (zsa->base.depth_writemask) {
210 rsc->lrz_direction = lrz.direction;
211 }
212
213 return lrz;
214 }
215
216 template <chip CHIP>
217 static struct fd_ringbuffer *
218 build_lrz(struct fd6_emit *emit) assert_dt
219 {
220 struct fd_context *ctx = emit->ctx;
221 struct fd6_context *fd6_ctx = fd6_context(ctx);
222 struct fd6_lrz_state lrz = compute_lrz_state(emit);
223
224 /* If the LRZ state has not changed, we can skip the emit: */
225 if (!ctx->last.dirty && (fd6_ctx->last.lrz.val == lrz.val))
226 return NULL;
227
228 fd6_ctx->last.lrz = lrz;
229
230 unsigned ndwords = (CHIP >= A7XX) ? 10 : 8;
231 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
232 ctx->batch->submit, ndwords * 4, FD_RINGBUFFER_STREAMING);
233
234 if (CHIP >= A7XX) {
235 OUT_REG(ring,
236 A6XX_GRAS_LRZ_CNTL(
237 .enable = lrz.enable,
238 .lrz_write = lrz.write,
239 .greater = lrz.direction == FD_LRZ_GREATER,
240 .z_test_enable = lrz.test,
241 .z_bounds_enable = lrz.z_bounds_enable,
242 )
243 );
244 OUT_REG(ring,
245 A7XX_GRAS_LRZ_CNTL2(
246 .disable_on_wrong_dir = false,
247 .fc_enable = false,
248 )
249 );
250 } else {
251 OUT_REG(ring,
252 A6XX_GRAS_LRZ_CNTL(
253 .enable = lrz.enable,
254 .lrz_write = lrz.write,
255 .greater = lrz.direction == FD_LRZ_GREATER,
256 .fc_enable = false,
257 .z_test_enable = lrz.test,
258 .z_bounds_enable = lrz.z_bounds_enable,
259 .disable_on_wrong_dir = false,
260 )
261 );
262 }
263 OUT_REG(ring, A6XX_RB_LRZ_CNTL(.enable = lrz.enable, ));
264
265 OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));
266
267 OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));
268
269 return ring;
270 }
271
272 static struct fd_ringbuffer *
273 build_scissor(struct fd6_emit *emit) assert_dt
274 {
275 struct fd_context *ctx = emit->ctx;
276 struct pipe_scissor_state *scissors = fd_context_get_scissor(ctx);
277 unsigned num_viewports = emit->prog->num_viewports;
278
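   /* One pkt4 header plus a TL/BR register pair per viewport: */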
279 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
280 emit->ctx->batch->submit, (1 + (2 * num_viewports)) * 4, FD_RINGBUFFER_STREAMING);
281
282 OUT_PKT4(ring, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), 2 * num_viewports);
283 for (unsigned i = 0; i < num_viewports; i++) {
284 OUT_RING(ring, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(scissors[i].minx) |
285 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(scissors[i].miny));
286 OUT_RING(ring, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(scissors[i].maxx) |
287 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(scissors[i].maxy));
288 }
289
290 return ring;
291 }
292
293 /* Combination of FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD |
294 * FD_DIRTY_PROG | FD_DIRTY_DUAL_BLEND
295 */
296 static struct fd_ringbuffer *
297 build_prog_fb_rast(struct fd6_emit *emit) assert_dt
298 {
299 struct fd_context *ctx = emit->ctx;
300 struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
301 const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
302 const struct ir3_shader_variant *fs = emit->fs;
303
304 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
305 ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);
306
307 unsigned nr = pfb->nr_cbufs;
308
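   /* With rasterizer discard nothing is written to the MRTs: */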
309 if (ctx->rasterizer->rasterizer_discard)
310 nr = 0;
311
312 struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
313
314 if (blend->use_dual_src_blend)
315 nr++;
316
317 OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
318 OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
319 COND(fs->writes_smask && pfb->samples > 1,
320 A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
321 COND(fs->writes_stencilref,
322 A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
323 COND(blend->use_dual_src_blend,
324 A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
325 OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr));
326
327 OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1);
328 OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));
329
330 unsigned mrt_components = 0;
331 for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
332 if (!pfb->cbufs[i])
333 continue;
334 mrt_components |= 0xf << (i * 4);
335 }
336
337 /* dual source blending has an extra fs output in the 2nd slot */
338 if (blend->use_dual_src_blend)
339 mrt_components |= 0xf << 4;
340
341 mrt_components &= prog->mrt_components;
342
343 OUT_REG(ring, A6XX_SP_FS_RENDER_COMPONENTS(.dword = mrt_components));
344 OUT_REG(ring, A6XX_RB_RENDER_COMPONENTS(.dword = mrt_components));
345
346 return ring;
347 }
348
349 static struct fd_ringbuffer *
350 build_blend_color(struct fd6_emit *emit) assert_dt
351 {
352 struct fd_context *ctx = emit->ctx;
353 struct pipe_blend_color *bcolor = &ctx->blend_color;
354 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
355 ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING);
356
357 OUT_REG(ring, A6XX_RB_BLEND_RED_F32(bcolor->color[0]),
358 A6XX_RB_BLEND_GREEN_F32(bcolor->color[1]),
359 A6XX_RB_BLEND_BLUE_F32(bcolor->color[2]),
360 A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
361
362 return ring;
363 }
364
365 static struct fd_ringbuffer *
366 build_sample_locations(struct fd6_emit *emit)
367 assert_dt
368 {
369 struct fd_context *ctx = emit->ctx;
370
371 if (!ctx->sample_locations_enabled) {
372 struct fd6_context *fd6_ctx = fd6_context(ctx);
373 return fd_ringbuffer_ref(fd6_ctx->sample_locations_disable_stateobj);
374 }
375
376 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
377 ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);
378
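   /* Pack the 4-bit fractional x/y sample positions into the 1/16th-pixel
    * fixed point encoding the registers expect (note that y is flipped),
    * one byte per sample:
    */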
379 uint32_t sample_locations = 0;
380 for (int i = 0; i < 4; i++) {
381 float x = (ctx->sample_locations[i] & 0xf) / 16.0f;
382 float y = (16 - (ctx->sample_locations[i] >> 4)) / 16.0f;
383
384 x = CLAMP(x, 0.0f, 0.9375f);
385 y = CLAMP(y, 0.0f, 0.9375f);
386
387 sample_locations |=
388 (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(x) |
389 A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(y)) << i*8;
390 }
391
392 OUT_REG(ring, A6XX_GRAS_SAMPLE_CONFIG(.location_enable = true),
393 A6XX_GRAS_SAMPLE_LOCATION_0(.dword = sample_locations));
394
395 OUT_REG(ring, A6XX_RB_SAMPLE_CONFIG(.location_enable = true),
396 A6XX_RB_SAMPLE_LOCATION_0(.dword = sample_locations));
397
398 OUT_REG(ring, A6XX_SP_TP_SAMPLE_CONFIG(.location_enable = true),
399 A6XX_SP_TP_SAMPLE_LOCATION_0(.dword = sample_locations));
400
401 return ring;
402 }
403
404 template <chip CHIP>
405 static void
406 fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
407 {
408 struct fd_context *ctx = emit->ctx;
409 const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
410 const struct ir3_stream_output_info *info = prog->stream_output;
411 struct fd_streamout_stateobj *so = &ctx->streamout;
412 unsigned streamout_mask = 0;
413
414 if (!info)
415 return;
416
417 for (unsigned i = 0; i < so->num_targets; i++) {
418 struct fd_stream_output_target *target =
419 fd_stream_output_target(so->targets[i]);
420
421 if (!target)
422 continue;
423
424 target->stride = info->stride[i];
425
426 OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE(i), 3);
427 /* VPC_SO[i].BUFFER_BASE_LO: */
428 OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0);
429 OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset);
430
431 struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo;
432
433 if (so->reset & (1 << i)) {
434 assert(so->offsets[i] == 0);
435
436 OUT_PKT7(ring, CP_MEM_WRITE, 3);
437 OUT_RELOC(ring, offset_bo, 0, 0, 0);
438 OUT_RING(ring, target->base.buffer_offset);
439
440 OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1);
441 OUT_RING(ring, target->base.buffer_offset);
442 } else {
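         /* Not resetting: reload the current offset for this SO buffer from
          * offset_bo, so streamout continues where the previous draw left
          * off.
          */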
443 OUT_PKT7(ring, CP_MEM_TO_REG, 3);
444 OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
445 COND(CHIP == A6XX, CP_MEM_TO_REG_0_SHIFT_BY_2) |
446 CP_MEM_TO_REG_0_UNK31 |
447 CP_MEM_TO_REG_0_CNT(0));
448 OUT_RELOC(ring, offset_bo, 0, 0, 0);
449 }
450
451 // After a draw, the HW writes the updated offset back to offset_bo
452 OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
453 OUT_RELOC(ring, offset_bo, 0, 0, 0);
454
455 so->reset &= ~(1 << i);
456
457 streamout_mask |= (1 << i);
458 }
459
460 if (streamout_mask) {
461 fd6_state_add_group(&emit->state, prog->streamout_stateobj, FD6_GROUP_SO);
462 } else if (ctx->last.streamout_mask != 0) {
463 /* If we transition from a draw with streamout to one without, turn
464 * off streamout.
465 */
466 fd6_state_add_group(&emit->state, fd6_context(ctx)->streamout_disable_stateobj,
467 FD6_GROUP_SO);
468 }
469
470 /* Make sure that any use of our TFB outputs (indirect draw source or shader
471 * UBO reads) comes after the TFB output is written. From the GL 4.6 core
472 * spec:
473 *
474 * "Buffers should not be bound or in use for both transform feedback and
475 * other purposes in the GL. Specifically, if a buffer object is
476 * simultaneously bound to a transform feedback buffer binding point
477 * and elsewhere in the GL, any writes to or reads from the buffer
478 * generate undefined values."
479 *
480 * So we idle whenever SO buffers change. Note that this function is called
481 * on every draw with TFB enabled, so check the dirty flag for the buffers
482 * themselves.
483 */
484 if (ctx->dirty & FD_DIRTY_STREAMOUT)
485 OUT_WFI5(ring);
486
487 ctx->last.streamout_mask = streamout_mask;
488 emit->streamout_mask = streamout_mask;
489 }
490
491 /**
492 * Stuff that changes less frequently and isn't (yet) moved into stategroups
493 */
494 static void
495 fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
496 {
497 struct fd_context *ctx = emit->ctx;
498 const enum fd_dirty_3d_state dirty = ctx->dirty;
499 unsigned num_viewports = emit->prog->num_viewports;
500
501 if (dirty & FD_DIRTY_STENCIL_REF) {
502 struct pipe_stencil_ref *sr = &ctx->stencil_ref;
503
504 OUT_PKT4(ring, REG_A6XX_RB_STENCILREF, 1);
505 OUT_RING(ring, A6XX_RB_STENCILREF_REF(sr->ref_value[0]) |
506 A6XX_RB_STENCILREF_BFREF(sr->ref_value[1]));
507 }
508
509 if (dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_PROG)) {
510 for (unsigned i = 0; i < num_viewports; i++) {
511 struct pipe_scissor_state *scissor = &ctx->viewport_scissor[i];
512 struct pipe_viewport_state *vp = & ctx->viewport[i];
513
514 OUT_REG(ring, A6XX_GRAS_CL_VPORT_XOFFSET(i, vp->translate[0]),
515 A6XX_GRAS_CL_VPORT_XSCALE(i, vp->scale[0]),
516 A6XX_GRAS_CL_VPORT_YOFFSET(i, vp->translate[1]),
517 A6XX_GRAS_CL_VPORT_YSCALE(i, vp->scale[1]),
518 A6XX_GRAS_CL_VPORT_ZOFFSET(i, vp->translate[2]),
519 A6XX_GRAS_CL_VPORT_ZSCALE(i, vp->scale[2]));
520
521 OUT_REG(
522 ring,
523 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(i,
524 .x = scissor->minx,
525 .y = scissor->miny),
526 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(i,
527 .x = scissor->maxx,
528 .y = scissor->maxy));
529 }
530
531 OUT_REG(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ(.horz = ctx->guardband.x,
532 .vert = ctx->guardband.y));
533 }
534
535 /* The clamp ranges are only used when the rasterizer wants depth
536 * clamping.
537 */
538 if ((dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) &&
539 fd_depth_clamp_enabled(ctx)) {
540 for (unsigned i = 0; i < num_viewports; i++) {
541 struct pipe_viewport_state *vp = & ctx->viewport[i];
542 float zmin, zmax;
543
544 util_viewport_zmin_zmax(vp, ctx->rasterizer->clip_halfz,
545 &zmin, &zmax);
546
547 OUT_REG(ring, A6XX_GRAS_CL_Z_CLAMP_MIN(i, zmin),
548 A6XX_GRAS_CL_Z_CLAMP_MAX(i, zmax));
549
550 /* TODO: what to do about this and multi viewport ? */
551 if (i == 0)
552 OUT_REG(ring, A6XX_RB_Z_CLAMP_MIN(zmin), A6XX_RB_Z_CLAMP_MAX(zmax));
553 }
554 }
555 }
556
557 static struct fd_ringbuffer*
558 build_prim_mode(struct fd6_emit *emit, struct fd_context *ctx, bool gmem)
559 assert_dt
560 {
561 struct fd_ringbuffer *ring =
562 fd_submit_new_ringbuffer(emit->ctx->batch->submit, 2 * 4, FD_RINGBUFFER_STREAMING);
563 uint32_t prim_mode = NO_FLUSH;
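   /* FB-fetch requires the HW to serialize overlapping prims; in sysmem
    * mode it appears we additionally need to flush on overwrite:
    */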
564 if (emit->fs->fs.uses_fbfetch_output) {
565 if (gmem) {
566 prim_mode = (ctx->blend->blend_coherent || emit->fs->fs.fbfetch_coherent)
567 ? FLUSH_PER_OVERLAP : NO_FLUSH;
568 } else {
569 prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
570 }
571 } else {
572 prim_mode = NO_FLUSH;
573 }
574 OUT_REG(ring, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2,
575 .single_prim_mode = (enum a6xx_single_prim_mode)prim_mode));
576 return ring;
577 }
578
579 template <chip CHIP, fd6_pipeline_type PIPELINE>
580 void
581 fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
582 {
583 struct fd_context *ctx = emit->ctx;
584 struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
585 const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
586 const struct ir3_shader_variant *fs = emit->fs;
587
588 emit_marker6(ring, 5);
589
590 /* Special case, we need to re-emit bindless FS state w/ the
591 * fb-read state appended:
592 */
593 if ((emit->dirty_groups & BIT(FD6_GROUP_PROG)) && fs->fb_read) {
594 ctx->batch->gmem_reason |= FD_GMEM_FB_READ;
595 emit->dirty_groups |= BIT(FD6_GROUP_FS_BINDLESS);
596 }
597
598 u_foreach_bit (b, emit->dirty_groups) {
599 enum fd6_state_id group = (enum fd6_state_id)b;
600 struct fd_ringbuffer *state = NULL;
601
602 switch (group) {
603 case FD6_GROUP_VTXSTATE:
604 state = fd6_vertex_stateobj(ctx->vtx.vtx)->stateobj;
605 fd6_state_add_group(&emit->state, state, FD6_GROUP_VTXSTATE);
606 break;
607 case FD6_GROUP_VBO:
608 state = build_vbo_state(emit);
609 fd6_state_take_group(&emit->state, state, FD6_GROUP_VBO);
610 break;
611 case FD6_GROUP_ZSA:
612 state = fd6_zsa_state(
613 ctx,
614 util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])),
615 fd_depth_clamp_enabled(ctx));
616 fd6_state_add_group(&emit->state, state, FD6_GROUP_ZSA);
617 break;
618 case FD6_GROUP_LRZ:
619 state = build_lrz<CHIP>(emit);
620 if (state)
621 fd6_state_take_group(&emit->state, state, FD6_GROUP_LRZ);
622 break;
623 case FD6_GROUP_SCISSOR:
624 state = build_scissor(emit);
625 fd6_state_take_group(&emit->state, state, FD6_GROUP_SCISSOR);
626 break;
627 case FD6_GROUP_PROG:
628 fd6_state_add_group(&emit->state, prog->config_stateobj,
629 FD6_GROUP_PROG_CONFIG);
630 fd6_state_add_group(&emit->state, prog->stateobj, FD6_GROUP_PROG);
631 fd6_state_add_group(&emit->state, prog->binning_stateobj,
632 FD6_GROUP_PROG_BINNING);
633
634 /* emit remaining streaming program state, ie. what depends on
635 * other emit state, so cannot be pre-baked.
636 */
637 fd6_state_take_group(&emit->state, fd6_program_interp_state(emit),
638 FD6_GROUP_PROG_INTERP);
639 break;
640 case FD6_GROUP_RASTERIZER:
641 state = fd6_rasterizer_state<CHIP>(ctx, emit->primitive_restart);
642 fd6_state_add_group(&emit->state, state, FD6_GROUP_RASTERIZER);
643 break;
644 case FD6_GROUP_PROG_FB_RAST:
645 state = build_prog_fb_rast(emit);
646 fd6_state_take_group(&emit->state, state, FD6_GROUP_PROG_FB_RAST);
647 break;
648 case FD6_GROUP_BLEND:
649 state = fd6_blend_variant<CHIP>(ctx->blend, pfb->samples, ctx->sample_mask)
650 ->stateobj;
651 fd6_state_add_group(&emit->state, state, FD6_GROUP_BLEND);
652 break;
653 case FD6_GROUP_BLEND_COLOR:
654 state = build_blend_color(emit);
655 fd6_state_take_group(&emit->state, state, FD6_GROUP_BLEND_COLOR);
656 break;
657 case FD6_GROUP_SAMPLE_LOCATIONS:
658 state = build_sample_locations(emit);
659 fd6_state_take_group(&emit->state, state, FD6_GROUP_SAMPLE_LOCATIONS);
660 break;
661 case FD6_GROUP_VS_BINDLESS:
662 state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_VERTEX, false);
663 fd6_state_take_group(&emit->state, state, FD6_GROUP_VS_BINDLESS);
664 break;
665 case FD6_GROUP_HS_BINDLESS:
666 state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_TESS_CTRL, false);
667 fd6_state_take_group(&emit->state, state, FD6_GROUP_HS_BINDLESS);
668 break;
669 case FD6_GROUP_DS_BINDLESS:
670 state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_TESS_EVAL, false);
671 fd6_state_take_group(&emit->state, state, FD6_GROUP_DS_BINDLESS);
672 break;
673 case FD6_GROUP_GS_BINDLESS:
674 state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_GEOMETRY, false);
675 fd6_state_take_group(&emit->state, state, FD6_GROUP_GS_BINDLESS);
676 break;
677 case FD6_GROUP_FS_BINDLESS:
678 state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_FRAGMENT, fs->fb_read);
679 fd6_state_take_group(&emit->state, state, FD6_GROUP_FS_BINDLESS);
680 break;
681 case FD6_GROUP_CONST:
682 state = fd6_build_user_consts<PIPELINE>(emit);
683 fd6_state_take_group(&emit->state, state, FD6_GROUP_CONST);
684 break;
685 case FD6_GROUP_DRIVER_PARAMS:
686 state = fd6_build_driver_params<PIPELINE>(emit);
687 fd6_state_take_group(&emit->state, state, FD6_GROUP_DRIVER_PARAMS);
688 break;
689 case FD6_GROUP_PRIMITIVE_PARAMS:
690 if (PIPELINE == HAS_TESS_GS) {
691 state = fd6_build_tess_consts(emit);
692 fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIMITIVE_PARAMS);
693 }
694 break;
695 case FD6_GROUP_VS_TEX:
696 state = tex_state(ctx, PIPE_SHADER_VERTEX);
697 fd6_state_take_group(&emit->state, state, FD6_GROUP_VS_TEX);
698 break;
699 case FD6_GROUP_HS_TEX:
700 state = tex_state(ctx, PIPE_SHADER_TESS_CTRL);
701 fd6_state_take_group(&emit->state, state, FD6_GROUP_HS_TEX);
702 break;
703 case FD6_GROUP_DS_TEX:
704 state = tex_state(ctx, PIPE_SHADER_TESS_EVAL);
705 fd6_state_take_group(&emit->state, state, FD6_GROUP_DS_TEX);
706 break;
707 case FD6_GROUP_GS_TEX:
708 state = tex_state(ctx, PIPE_SHADER_GEOMETRY);
709 fd6_state_take_group(&emit->state, state, FD6_GROUP_GS_TEX);
710 break;
711 case FD6_GROUP_FS_TEX:
712 state = tex_state(ctx, PIPE_SHADER_FRAGMENT);
713 fd6_state_take_group(&emit->state, state, FD6_GROUP_FS_TEX);
714 break;
715 case FD6_GROUP_SO:
716 fd6_emit_streamout<CHIP>(ring, emit);
717 break;
718 case FD6_GROUP_PRIM_MODE_SYSMEM:
719 state = build_prim_mode(emit, ctx, false);
720 fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIM_MODE_SYSMEM);
721 break;
722 case FD6_GROUP_PRIM_MODE_GMEM:
723 state = build_prim_mode(emit, ctx, true);
724 fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIM_MODE_GMEM);
725 break;
726 case FD6_GROUP_NON_GROUP:
727 fd6_emit_non_ring(ring, emit);
728 break;
729 default:
730 break;
731 }
732 }
733
734 fd6_state_emit(&emit->state, ring);
735 }
736
737 template void fd6_emit_3d_state<A6XX, NO_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);
738 template void fd6_emit_3d_state<A7XX, NO_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);
739 template void fd6_emit_3d_state<A6XX, HAS_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);
740 template void fd6_emit_3d_state<A7XX, HAS_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);
741
742 template <chip CHIP>
743 void
744 fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
745 struct fd6_compute_state *cs)
746 {
747 struct fd6_state state = {};
748
749 /* We want CP_SET_DRAW_STATE to execute immediately, otherwise we need to
750 * emit consts as draw state groups (which otherwise has no benefit outside
751 * of GMEM 3d using viz stream from binning pass).
752 *
753 * In particular, the PROG state group sets up the configuration for the
754 * const state, so it must execute before we start loading consts, rather
755 * than be deferred until CP_EXEC_CS.
756 */
757 OUT_PKT7(ring, CP_SET_MODE, 1);
758 OUT_RING(ring, 1);
759
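   /* Only the state groups that are relevant for compute are considered: */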
760 uint32_t gen_dirty = ctx->gen_dirty &
761 (BIT(FD6_GROUP_PROG) | BIT(FD6_GROUP_CS_TEX) | BIT(FD6_GROUP_CS_BINDLESS));
762
763 u_foreach_bit (b, gen_dirty) {
764 enum fd6_state_id group = (enum fd6_state_id)b;
765
766 switch (group) {
767 case FD6_GROUP_PROG:
768 fd6_state_add_group(&state, cs->stateobj, FD6_GROUP_PROG);
769 break;
770 case FD6_GROUP_CS_TEX:
771 fd6_state_take_group(
772 &state,
773 tex_state(ctx, PIPE_SHADER_COMPUTE),
774 FD6_GROUP_CS_TEX);
775 break;
776 case FD6_GROUP_CS_BINDLESS:
777 fd6_state_take_group(
778 &state,
779 fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_COMPUTE, false),
780 FD6_GROUP_CS_BINDLESS);
781 break;
782 default:
783 /* State-group unused for compute shaders */
784 break;
785 }
786 }
787
788 fd6_state_emit(&state, ring);
789 }
790 FD_GENX(fd6_emit_cs_state);
791
792 template <chip CHIP>
793 void
794 fd6_emit_ccu_cntl(struct fd_ringbuffer *ring, struct fd_screen *screen, bool gmem)
795 {
796 const struct fd6_gmem_config *cfg = gmem ? &screen->config_gmem : &screen->config_sysmem;
797 enum a6xx_ccu_cache_size color_cache_size = !gmem ? CCU_CACHE_SIZE_FULL :
798 (enum a6xx_ccu_cache_size)(screen->info->a6xx.gmem_ccu_color_cache_fraction);
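   /* The CCU offset fields split the GMEM offset into a low 21 bit part
    * and a high part:
    */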
799 uint32_t color_offset = cfg->color_ccu_offset & 0x1fffff;
800 uint32_t color_offset_hi = cfg->color_ccu_offset >> 21;
801
802 uint32_t depth_offset = cfg->depth_ccu_offset & 0x1fffff;
803 uint32_t depth_offset_hi = cfg->depth_ccu_offset >> 21;
804
805 if (CHIP == A7XX) {
806 OUT_REG(ring,
807 A7XX_RB_CCU_CNTL2(
808 .depth_offset_hi = depth_offset_hi,
809 .color_offset_hi = color_offset_hi,
810 .depth_cache_size = CCU_CACHE_SIZE_FULL,
811 .depth_offset = depth_offset,
812 .color_cache_size = color_cache_size,
813 .color_offset = color_offset,
814 )
815 );
816
817 if (screen->info->a7xx.has_gmem_vpc_attr_buf) {
818 OUT_REG(ring,
819 A7XX_VPC_ATTR_BUF_SIZE_GMEM(.size_gmem = cfg->vpc_attr_buf_size),
820 A7XX_VPC_ATTR_BUF_BASE_GMEM(.base_gmem = cfg->vpc_attr_buf_offset)
821 );
822 OUT_REG(ring,
823 A7XX_PC_ATTR_BUF_SIZE_GMEM(.size_gmem = cfg->vpc_attr_buf_size)
824 );
825 }
826 } else {
827 OUT_REG(ring,
828 RB_CCU_CNTL(
829 CHIP,
830 .gmem_fast_clear_disable =
831 !screen->info->a6xx.has_gmem_fast_clear,
832 .concurrent_resolve =
833 screen->info->a6xx.concurrent_resolve,
834 .depth_offset_hi = depth_offset_hi,
835 .color_offset_hi = color_offset_hi,
836 .depth_cache_size = CCU_CACHE_SIZE_FULL,
837 .depth_offset = depth_offset,
838 .color_cache_size = color_cache_size,
839 .color_offset = color_offset,
840 )
841 );
842 }
843 }
844 FD_GENX(fd6_emit_ccu_cntl);
845
846 template <chip CHIP>
847 static void
848 fd6_emit_stomp(struct fd_ringbuffer *ring, const uint16_t *regs, size_t count)
849 {
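   /* Debug aid (FD_DBG(STOMP)): clobber every register we are allowed to
    * stomp with garbage, so any state we fail to re-emit should stand out.
    */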
850 for (size_t i = 0; i < count; i++) {
851 if (fd_reg_stomp_allowed(CHIP, regs[i])) {
852 WRITE(regs[i], 0xffffffff);
853 }
854 }
855 }
856
857 /* emit setup at the beginning of a new cmdstream buffer (don't rely on
858 * previous state, there could have been a context switch between ioctls):
859 */
860 template <chip CHIP>
861 void
862 fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
863 {
864 struct fd_context *ctx = batch->ctx;
865 struct fd_screen *screen = ctx->screen;
866
867 if (!batch->nondraw) {
868 trace_start_state_restore(&batch->trace, ring);
869 }
870
871 if (FD_DBG(STOMP)) {
872 fd6_emit_stomp<CHIP>(ring, &RP_BLIT_REGS<CHIP>[0], ARRAY_SIZE(RP_BLIT_REGS<CHIP>));
873 fd6_emit_stomp<CHIP>(ring, &CMD_REGS<CHIP>[0], ARRAY_SIZE(CMD_REGS<CHIP>));
874 }
875
876 OUT_PKT7(ring, CP_SET_MODE, 1);
877 OUT_RING(ring, 0);
878
879 if (CHIP == A6XX) {
880 fd6_cache_inv<CHIP>(ctx, ring);
881 } else {
882 OUT_PKT7(ring, CP_THREAD_CONTROL, 1);
883 OUT_RING(ring, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
884 CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
885
886 fd6_event_write<CHIP>(ctx, ring, FD_CCU_INVALIDATE_COLOR);
887 fd6_event_write<CHIP>(ctx, ring, FD_CCU_INVALIDATE_DEPTH);
888
889 OUT_PKT7(ring, CP_EVENT_WRITE, 1);
890 OUT_RING(ring, UNK_40);
891
892 fd6_event_write<CHIP>(ctx, ring, FD_CACHE_INVALIDATE);
893 OUT_WFI5(ring);
894 }
895
896 OUT_REG(ring,
897 HLSQ_INVALIDATE_CMD(CHIP,
898 .vs_state = true, .hs_state = true,
899 .ds_state = true, .gs_state = true,
900 .fs_state = true, .cs_state = true,
901 .cs_ibo = true, .gfx_ibo = true,
902 .cs_shared_const = true,
903 .gfx_shared_const = true,
904 .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
905 .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,
906 )
907 );
908
909 OUT_WFI5(ring);
910
911 if (CHIP >= A7XX) {
912 /* On A7XX, RB_CCU_CNTL was split into two registers: RB_CCU_CNTL holds
913 * static properties that can be set once and requires a WFI to take
914 * effect, while the newly introduced RB_CCU_CNTL2 holds properties that
915 * may change per render pass and does not require a WFI to take effect,
916 * only CCU inval/flush events.
917 */
918 OUT_REG(ring,
919 RB_CCU_CNTL(
920 CHIP,
921 .gmem_fast_clear_disable = true, // !screen->info->a6xx.has_gmem_fast_clear,
922 .concurrent_resolve = screen->info->a6xx.concurrent_resolve,
923 )
924 );
925 OUT_WFI5(ring);
926 }
927
928 fd6_emit_ccu_cntl<CHIP>(ring, screen, false);
929
930 for (size_t i = 0; i < ARRAY_SIZE(screen->info->a6xx.magic_raw); i++) {
931 auto magic_reg = screen->info->a6xx.magic_raw[i];
932 if (!magic_reg.reg)
933 break;
934
935 uint32_t value = magic_reg.value;
936 switch(magic_reg.reg) {
937 case REG_A6XX_TPL1_DBG_ECO_CNTL1:
938 value = (value & ~A6XX_TPL1_DBG_ECO_CNTL1_TP_UBWC_FLAG_HINT) |
939 (screen->info->a7xx.enable_tp_ubwc_flag_hint
940 ? A6XX_TPL1_DBG_ECO_CNTL1_TP_UBWC_FLAG_HINT
941 : 0);
942 break;
943 }
944
945 WRITE(magic_reg.reg, value);
946 }
947
948 WRITE(REG_A6XX_RB_DBG_ECO_CNTL, screen->info->a6xx.magic.RB_DBG_ECO_CNTL);
949 WRITE(REG_A6XX_SP_FLOAT_CNTL, A6XX_SP_FLOAT_CNTL_F16_NO_INF);
950 WRITE(REG_A6XX_SP_DBG_ECO_CNTL, screen->info->a6xx.magic.SP_DBG_ECO_CNTL);
951 WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
952 if (CHIP == A6XX)
953 WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
954 WRITE(REG_A6XX_TPL1_DBG_ECO_CNTL, screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
955 if (CHIP == A6XX) {
956 WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
957 WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
958 }
959
960 WRITE(REG_A6XX_VPC_DBG_ECO_CNTL, screen->info->a6xx.magic.VPC_DBG_ECO_CNTL);
961 WRITE(REG_A6XX_GRAS_DBG_ECO_CNTL, screen->info->a6xx.magic.GRAS_DBG_ECO_CNTL);
962 if (CHIP == A6XX)
963 WRITE(REG_A6XX_HLSQ_DBG_ECO_CNTL, screen->info->a6xx.magic.HLSQ_DBG_ECO_CNTL);
964 WRITE(REG_A6XX_SP_CHICKEN_BITS, screen->info->a6xx.magic.SP_CHICKEN_BITS);
965 WRITE(REG_A6XX_SP_IBO_COUNT, 0);
966 WRITE(REG_A6XX_SP_UNKNOWN_B182, 0);
967 if (CHIP == A6XX)
968 WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0);
969 WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, screen->info->a6xx.magic.UCHE_UNKNOWN_0E12);
970 WRITE(REG_A6XX_UCHE_CLIENT_PF, screen->info->a6xx.magic.UCHE_CLIENT_PF);
971 WRITE(REG_A6XX_RB_UNKNOWN_8E01, screen->info->a6xx.magic.RB_UNKNOWN_8E01);
972 WRITE(REG_A6XX_SP_UNKNOWN_A9A8, 0);
973 OUT_REG(ring,
974 A6XX_SP_MODE_CONTROL(
975 .constant_demotion_enable = true,
976 .isammode = ISAMMODE_GL,
977 .shared_consts_enable = false,
978 )
979 );
980 WRITE(REG_A6XX_SP_MODE_CONTROL,
981 A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
982 WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
983 WRITE(REG_A6XX_VPC_UNKNOWN_9107, 0);
984 WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
985 WRITE(REG_A6XX_PC_MODE_CNTL, screen->info->a6xx.magic.PC_MODE_CNTL);
986
987 WRITE(REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 0);
988 WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0);
989 WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2);
990
991 WRITE(REG_A6XX_RB_UNKNOWN_8818, 0);
992
993 if (CHIP == A6XX) {
994 WRITE(REG_A6XX_RB_UNKNOWN_8819, 0);
995 WRITE(REG_A6XX_RB_UNKNOWN_881A, 0);
996 WRITE(REG_A6XX_RB_UNKNOWN_881B, 0);
997 WRITE(REG_A6XX_RB_UNKNOWN_881C, 0);
998 WRITE(REG_A6XX_RB_UNKNOWN_881D, 0);
999 WRITE(REG_A6XX_RB_UNKNOWN_881E, 0);
1000 }
1001
1002 WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0);
1003
1004 WRITE(REG_A6XX_VPC_POINT_COORD_INVERT, A6XX_VPC_POINT_COORD_INVERT(0).value);
1005 WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0);
1006
1007 WRITE(REG_A6XX_VPC_SO_DISABLE, A6XX_VPC_SO_DISABLE(true).value);
1008
1009 OUT_REG(ring, PC_RASTER_CNTL(CHIP));
1010
1011 WRITE(REG_A6XX_PC_MULTIVIEW_CNTL, 0);
1012
1013 WRITE(REG_A6XX_SP_UNKNOWN_B183, 0);
1014
1015 WRITE(REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
1016 WRITE(REG_A6XX_GRAS_VS_LAYER_CNTL, 0);
1017 WRITE(REG_A6XX_GRAS_SC_CNTL, A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1018 WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0);
1019 if (CHIP == A6XX) {
1020 WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0);
1021 WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0);
1022 }
1023 WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0);
1024 WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0);
1025 /* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_MODE_CNTL
1026 * but this seems to kill texture gather offsets.
1027 */
1028 WRITE(REG_A6XX_SP_TP_MODE_CNTL, 0xa0 |
1029 A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));
1030
1031 OUT_REG(ring, HLSQ_CONTROL_5_REG(
1032 CHIP,
1033 .linelengthregid = INVALID_REG,
1034 .foveationqualityregid = INVALID_REG,
1035 ));
1036
1037 emit_marker6(ring, 7);
1038
1039 OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1);
1040 OUT_RING(ring, 0x00000000); /* VFD_MODE_CNTL */
1041
1042 WRITE(REG_A6XX_VFD_MULTIVIEW_CNTL, 0);
1043
1044 /* Clear any potential pending state groups to be safe: */
1045 OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
1046 OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
1047 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1048 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1049 OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1050 OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1051
1052 OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_CNTL, 1);
1053 OUT_RING(ring, 0x00000000); /* VPC_SO_STREAM_CNTL */
1054
1055 if (CHIP >= A7XX) {
1056 OUT_REG(ring, A6XX_GRAS_LRZ_CNTL());
1057 OUT_REG(ring, A7XX_GRAS_LRZ_CNTL2());
1058 } else {
1059 OUT_REG(ring, A6XX_GRAS_LRZ_CNTL());
1060 }
1061
1062 OUT_REG(ring, A6XX_RB_LRZ_CNTL());
1063 OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL());
1064 OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1065
1066 OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
1067 OUT_RING(ring, 0x00000000);
1068
1069 OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1);
1070 OUT_RING(ring, 0x00000000);
1071
1072 /* Initialize VFD_FETCH[n].SIZE to zero to avoid iova faults trying
1073 * to fetch from a VFD_FETCH[n].BASE which we've potentially inherited
1074 * from another process:
1075 */
1076 for (int32_t i = 0; i < 32; i++) {
1077 OUT_PKT4(ring, REG_A6XX_VFD_FETCH_SIZE(i), 1);
1078 OUT_RING(ring, 0);
1079 }
1080
1081 /* This happens after all drawing has been emitted to the draw CS, so we know
1082 * whether we need the tess BO pointers.
1083 */
1084 if (batch->tessellation) {
1085 assert(screen->tess_bo);
1086 fd_ringbuffer_attach_bo(ring, screen->tess_bo);
1087 OUT_REG(ring, PC_TESSFACTOR_ADDR(CHIP, screen->tess_bo));
1088 /* Updating PC_TESSFACTOR_ADDR could race with the next draw which uses it. */
1089 OUT_WFI5(ring);
1090 }
1091
1092 struct fd6_context *fd6_ctx = fd6_context(ctx);
1093 struct fd_bo *bcolor_mem = fd6_ctx->bcolor_mem;
1094
1095 OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR, 2);
1096 OUT_RELOC(ring, bcolor_mem, 0, 0, 0);
1097
1098 OUT_PKT4(ring, REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, 2);
1099 OUT_RELOC(ring, bcolor_mem, 0, 0, 0);
1100
1101 /* These regs are blocked (CP_PROTECT) on a6xx: */
1102 if (CHIP >= A7XX) {
1103 OUT_REG(ring,
1104 TPL1_BICUBIC_WEIGHTS_TABLE_0(CHIP, 0),
1105 TPL1_BICUBIC_WEIGHTS_TABLE_1(CHIP, 0x3fe05ff4),
1106 TPL1_BICUBIC_WEIGHTS_TABLE_2(CHIP, 0x3fa0ebee),
1107 TPL1_BICUBIC_WEIGHTS_TABLE_3(CHIP, 0x3f5193ed),
1108 TPL1_BICUBIC_WEIGHTS_TABLE_4(CHIP, 0x3f0243f0),
1109 );
1110 }
1111
1112 if (CHIP >= A7XX) {
1113 /* Blob sets these two per draw. */
1114 OUT_REG(ring, A7XX_PC_TESS_PARAM_SIZE(FD6_TESS_PARAM_SIZE));
1115 /* Blob adds a bit more space ({0x10, 0x20, 0x30, 0x40} bytes)
1116 * but the meaning of this additional space is not known,
1117 * so we play it safe and don't add it.
1118 */
1119 OUT_REG(ring, A7XX_PC_TESS_FACTOR_SIZE(FD6_TESS_FACTOR_SIZE));
1120 }
1121
1122 /* There is an optimization to skip executing draw states for draws with no
1123 * instances. Instead of simply skipping the draw, internally the firmware
1124 * sets a bit in PC_DRAW_INITIATOR that seemingly skips the draw. However
1125 * there is a hardware bug where this bit does not always cause the FS
1126 * early preamble to be skipped. Because the draw states were skipped,
1127 * SP_FS_CTRL_REG0, SP_FS_OBJ_START and so on are never updated and a
1128 * random FS preamble from the last draw is executed. If the last visible
1129 * draw is from the same submit, it shouldn't be a problem because we just
1130 * re-execute the same preamble and preambles don't have side effects, but
1131 * if it's from another process then we could execute a garbage preamble
1132 * leading to hangs and faults. To make sure this doesn't happen, we reset
1133 * SP_FS_CTRL_REG0 here, making sure that the EARLYPREAMBLE bit isn't set
1134 * so any leftover early preamble doesn't get executed. Other stages don't
1135 * seem to be affected.
1136 */
1137 if (screen->info->a6xx.has_early_preamble) {
1138 WRITE(REG_A6XX_SP_FS_CTRL_REG0, 0);
1139 }
1140
1141 if (!batch->nondraw) {
1142 trace_end_state_restore(&batch->trace, ring);
1143 }
1144 }
1145 FD_GENX(fd6_emit_restore);
1146
1147 static void
1148 fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst,
1149 unsigned dst_off, struct pipe_resource *src, unsigned src_off,
1150 unsigned sizedwords)
1151 {
1152 struct fd_bo *src_bo = fd_resource(src)->bo;
1153 struct fd_bo *dst_bo = fd_resource(dst)->bo;
1154 unsigned i;
1155
1156 fd_ringbuffer_attach_bo(ring, dst_bo);
1157 fd_ringbuffer_attach_bo(ring, src_bo);
1158
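   /* Copy one dword per CP_MEM_TO_MEM packet, stepping both offsets by
    * four bytes each iteration:
    */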
1159 for (i = 0; i < sizedwords; i++) {
1160 OUT_PKT7(ring, CP_MEM_TO_MEM, 5);
1161 OUT_RING(ring, 0x00000000);
1162 OUT_RELOC(ring, dst_bo, dst_off, 0, 0);
1163 OUT_RELOC(ring, src_bo, src_off, 0, 0);
1164
1165 dst_off += 4;
1166 src_off += 4;
1167 }
1168 }
1169
1170 void
1171 fd6_emit_init_screen(struct pipe_screen *pscreen)
1172 {
1173 struct fd_screen *screen = fd_screen(pscreen);
1174 screen->mem_to_mem = fd6_mem_to_mem;
1175 }
1176