/*
 * Copyright © 2016 Rob Clark <[email protected]>
 * Copyright © 2018 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#define FD_BO_NO_HARDPIN 1

#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_helpers.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "util/u_viewport.h"

#include "freedreno_query_hw.h"
#include "freedreno_resource.h"
#include "freedreno_state.h"
#include "freedreno_stompable_regs.h"
#include "freedreno_tracepoints.h"

#include "fd6_blend.h"
#include "fd6_const.h"
#include "fd6_context.h"
#include "fd6_compute.h"
#include "fd6_emit.h"
#include "fd6_image.h"
#include "fd6_pack.h"
#include "fd6_program.h"
#include "fd6_rasterizer.h"
#include "fd6_texture.h"
#include "fd6_zsa.h"

/* Helper to get tex stateobj.
 */
static struct fd_ringbuffer *
tex_state(struct fd_context *ctx, enum pipe_shader_type type)
   assert_dt
{
   if (ctx->tex[type].num_textures == 0)
      return NULL;

   return fd_ringbuffer_ref(fd6_texture_state(ctx, type)->stateobj);
}

static struct fd_ringbuffer *
build_vbo_state(struct fd6_emit *emit) assert_dt
{
   const struct fd_vertex_state *vtx = &emit->ctx->vtx;

   const unsigned cnt = vtx->vertexbuf.count;
   const unsigned dwords = cnt * 4;  /* per vbo: reg64 + one reg32 + pkt hdr */

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      emit->ctx->batch->submit, 4 * dwords, FD_RINGBUFFER_STREAMING);

   for (int32_t j = 0; j < cnt; j++) {
      OUT_PKT4(ring, REG_A6XX_VFD_FETCH(j), 3);
      const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[j];
      struct fd_resource *rsc = fd_resource(vb->buffer.resource);
      if (rsc == NULL) {
         OUT_RING(ring, 0);
         OUT_RING(ring, 0);
         OUT_RING(ring, 0);
      } else {
         uint32_t off = vb->buffer_offset;
         uint32_t size = vb->buffer.resource->width0 - off;

         OUT_RELOC(ring, rsc->bo, off, 0, 0);
         OUT_RING(ring, size);       /* VFD_FETCH[j].SIZE */
      }
   }

   return ring;
}

static enum a6xx_ztest_mode
compute_ztest_mode(struct fd6_emit *emit, bool lrz_valid) assert_dt
{
   if (emit->prog->lrz_mask.z_mode != A6XX_INVALID_ZTEST)
      return emit->prog->lrz_mask.z_mode;

   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
   const struct ir3_shader_variant *fs = emit->fs;

   if (!zsa->base.depth_enabled) {
      return A6XX_LATE_Z;
   } else if ((fs->has_kill || zsa->alpha_test) &&
              (zsa->writes_zs || !pfb->zsbuf)) {
      /* Slightly odd, but seems like the hw wants us to select
       * LATE_Z mode if there is no depth buffer + discard.  Either
       * that, or when occlusion query is enabled.  See:
       *
       * dEQP-GLES31.functional.fbo.no_attachments.*
       */
      return lrz_valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;
   } else {
      return A6XX_EARLY_Z;
   }
}

/**
 * Calculate normalized LRZ state based on zsa/prog/blend state, updating
 * the zsbuf's lrz state as necessary to detect the cases where we need
 * to invalidate lrz.
 */
static struct fd6_lrz_state
compute_lrz_state(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   struct fd6_lrz_state lrz;

   if (!pfb->zsbuf) {
      memset(&lrz, 0, sizeof(lrz));
      lrz.z_mode = compute_ztest_mode(emit, false);
      return lrz;
   }

   struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
   struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
   struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
   bool reads_dest = blend->reads_dest;

   lrz = zsa->lrz;

   lrz.val &= emit->prog->lrz_mask.val;

   /* normalize lrz state: */
   if (reads_dest || blend->base.alpha_to_coverage) {
      lrz.write = false;
   }

   /* Unwritten channels *that actually exist* are a form of blending
    * reading the dest from the PoV of LRZ, but the valid dst channels
    * aren't known when the blend CSO is constructed, so we need to
    * handle that here.
    */
   if (ctx->all_mrt_channel_mask & ~blend->all_mrt_write_mask) {
      lrz.write = false;
      reads_dest = true;
   }

   /* Writing depth with blend enabled means we need to invalidate LRZ,
    * because the written depth value could mean that a later draw with
    * depth enabled (where we would otherwise write LRZ) could have
    * fragments which don't pass the depth test due to this draw.  For
    * example, consider this sequence of draws, with depth mode GREATER:
    *
    *   draw A:
    *     z=0.1, fragments pass
    *   draw B:
    *     z=0.4, fragments pass
    *     blend enabled (LRZ write disabled)
    *     depth write enabled
    *   draw C:
    *     z=0.2, fragments don't pass
    *     blend disabled
    *     depth write enabled
    *
    * Normally looking at the state in draw C, we'd assume we could
    * enable LRZ write.  But this would cause early-z/lrz to discard
    * fragments from draw A which should be visible due to draw B.
    */
   if (reads_dest && zsa->writes_z && ctx->screen->driconf.conservative_lrz) {
      if (!zsa->perf_warn_blend && rsc->lrz_valid) {
         perf_debug_ctx(ctx, "Invalidating LRZ due to blend+depthwrite");
         zsa->perf_warn_blend = true;
      }
      rsc->lrz_valid = false;
   }

   /* if we change depthfunc direction, bail out on using LRZ.  The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (zsa->base.depth_enabled && (rsc->lrz_direction != FD_LRZ_UNKNOWN) &&
       (rsc->lrz_direction != lrz.direction)) {
      if (!zsa->perf_warn_zdir && rsc->lrz_valid) {
         perf_debug_ctx(ctx, "Invalidating LRZ due to depth test direction change");
         zsa->perf_warn_zdir = true;
      }
      rsc->lrz_valid = false;
   }

   if (zsa->invalidate_lrz || !rsc->lrz_valid) {
      rsc->lrz_valid = false;
      memset(&lrz, 0, sizeof(lrz));
   }

   lrz.z_mode = compute_ztest_mode(emit, rsc->lrz_valid);

   /* Once we start writing to the real depth buffer, we lock in the
    * direction for LRZ.. if we have to skip a LRZ write for any
    * reason, it is still safe to have LRZ until there is a direction
    * reversal.  Prior to the reversal, since we disabled LRZ writes
    * in the "unsafe" cases, this just means that the LRZ test may
    * not early-discard some things that end up not passing a later
    * test (ie. be overly conservative).  But once you have a reversal
    * of direction, it is possible to increase/decrease the z value
    * to the point where the overly-conservative test is incorrect.
    */
   if (zsa->base.depth_writemask) {
      rsc->lrz_direction = lrz.direction;
   }

   return lrz;
}

template <chip CHIP>
static struct fd_ringbuffer *
build_lrz(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd6_lrz_state lrz = compute_lrz_state(emit);

   /* If the LRZ state has not changed, we can skip the emit: */
   if (!ctx->last.dirty && (fd6_ctx->last.lrz.val == lrz.val))
      return NULL;

   fd6_ctx->last.lrz = lrz;

   unsigned ndwords = (CHIP >= A7XX) ? 10 : 8;
   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, ndwords * 4, FD_RINGBUFFER_STREAMING);

   if (CHIP >= A7XX) {
      OUT_REG(ring,
         A6XX_GRAS_LRZ_CNTL(
            .enable = lrz.enable,
            .lrz_write = lrz.write,
            .greater = lrz.direction == FD_LRZ_GREATER,
            .z_test_enable = lrz.test,
            .z_bounds_enable = lrz.z_bounds_enable,
         )
      );
      OUT_REG(ring,
         A7XX_GRAS_LRZ_CNTL2(
            .disable_on_wrong_dir = false,
            .fc_enable = false,
         )
      );
   } else {
      OUT_REG(ring,
         A6XX_GRAS_LRZ_CNTL(
            .enable = lrz.enable,
            .lrz_write = lrz.write,
            .greater = lrz.direction == FD_LRZ_GREATER,
            .fc_enable = false,
            .z_test_enable = lrz.test,
            .z_bounds_enable = lrz.z_bounds_enable,
            .disable_on_wrong_dir = false,
         )
      );
   }
   OUT_REG(ring, A6XX_RB_LRZ_CNTL(.enable = lrz.enable, ));

   OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));

   OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));

   return ring;
}

static struct fd_ringbuffer *
build_scissor(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_scissor_state *scissors = fd_context_get_scissor(ctx);
   unsigned num_viewports = emit->prog->num_viewports;

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      emit->ctx->batch->submit, (1 + (2 * num_viewports)) * 4, FD_RINGBUFFER_STREAMING);

   OUT_PKT4(ring, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), 2 * num_viewports);
   for (unsigned i = 0; i < num_viewports; i++) {
      OUT_RING(ring, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(scissors[i].minx) |
               A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(scissors[i].miny));
      OUT_RING(ring, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(scissors[i].maxx) |
               A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(scissors[i].maxy));
   }

   return ring;
}

/* Combination of FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD |
 * FD_DIRTY_PROG | FD_DIRTY_DUAL_BLEND
 */
static struct fd_ringbuffer *
build_prog_fb_rast(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_shader_variant *fs = emit->fs;

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);

   unsigned nr = pfb->nr_cbufs;

   if (ctx->rasterizer->rasterizer_discard)
      nr = 0;

   struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);

   if (blend->use_dual_src_blend)
      nr++;

   OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
   OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
                     COND(fs->writes_smask && pfb->samples > 1,
                          A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
                     COND(fs->writes_stencilref,
                          A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
                     COND(blend->use_dual_src_blend,
                          A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
   OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr));

   OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1);
   OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));

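   /* Each bound MRT contributes a 4-bit component-writemask nibble: */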
   unsigned mrt_components = 0;
   for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
      if (!pfb->cbufs[i])
         continue;
      mrt_components |= 0xf << (i * 4);
   }

   /* dual source blending has an extra fs output in the 2nd slot */
   if (blend->use_dual_src_blend)
      mrt_components |= 0xf << 4;

   mrt_components &= prog->mrt_components;

   OUT_REG(ring, A6XX_SP_FS_RENDER_COMPONENTS(.dword = mrt_components));
   OUT_REG(ring, A6XX_RB_RENDER_COMPONENTS(.dword = mrt_components));

   return ring;
}

static struct fd_ringbuffer *
build_blend_color(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_blend_color *bcolor = &ctx->blend_color;
   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING);

   OUT_REG(ring, A6XX_RB_BLEND_RED_F32(bcolor->color[0]),
           A6XX_RB_BLEND_GREEN_F32(bcolor->color[1]),
           A6XX_RB_BLEND_BLUE_F32(bcolor->color[2]),
           A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));

   return ring;
}

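/* Each entry of ctx->sample_locations[] packs one programmable sample
 * position as a pair of 4-bit fixed-point coords (1/16th pixel units),
 * X in the low nibble and Y in the high nibble.  Y is inverted, both
 * coords are clamped to 15/16 (0.9375), and the four samples are then
 * packed one byte each into the *_SAMPLE_LOCATION_0 registers.
 */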
static struct fd_ringbuffer *
build_sample_locations(struct fd6_emit *emit)
   assert_dt
{
   struct fd_context *ctx = emit->ctx;

   if (!ctx->sample_locations_enabled) {
      struct fd6_context *fd6_ctx = fd6_context(ctx);
      return fd_ringbuffer_ref(fd6_ctx->sample_locations_disable_stateobj);
   }

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);

   uint32_t sample_locations = 0;
   for (int i = 0; i < 4; i++) {
      float x = (ctx->sample_locations[i] & 0xf) / 16.0f;
      float y = (16 - (ctx->sample_locations[i] >> 4)) / 16.0f;

      x = CLAMP(x, 0.0f, 0.9375f);
      y = CLAMP(y, 0.0f, 0.9375f);

      sample_locations |=
         (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(x) |
          A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(y)) << i*8;
   }

   OUT_REG(ring, A6XX_GRAS_SAMPLE_CONFIG(.location_enable = true),
                 A6XX_GRAS_SAMPLE_LOCATION_0(.dword = sample_locations));

   OUT_REG(ring, A6XX_RB_SAMPLE_CONFIG(.location_enable = true),
                 A6XX_RB_SAMPLE_LOCATION_0(.dword = sample_locations));

   OUT_REG(ring, A6XX_SP_TP_SAMPLE_CONFIG(.location_enable = true),
                 A6XX_SP_TP_SAMPLE_LOCATION_0(.dword = sample_locations));

   return ring;
}

template <chip CHIP>
static void
fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_stream_output_info *info = prog->stream_output;
   struct fd_streamout_stateobj *so = &ctx->streamout;
   unsigned streamout_mask = 0;

   if (!info)
      return;

   for (unsigned i = 0; i < so->num_targets; i++) {
      struct fd_stream_output_target *target =
         fd_stream_output_target(so->targets[i]);

      if (!target)
         continue;

      target->stride = info->stride[i];

      OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE(i), 3);
      /* VPC_SO[i].BUFFER_BASE_LO: */
      OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0);
      OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset);

      struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo;

      if (so->reset & (1 << i)) {
         assert(so->offsets[i] == 0);

         OUT_PKT7(ring, CP_MEM_WRITE, 3);
         OUT_RELOC(ring, offset_bo, 0, 0, 0);
         OUT_RING(ring, target->base.buffer_offset);

         OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1);
         OUT_RING(ring, target->base.buffer_offset);
      } else {
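         /* Not a reset: reload the current offset, which the HW wrote
          * back to offset_bo after the previous draw (via the
          * VPC_SO_FLUSH_BASE setup below):
          */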
         OUT_PKT7(ring, CP_MEM_TO_REG, 3);
         OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
                           COND(CHIP == A6XX, CP_MEM_TO_REG_0_SHIFT_BY_2) |
                           CP_MEM_TO_REG_0_UNK31 |
                           CP_MEM_TO_REG_0_CNT(0));
         OUT_RELOC(ring, offset_bo, 0, 0, 0);
      }

      /* After a draw, the HW writes the new offset back to offset_bo: */
      OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
      OUT_RELOC(ring, offset_bo, 0, 0, 0);

      so->reset &= ~(1 << i);

      streamout_mask |= (1 << i);
   }

   if (streamout_mask) {
      fd6_state_add_group(&emit->state, prog->streamout_stateobj, FD6_GROUP_SO);
   } else if (ctx->last.streamout_mask != 0) {
      /* If we transition from a draw with streamout to one without, turn
       * off streamout.
       */
      fd6_state_add_group(&emit->state, fd6_context(ctx)->streamout_disable_stateobj,
                         FD6_GROUP_SO);
   }

   /* Make sure that any use of our TFB outputs (indirect draw source or shader
    * UBO reads) comes after the TFB output is written.  From the GL 4.6 core
    * spec:
    *
    *     "Buffers should not be bound or in use for both transform feedback and
    *      other purposes in the GL.  Specifically, if a buffer object is
    *      simultaneously bound to a transform feedback buffer binding point
    *      and elsewhere in the GL, any writes to or reads from the buffer
    *      generate undefined values."
    *
    * So we idle whenever SO buffers change.  Note that this function is called
    * on every draw with TFB enabled, so check the dirty flag for the buffers
    * themselves.
    */
   if (ctx->dirty & FD_DIRTY_STREAMOUT)
      OUT_WFI5(ring);

   ctx->last.streamout_mask = streamout_mask;
   emit->streamout_mask = streamout_mask;
}

/**
 * Stuff that less frequently changes and isn't (yet) moved into stategroups
 */
static void
fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   const enum fd_dirty_3d_state dirty = ctx->dirty;
   unsigned num_viewports = emit->prog->num_viewports;

   if (dirty & FD_DIRTY_STENCIL_REF) {
      struct pipe_stencil_ref *sr = &ctx->stencil_ref;

      OUT_PKT4(ring, REG_A6XX_RB_STENCILREF, 1);
      OUT_RING(ring, A6XX_RB_STENCILREF_REF(sr->ref_value[0]) |
                        A6XX_RB_STENCILREF_BFREF(sr->ref_value[1]));
   }

   if (dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_PROG)) {
      for (unsigned i = 0; i < num_viewports; i++) {
         struct pipe_scissor_state *scissor = &ctx->viewport_scissor[i];
         struct pipe_viewport_state *vp = &ctx->viewport[i];

         OUT_REG(ring, A6XX_GRAS_CL_VPORT_XOFFSET(i, vp->translate[0]),
                 A6XX_GRAS_CL_VPORT_XSCALE(i, vp->scale[0]),
                 A6XX_GRAS_CL_VPORT_YOFFSET(i, vp->translate[1]),
                 A6XX_GRAS_CL_VPORT_YSCALE(i, vp->scale[1]),
                 A6XX_GRAS_CL_VPORT_ZOFFSET(i, vp->translate[2]),
                 A6XX_GRAS_CL_VPORT_ZSCALE(i, vp->scale[2]));

         OUT_REG(
               ring,
               A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(i,
                                                .x = scissor->minx,
                                                .y = scissor->miny),
               A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(i,
                                                .x = scissor->maxx,
                                                .y = scissor->maxy));
      }

      OUT_REG(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ(.horz = ctx->guardband.x,
                                                    .vert = ctx->guardband.y));
   }

   /* The clamp ranges are only used when the rasterizer wants depth
    * clamping.
    */
   if ((dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) &&
       fd_depth_clamp_enabled(ctx)) {
      for (unsigned i = 0; i < num_viewports; i++) {
         struct pipe_viewport_state *vp = &ctx->viewport[i];
         float zmin, zmax;

         util_viewport_zmin_zmax(vp, ctx->rasterizer->clip_halfz,
                                 &zmin, &zmax);

         OUT_REG(ring, A6XX_GRAS_CL_Z_CLAMP_MIN(i, zmin),
                 A6XX_GRAS_CL_Z_CLAMP_MAX(i, zmax));

         /* TODO: what to do about this and multi viewport? */
         if (i == 0)
            OUT_REG(ring, A6XX_RB_Z_CLAMP_MIN(zmin), A6XX_RB_Z_CLAMP_MAX(zmax));
      }
   }
}

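/* Select the single-prim flush mode used for fb-fetch: when rendering to
 * GMEM, flushing per overlapping prim is only needed if coherent fb-fetch
 * was requested, while in sysmem (bypass) mode overlapping prims must
 * additionally flush on overwrite.
 */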
static struct fd_ringbuffer *
build_prim_mode(struct fd6_emit *emit, struct fd_context *ctx, bool gmem)
   assert_dt
{
   struct fd_ringbuffer *ring =
      fd_submit_new_ringbuffer(emit->ctx->batch->submit, 2 * 4, FD_RINGBUFFER_STREAMING);
   uint32_t prim_mode = NO_FLUSH;
   if (emit->fs->fs.uses_fbfetch_output) {
      if (gmem) {
         prim_mode = (ctx->blend->blend_coherent || emit->fs->fs.fbfetch_coherent)
            ? FLUSH_PER_OVERLAP : NO_FLUSH;
      } else {
         prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
      }
   } else {
      prim_mode = NO_FLUSH;
   }
   OUT_REG(ring, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2,
                                   .single_prim_mode = (enum a6xx_single_prim_mode)prim_mode));
   return ring;
}

template <chip CHIP, fd6_pipeline_type PIPELINE>
void
fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_shader_variant *fs = emit->fs;

   emit_marker6(ring, 5);

   /* Special case, we need to re-emit bindless FS state w/ the
    * fb-read state appended:
    */
   if ((emit->dirty_groups & BIT(FD6_GROUP_PROG)) && fs->fb_read) {
      ctx->batch->gmem_reason |= FD_GMEM_FB_READ;
      emit->dirty_groups |= BIT(FD6_GROUP_FS_BINDLESS);
   }

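   /* For each dirty state group, either reference a pre-baked stateobj
    * (fd6_state_add_group() holds a new reference) or hand off a freshly
    * built streaming ring (fd6_state_take_group() takes ownership):
    */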
   u_foreach_bit (b, emit->dirty_groups) {
      enum fd6_state_id group = (enum fd6_state_id)b;
      struct fd_ringbuffer *state = NULL;

      switch (group) {
      case FD6_GROUP_VTXSTATE:
         state = fd6_vertex_stateobj(ctx->vtx.vtx)->stateobj;
         fd6_state_add_group(&emit->state, state, FD6_GROUP_VTXSTATE);
         break;
      case FD6_GROUP_VBO:
         state = build_vbo_state(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_VBO);
         break;
      case FD6_GROUP_ZSA:
         state = fd6_zsa_state(
            ctx,
            util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])),
            fd_depth_clamp_enabled(ctx));
         fd6_state_add_group(&emit->state, state, FD6_GROUP_ZSA);
         break;
      case FD6_GROUP_LRZ:
         state = build_lrz<CHIP>(emit);
         if (state)
            fd6_state_take_group(&emit->state, state, FD6_GROUP_LRZ);
         break;
      case FD6_GROUP_SCISSOR:
         state = build_scissor(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_SCISSOR);
         break;
      case FD6_GROUP_PROG:
         fd6_state_add_group(&emit->state, prog->config_stateobj,
                             FD6_GROUP_PROG_CONFIG);
         fd6_state_add_group(&emit->state, prog->stateobj, FD6_GROUP_PROG);
         fd6_state_add_group(&emit->state, prog->binning_stateobj,
                             FD6_GROUP_PROG_BINNING);

         /* emit remaining streaming program state, ie. what depends on
          * other emit state, so cannot be pre-baked.
          */
         fd6_state_take_group(&emit->state, fd6_program_interp_state(emit),
                              FD6_GROUP_PROG_INTERP);
         break;
      case FD6_GROUP_RASTERIZER:
         state = fd6_rasterizer_state<CHIP>(ctx, emit->primitive_restart);
         fd6_state_add_group(&emit->state, state, FD6_GROUP_RASTERIZER);
         break;
      case FD6_GROUP_PROG_FB_RAST:
         state = build_prog_fb_rast(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_PROG_FB_RAST);
         break;
      case FD6_GROUP_BLEND:
         state = fd6_blend_variant<CHIP>(ctx->blend, pfb->samples, ctx->sample_mask)
                    ->stateobj;
         fd6_state_add_group(&emit->state, state, FD6_GROUP_BLEND);
         break;
      case FD6_GROUP_BLEND_COLOR:
         state = build_blend_color(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_BLEND_COLOR);
         break;
      case FD6_GROUP_SAMPLE_LOCATIONS:
         state = build_sample_locations(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_SAMPLE_LOCATIONS);
         break;
      case FD6_GROUP_VS_BINDLESS:
         state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_VERTEX, false);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_VS_BINDLESS);
         break;
      case FD6_GROUP_HS_BINDLESS:
         state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_TESS_CTRL, false);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_HS_BINDLESS);
         break;
      case FD6_GROUP_DS_BINDLESS:
         state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_TESS_EVAL, false);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_DS_BINDLESS);
         break;
      case FD6_GROUP_GS_BINDLESS:
         state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_GEOMETRY, false);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_GS_BINDLESS);
         break;
      case FD6_GROUP_FS_BINDLESS:
         state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_FRAGMENT, fs->fb_read);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_FS_BINDLESS);
         break;
      case FD6_GROUP_CONST:
         state = fd6_build_user_consts<PIPELINE>(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_CONST);
         break;
      case FD6_GROUP_DRIVER_PARAMS:
         state = fd6_build_driver_params<PIPELINE>(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_DRIVER_PARAMS);
         break;
      case FD6_GROUP_PRIMITIVE_PARAMS:
         if (PIPELINE == HAS_TESS_GS) {
            state = fd6_build_tess_consts(emit);
            fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIMITIVE_PARAMS);
         }
         break;
      case FD6_GROUP_VS_TEX:
         state = tex_state(ctx, PIPE_SHADER_VERTEX);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_VS_TEX);
         break;
      case FD6_GROUP_HS_TEX:
         state = tex_state(ctx, PIPE_SHADER_TESS_CTRL);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_HS_TEX);
         break;
      case FD6_GROUP_DS_TEX:
         state = tex_state(ctx, PIPE_SHADER_TESS_EVAL);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_DS_TEX);
         break;
      case FD6_GROUP_GS_TEX:
         state = tex_state(ctx, PIPE_SHADER_GEOMETRY);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_GS_TEX);
         break;
      case FD6_GROUP_FS_TEX:
         state = tex_state(ctx, PIPE_SHADER_FRAGMENT);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_FS_TEX);
         break;
      case FD6_GROUP_SO:
         fd6_emit_streamout<CHIP>(ring, emit);
         break;
      case FD6_GROUP_PRIM_MODE_SYSMEM:
         state = build_prim_mode(emit, ctx, false);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIM_MODE_SYSMEM);
         break;
      case FD6_GROUP_PRIM_MODE_GMEM:
         state = build_prim_mode(emit, ctx, true);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIM_MODE_GMEM);
         break;
      case FD6_GROUP_NON_GROUP:
         fd6_emit_non_ring(ring, emit);
         break;
      default:
         break;
      }
   }

   fd6_state_emit(&emit->state, ring);
}

template void fd6_emit_3d_state<A6XX, NO_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);
template void fd6_emit_3d_state<A7XX, NO_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);
template void fd6_emit_3d_state<A6XX, HAS_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);
template void fd6_emit_3d_state<A7XX, HAS_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);

template <chip CHIP>
void
fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                  struct fd6_compute_state *cs)
{
   struct fd6_state state = {};

   /* We want CP_SET_DRAW_STATE to execute immediately, otherwise we need to
    * emit consts as draw state groups (which otherwise has no benefit outside
    * of GMEM 3d using viz stream from binning pass).
    *
    * In particular, the PROG state group sets up the configuration for the
    * const state, so it must execute before we start loading consts, rather
    * than be deferred until CP_EXEC_CS.
    */
   OUT_PKT7(ring, CP_SET_MODE, 1);
   OUT_RING(ring, 1);

   uint32_t gen_dirty = ctx->gen_dirty &
         (BIT(FD6_GROUP_PROG) | BIT(FD6_GROUP_CS_TEX) | BIT(FD6_GROUP_CS_BINDLESS));

   u_foreach_bit (b, gen_dirty) {
      enum fd6_state_id group = (enum fd6_state_id)b;

      switch (group) {
      case FD6_GROUP_PROG:
         fd6_state_add_group(&state, cs->stateobj, FD6_GROUP_PROG);
         break;
      case FD6_GROUP_CS_TEX:
         fd6_state_take_group(
               &state,
               tex_state(ctx, PIPE_SHADER_COMPUTE),
               FD6_GROUP_CS_TEX);
         break;
      case FD6_GROUP_CS_BINDLESS:
         fd6_state_take_group(
               &state,
               fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_COMPUTE, false),
               FD6_GROUP_CS_BINDLESS);
         break;
      default:
         /* State-group unused for compute shaders */
         break;
      }
   }

   fd6_state_emit(&state, ring);
}
FD_GENX(fd6_emit_cs_state);

template <chip CHIP>
void
fd6_emit_ccu_cntl(struct fd_ringbuffer *ring, struct fd_screen *screen, bool gmem)
{
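   /* The CCU color/depth offsets are split into a low 21-bit field and a
    * separate hi field holding the remaining bits:
    */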
   const struct fd6_gmem_config *cfg = gmem ? &screen->config_gmem : &screen->config_sysmem;
   enum a6xx_ccu_cache_size color_cache_size = !gmem ? CCU_CACHE_SIZE_FULL :
      (enum a6xx_ccu_cache_size)(screen->info->a6xx.gmem_ccu_color_cache_fraction);
   uint32_t color_offset = cfg->color_ccu_offset & 0x1fffff;
   uint32_t color_offset_hi = cfg->color_ccu_offset >> 21;

   uint32_t depth_offset = cfg->depth_ccu_offset & 0x1fffff;
   uint32_t depth_offset_hi = cfg->depth_ccu_offset >> 21;

   if (CHIP == A7XX) {
      OUT_REG(ring,
         A7XX_RB_CCU_CNTL2(
            .depth_offset_hi = depth_offset_hi,
            .color_offset_hi = color_offset_hi,
            .depth_cache_size = CCU_CACHE_SIZE_FULL,
            .depth_offset = depth_offset,
            .color_cache_size = color_cache_size,
            .color_offset = color_offset,
         )
      );

      if (screen->info->a7xx.has_gmem_vpc_attr_buf) {
         OUT_REG(ring,
            A7XX_VPC_ATTR_BUF_SIZE_GMEM(.size_gmem = cfg->vpc_attr_buf_size),
            A7XX_VPC_ATTR_BUF_BASE_GMEM(.base_gmem = cfg->vpc_attr_buf_offset)
         );
         OUT_REG(ring,
            A7XX_PC_ATTR_BUF_SIZE_GMEM(.size_gmem = cfg->vpc_attr_buf_size)
         );
      }
   } else {
      OUT_REG(ring,
         RB_CCU_CNTL(
            CHIP,
            .gmem_fast_clear_disable =
               !screen->info->a6xx.has_gmem_fast_clear,
            .concurrent_resolve =
               screen->info->a6xx.concurrent_resolve,
            .depth_offset_hi = depth_offset_hi,
            .color_offset_hi = color_offset_hi,
            .depth_cache_size = CCU_CACHE_SIZE_FULL,
            .depth_offset = depth_offset,
            .color_cache_size = color_cache_size,
            .color_offset = color_offset,
         )
      );
   }
}
FD_GENX(fd6_emit_ccu_cntl);

template <chip CHIP>
static void
fd6_emit_stomp(struct fd_ringbuffer *ring, const uint16_t *regs, size_t count)
{
   for (size_t i = 0; i < count; i++) {
      if (fd_reg_stomp_allowed(CHIP, regs[i])) {
         WRITE(regs[i], 0xffffffff);
      }
   }
}

/* Emit setup at the beginning of a new cmdstream buffer (don't rely on
 * previous state, there could have been a context switch between ioctls):
 */
template <chip CHIP>
void
fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   struct fd_context *ctx = batch->ctx;
   struct fd_screen *screen = ctx->screen;

   if (!batch->nondraw) {
      trace_start_state_restore(&batch->trace, ring);
   }

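   /* In STOMP debug mode, scribble on all stompable registers up front to
    * flush out any reliance on state inherited from a previous submit:
    */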
   if (FD_DBG(STOMP)) {
      fd6_emit_stomp<CHIP>(ring, &RP_BLIT_REGS<CHIP>[0], ARRAY_SIZE(RP_BLIT_REGS<CHIP>));
      fd6_emit_stomp<CHIP>(ring, &CMD_REGS<CHIP>[0], ARRAY_SIZE(CMD_REGS<CHIP>));
   }

   OUT_PKT7(ring, CP_SET_MODE, 1);
   OUT_RING(ring, 0);

   if (CHIP == A6XX) {
      fd6_cache_inv<CHIP>(ctx, ring);
   } else {
      OUT_PKT7(ring, CP_THREAD_CONTROL, 1);
      OUT_RING(ring, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
                     CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);

      fd6_event_write<CHIP>(ctx, ring, FD_CCU_INVALIDATE_COLOR);
      fd6_event_write<CHIP>(ctx, ring, FD_CCU_INVALIDATE_DEPTH);

      OUT_PKT7(ring, CP_EVENT_WRITE, 1);
      OUT_RING(ring, UNK_40);

      fd6_event_write<CHIP>(ctx, ring, FD_CACHE_INVALIDATE);
      OUT_WFI5(ring);
   }

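   /* Invalidate all cached shader state, IBO/bindless descriptor state,
    * and shared consts for both graphics and compute:
    */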
   OUT_REG(ring,
      HLSQ_INVALIDATE_CMD(CHIP,
         .vs_state = true, .hs_state = true,
         .ds_state = true, .gs_state = true,
         .fs_state = true, .cs_state = true,
         .cs_ibo = true,   .gfx_ibo = true,
         .cs_shared_const = true,
         .gfx_shared_const = true,
         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,
      )
   );

   OUT_WFI5(ring);

   if (CHIP >= A7XX) {
      /* On A7XX, RB_CCU_CNTL was split into two registers: RB_CCU_CNTL
       * holds static properties that can be set once but requires a WFI
       * to take effect, while the newly introduced RB_CCU_CNTL2 holds
       * properties that may change per-RP and doesn't require a WFI,
       * only CCU inval/flush events.
       */
      OUT_REG(ring,
         RB_CCU_CNTL(
            CHIP,
            .gmem_fast_clear_disable = true, // !screen->info->a6xx.has_gmem_fast_clear,
            .concurrent_resolve = screen->info->a6xx.concurrent_resolve,
         )
      );
      OUT_WFI5(ring);
   }

   fd6_emit_ccu_cntl<CHIP>(ring, screen, false);

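   /* magic_raw[] is a zero-terminated list of raw (reg, value) pairs; the
    * TPL1_DBG_ECO_CNTL1 entry gets its TP_UBWC_FLAG_HINT bit patched from
    * the per-GPU info before being written:
    */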
   for (size_t i = 0; i < ARRAY_SIZE(screen->info->a6xx.magic_raw); i++) {
      auto magic_reg = screen->info->a6xx.magic_raw[i];
      if (!magic_reg.reg)
         break;

      uint32_t value = magic_reg.value;
      switch (magic_reg.reg) {
         case REG_A6XX_TPL1_DBG_ECO_CNTL1:
            value = (value & ~A6XX_TPL1_DBG_ECO_CNTL1_TP_UBWC_FLAG_HINT) |
                    (screen->info->a7xx.enable_tp_ubwc_flag_hint
                        ? A6XX_TPL1_DBG_ECO_CNTL1_TP_UBWC_FLAG_HINT
                        : 0);
            break;
      }

      WRITE(magic_reg.reg, value);
   }

   WRITE(REG_A6XX_RB_DBG_ECO_CNTL, screen->info->a6xx.magic.RB_DBG_ECO_CNTL);
   WRITE(REG_A6XX_SP_FLOAT_CNTL, A6XX_SP_FLOAT_CNTL_F16_NO_INF);
   WRITE(REG_A6XX_SP_DBG_ECO_CNTL, screen->info->a6xx.magic.SP_DBG_ECO_CNTL);
   WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
   if (CHIP == A6XX)
      WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
   WRITE(REG_A6XX_TPL1_DBG_ECO_CNTL, screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
   if (CHIP == A6XX) {
      WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
      WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
   }

   WRITE(REG_A6XX_VPC_DBG_ECO_CNTL, screen->info->a6xx.magic.VPC_DBG_ECO_CNTL);
   WRITE(REG_A6XX_GRAS_DBG_ECO_CNTL, screen->info->a6xx.magic.GRAS_DBG_ECO_CNTL);
   if (CHIP == A6XX)
      WRITE(REG_A6XX_HLSQ_DBG_ECO_CNTL, screen->info->a6xx.magic.HLSQ_DBG_ECO_CNTL);
   WRITE(REG_A6XX_SP_CHICKEN_BITS, screen->info->a6xx.magic.SP_CHICKEN_BITS);
   WRITE(REG_A6XX_SP_IBO_COUNT, 0);
   WRITE(REG_A6XX_SP_UNKNOWN_B182, 0);
   if (CHIP == A6XX)
      WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0);
   WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, screen->info->a6xx.magic.UCHE_UNKNOWN_0E12);
   WRITE(REG_A6XX_UCHE_CLIENT_PF, screen->info->a6xx.magic.UCHE_CLIENT_PF);
   WRITE(REG_A6XX_RB_UNKNOWN_8E01, screen->info->a6xx.magic.RB_UNKNOWN_8E01);
   WRITE(REG_A6XX_SP_UNKNOWN_A9A8, 0);
   OUT_REG(ring,
      A6XX_SP_MODE_CONTROL(
         .constant_demotion_enable = true,
         .isammode = ISAMMODE_GL,
         .shared_consts_enable = false,
      )
   );
   WRITE(REG_A6XX_SP_MODE_CONTROL,
         A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
   WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
   WRITE(REG_A6XX_VPC_UNKNOWN_9107, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
   WRITE(REG_A6XX_PC_MODE_CNTL, screen->info->a6xx.magic.PC_MODE_CNTL);

   WRITE(REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 0);
   WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0);
   WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2);

   WRITE(REG_A6XX_RB_UNKNOWN_8818, 0);

   if (CHIP == A6XX) {
      WRITE(REG_A6XX_RB_UNKNOWN_8819, 0);
      WRITE(REG_A6XX_RB_UNKNOWN_881A, 0);
      WRITE(REG_A6XX_RB_UNKNOWN_881B, 0);
      WRITE(REG_A6XX_RB_UNKNOWN_881C, 0);
      WRITE(REG_A6XX_RB_UNKNOWN_881D, 0);
      WRITE(REG_A6XX_RB_UNKNOWN_881E, 0);
   }

   WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0);

   WRITE(REG_A6XX_VPC_POINT_COORD_INVERT, A6XX_VPC_POINT_COORD_INVERT(0).value);
   WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0);

   WRITE(REG_A6XX_VPC_SO_DISABLE, A6XX_VPC_SO_DISABLE(true).value);

   OUT_REG(ring, PC_RASTER_CNTL(CHIP));

   WRITE(REG_A6XX_PC_MULTIVIEW_CNTL, 0);

   WRITE(REG_A6XX_SP_UNKNOWN_B183, 0);

   WRITE(REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
   WRITE(REG_A6XX_GRAS_VS_LAYER_CNTL, 0);
   WRITE(REG_A6XX_GRAS_SC_CNTL, A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
   WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0);
   if (CHIP == A6XX) {
      WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0);
      WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0);
   }
   WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0);
   WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0);
   /* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_MODE_CNTL
    * but this seems to kill texture gather offsets.
    */
   WRITE(REG_A6XX_SP_TP_MODE_CNTL, 0xa0 |
         A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));

   OUT_REG(ring, HLSQ_CONTROL_5_REG(
         CHIP,
         .linelengthregid = INVALID_REG,
         .foveationqualityregid = INVALID_REG,
   ));

   emit_marker6(ring, 7);

   OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1);
   OUT_RING(ring, 0x00000000); /* VFD_MODE_CNTL */

   WRITE(REG_A6XX_VFD_MULTIVIEW_CNTL, 0);

   /* Clear any potential pending state groups to be safe: */
   OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
   OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
                     CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                     CP_SET_DRAW_STATE__0_GROUP_ID(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_CNTL, 1);
   OUT_RING(ring, 0x00000000); /* VPC_SO_STREAM_CNTL */

   if (CHIP >= A7XX) {
      OUT_REG(ring, A6XX_GRAS_LRZ_CNTL());
      OUT_REG(ring, A7XX_GRAS_LRZ_CNTL2());
   } else {
      OUT_REG(ring, A6XX_GRAS_LRZ_CNTL());
   }

   OUT_REG(ring, A6XX_RB_LRZ_CNTL());
   OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL());
   OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());

   OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
   OUT_RING(ring, 0x00000000);

   OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1);
   OUT_RING(ring, 0x00000000);

   /* Initialize VFD_FETCH[n].SIZE to zero to avoid iova faults trying
    * to fetch from a VFD_FETCH[n].BASE which we've potentially inherited
    * from another process:
    */
   for (int32_t i = 0; i < 32; i++) {
      OUT_PKT4(ring, REG_A6XX_VFD_FETCH_SIZE(i), 1);
      OUT_RING(ring, 0);
   }

   /* This happens after all drawing has been emitted to the draw CS, so we know
    * whether we need the tess BO pointers.
    */
   if (batch->tessellation) {
      assert(screen->tess_bo);
      fd_ringbuffer_attach_bo(ring, screen->tess_bo);
      OUT_REG(ring, PC_TESSFACTOR_ADDR(CHIP, screen->tess_bo));
      /* Updating PC_TESSFACTOR_ADDR could race with the next draw which uses it. */
      OUT_WFI5(ring);
   }

   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd_bo *bcolor_mem = fd6_ctx->bcolor_mem;

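   /* Point both border color base address registers at the context's
    * border color buffer:
    */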
   OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR, 2);
   OUT_RELOC(ring, bcolor_mem, 0, 0, 0);

   OUT_PKT4(ring, REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, 2);
   OUT_RELOC(ring, bcolor_mem, 0, 0, 0);

   /* These regs are blocked (CP_PROTECT) on a6xx: */
   if (CHIP >= A7XX) {
      OUT_REG(ring,
         TPL1_BICUBIC_WEIGHTS_TABLE_0(CHIP, 0),
         TPL1_BICUBIC_WEIGHTS_TABLE_1(CHIP, 0x3fe05ff4),
         TPL1_BICUBIC_WEIGHTS_TABLE_2(CHIP, 0x3fa0ebee),
         TPL1_BICUBIC_WEIGHTS_TABLE_3(CHIP, 0x3f5193ed),
         TPL1_BICUBIC_WEIGHTS_TABLE_4(CHIP, 0x3f0243f0),
      );
   }

   if (CHIP >= A7XX) {
      /* Blob sets these two per draw. */
      OUT_REG(ring, A7XX_PC_TESS_PARAM_SIZE(FD6_TESS_PARAM_SIZE));
      /* Blob adds a bit more space ({0x10, 0x20, 0x30, 0x40} bytes)
       * but the meaning of this additional space is not known,
       * so we play safe and don't add it.
       */
      OUT_REG(ring, A7XX_PC_TESS_FACTOR_SIZE(FD6_TESS_FACTOR_SIZE));
   }

   /* There is an optimization to skip executing draw states for draws with no
    * instances. Instead of simply skipping the draw, internally the firmware
    * sets a bit in PC_DRAW_INITIATOR that seemingly skips the draw. However
    * there is a hardware bug where this bit does not always cause the FS
    * early preamble to be skipped. Because the draw states were skipped,
    * SP_FS_CTRL_REG0, SP_FS_OBJ_START and so on are never updated and a
    * random FS preamble from the last draw is executed. If the last visible
    * draw is from the same submit, it shouldn't be a problem because we just
    * re-execute the same preamble and preambles don't have side effects, but
    * if it's from another process then we could execute a garbage preamble
    * leading to hangs and faults. To make sure this doesn't happen, we reset
    * SP_FS_CTRL_REG0 here, making sure that the EARLYPREAMBLE bit isn't set
    * so any leftover early preamble doesn't get executed. Other stages don't
    * seem to be affected.
    */
   if (screen->info->a6xx.has_early_preamble) {
      WRITE(REG_A6XX_SP_FS_CTRL_REG0, 0);
   }

   if (!batch->nondraw) {
      trace_end_state_restore(&batch->trace, ring);
   }
}
FD_GENX(fd6_emit_restore);

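/* Simple dword-at-a-time copy between buffer objects using a series of
 * CP_MEM_TO_MEM packets (src/dst offsets advance 4 bytes per dword):
 */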
static void
fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst,
               unsigned dst_off, struct pipe_resource *src, unsigned src_off,
               unsigned sizedwords)
{
   struct fd_bo *src_bo = fd_resource(src)->bo;
   struct fd_bo *dst_bo = fd_resource(dst)->bo;
   unsigned i;

   fd_ringbuffer_attach_bo(ring, dst_bo);
   fd_ringbuffer_attach_bo(ring, src_bo);

   for (i = 0; i < sizedwords; i++) {
      OUT_PKT7(ring, CP_MEM_TO_MEM, 5);
      OUT_RING(ring, 0x00000000);
      OUT_RELOC(ring, dst_bo, dst_off, 0, 0);
      OUT_RELOC(ring, src_bo, src_off, 0, 0);

      dst_off += 4;
      src_off += 4;
   }
}

void
fd6_emit_init_screen(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);
   screen->mem_to_mem = fd6_mem_to_mem;
}