/*
 * Copyright © 2016 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#ifndef FREEDRENO_BATCH_H_
#define FREEDRENO_BATCH_H_

#include "util/list.h"
#include "util/simple_mtx.h"
#include "util/u_inlines.h"
#include "util/u_queue.h"
#include "util/perf/u_trace.h"

#include "freedreno_context.h"
#include "freedreno_fence.h"
#include "freedreno_util.h"

#ifdef __cplusplus
extern "C" {
#endif

struct fd_resource;
struct fd_batch_key;
struct fd_batch_result;

/**
 * A subpass is a fragment of a batch potentially starting with a clear.
 * If the app does a mid-batch clear, that clear and subsequent draws
 * can be split out into another sub-pass.  At gmem time, the appropriate
 * sysmem or gmem clears can be interleaved with the CP_INDIRECT_BUFFER
 * to the subpass's draw cmdstream.
 *
 * For depth clears, a replacement LRZ buffer can be allocated (clear
 * still inserted into the prologue cmdstream, since it needs to be executed
 * even in sysmem or when we aren't binning, as later batches could
 * depend on the LRZ state).  The alternative would be to invalidate
 * LRZ for draws after the start of the new subpass.
 */
struct fd_batch_subpass {
   struct list_head node;

   /** draw pass cmdstream: */
   struct fd_ringbuffer *draw;

   /** for the gmem code to stash per tile per subpass clears */
   struct fd_ringbuffer *subpass_clears;

   BITMASK_ENUM(fd_buffer_mask) fast_cleared;

   union pipe_color_union clear_color[MAX_RENDER_TARGETS];
   double clear_depth;
   unsigned clear_stencil;

   /**
    * The number of draws emitted to this subpass.  If it is greater than
    * zero, a clear triggers creating a new subpass (because clears must
    * always come at the start of a subpass).
    */
   unsigned num_draws;

   /**
    * If a subpass starts with an LRZ clear, it gets a new LRZ buffer.
    * The fd_resource::lrz always tracks the current lrz buffer, but at
    * binning/gmem time we need to know which lrz buffer was current at
    * the time draws were emitted to the subpass, which is tracked here.
    */
   struct fd_bo *lrz;
};
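
/*
 * Illustrative sketch (an assumed flow, not lifted from the driver code) of
 * the rule described above: a clear arriving after draws have already been
 * emitted starts a new subpass, which then records the clear.  'buffers'
 * stands for the fd_buffer_mask bits being cleared:
 *
 *    if (batch->subpass->num_draws > 0)
 *       batch->subpass = fd_batch_create_subpass(batch);
 *    batch->subpass->fast_cleared |= buffers;
 */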

/**
 * A batch tracks everything about a cmdstream batch/submit, including the
 * ringbuffers used for binning, draw, and gmem cmds, list of associated
 * fd_resource-s, etc.
 */
struct fd_batch {
   struct pipe_reference reference;
   unsigned seqno;
   unsigned idx; /* index into cache->batches[] */

   struct u_trace trace;

   /* To detect cases where we can skip cmdstream to record timestamp: */
   uint32_t *last_timestamp_cmd;

   int in_fence_fd;
   struct pipe_fence_handle *fence;

   struct fd_context *ctx;

   /* Update seqno of the most recent draw/etc emitted to the batch. */
   uint32_t update_seqno;

   /* Do we need to mem2gmem before rendering?  We don't if, for example,
    * there was a glClear() that invalidated the entire previous buffer
    * contents.  Keep track of which buffer(s) are cleared, or need
    * restore.  Masks of PIPE_CLEAR_*.
    *
    * The 'cleared' bits will be set for buffers which are *entirely*
    * cleared.
    *
    * The 'invalidated' bits are set for cleared buffers, and buffers
    * where the contents are undefined, ie. what we don't need to restore
    * to gmem.
    */
   BITMASK_ENUM(fd_buffer_mask) invalidated, cleared, restore, resolve;

   /* is this a non-draw batch (ie compute/blit which has no pfb state)? */
   bool nondraw : 1;
   bool needs_flush : 1;
   bool flushed : 1;
   bool tessellation : 1; /* tessellation used in batch */

   /* Keep track of whether WAIT_FOR_IDLE is needed for registers we need
    * to update via RMW:
    */
   bool needs_wfi : 1;

   /* To decide whether to render to system memory, keep track of the
    * number of draws, and whether any of them require multisample,
    * depth_test (or depth write), stencil_test, blending, and
    * color_logic_op (since those functions are disabled when bypassing
    * GMEM).
    */
   BITMASK_ENUM(fd_gmem_reason) gmem_reason;

   /* At submit time, once we've decided that this batch will use GMEM
    * rendering, the appropriate gmem state is looked up:
    */
   const struct fd_gmem_stateobj *gmem_state;

   /* Driver specific barrier/flush flags: */
   unsigned barrier;

   /* A calculated "draw cost" value for the batch, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *      cost += num_mrt;
    *      if (blend_enabled)
    *        cost += num_mrt;
    *      if (depth_test_enabled)
    *        cost++;
    *      if (depth_write_enabled)
    *        cost++;
    *    }
    *
    * The idea is that each sample-passed minimally does one write
    * per MRT.  If blend is enabled, the hw will additionally do
    * a framebuffer read per sample-passed (for each MRT with blend
    * enabled).  If depth-test is enabled, the hw will additionally
    * do a depth buffer read.  If depth-write is enabled, the hw will
    * additionally do a depth buffer write.
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details.  But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   unsigned cost;

   /* Tells the gen specific backend where to write stats used for
    * the autotune module.
    *
    * Pointer only valid during gmem emit code.
    */
   struct fd_batch_result *autotune_result;

   unsigned num_draws;    /* number of draws in current batch */
   unsigned num_vertices; /* number of vertices in current batch */

   /* Currently only used on a6xx, to calculate vsc prim/draw stream
    * sizes:
    */
   unsigned num_bins_per_pipe;
   unsigned prim_strm_bits;
   unsigned draw_strm_bits;

   /* Track the maximal bounds of the scissor of all the draws within a
    * batch.  Used at the tile rendering step (fd_gmem_render_tiles(),
    * mem2gmem/gmem2mem) to avoid needlessly moving data in/out of gmem.
    *
    * Note that unlike gallium state, maxx/maxy are inclusive (for
    * fully covered 512x512 the scissor would be 0,0+511,511)
    */
   struct pipe_scissor_state max_scissor;

   /* Keep track of DRAW initiators that need to be patched up depending
    * on whether we are using binning or not:
    */
   struct util_dynarray draw_patches;

   /* texture state that needs patching for fb_read: */
   struct util_dynarray fb_read_patches;

   /* Keep track of writes to RB_RENDER_CONTROL which need to be patched
    * once we know whether or not to use GMEM, and GMEM tile pitch.
    *
    * (only for a3xx.. but having gen specific subclasses of fd_batch
    * seemed overkill for now)
    */
   struct util_dynarray rbrc_patches;

   /* Keep track of GMEM related values that need to be patched up once we
    * know the gmem layout:
    */
   struct util_dynarray gmem_patches;

   /* Keep track of pointers to the start of MEM exports for a20x binning
    * shaders.  This is so the end of the shader can be cut off at the
    * right point, depending on the GMEM configuration.
    */
   struct util_dynarray shader_patches;
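
   /* Illustrative sketch (an assumed pattern, not copied from the driver) of
    * how these patch lists are typically consumed at flush time, once the
    * binning/GMEM decisions are known.  It assumes the fd_cs_patch {cs, val}
    * helper from freedreno_util.h; compute_patched_value() is hypothetical:
    *
    *    util_dynarray_foreach (&batch->draw_patches, struct fd_cs_patch, p)
    *       *p->cs = compute_patched_value(p->val);
    */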

   struct pipe_framebuffer_state framebuffer;

   struct fd_submit *submit;

   /**
    * List of fd_batch_subpass.
    */
   struct list_head subpasses;

#define foreach_subpass(subpass, batch) \
   list_for_each_entry (struct fd_batch_subpass, subpass, &batch->subpasses, node)
#define foreach_subpass_safe(subpass, batch) \
   list_for_each_entry_safe (struct fd_batch_subpass, subpass, &batch->subpasses, node)
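
   /* Illustrative use (a sketch of how the gmem code might walk subpasses,
    * per the fd_batch_subpass description above; emit_clears() and emit_ib()
    * are hypothetical stand-ins for gen specific helpers):
    *
    *    foreach_subpass (subpass, batch) {
    *       if (subpass->fast_cleared)
    *          emit_clears(tile, subpass->subpass_clears);
    *       emit_ib(batch->gmem, subpass->draw);
    *    }
    */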

   /**
    * The current subpass.
    */
   struct fd_batch_subpass *subpass;

   /**
    * just a reference to the current subpass's draw cmds for backwards compat.
    */
   struct fd_ringbuffer *draw;
   /** binning pass cmdstream: */
   struct fd_ringbuffer *binning;
   /** tiling/gmem (IB0) cmdstream: */
   struct fd_ringbuffer *gmem;

   /** preamble cmdstream (executed once before first tile): */
   struct fd_ringbuffer *prologue;

   /** epilogue cmdstream (executed after each tile): */
   struct fd_ringbuffer *tile_epilogue;

   /** epilogue cmdstream (executed after all tiles): */
   struct fd_ringbuffer *epilogue;

   struct fd_ringbuffer *tile_loads;
   struct fd_ringbuffer *tile_store;

   /**
    * hw query related state:
    */
   /*@{*/
   /* next sample offset.. incremented for each sample in the batch/
    * submit, reset to zero on next submit.
    */
   uint32_t next_sample_offset;

   /* The # of pipeline-stats queries running.  In case of nested
    * queries using {START/STOP}_{PRIMITIVE,FRAGMENT,COMPUTE}_CNTRS,
    * we need to start only on the first one and stop only on the
    * last one.
    */
   uint8_t pipeline_stats_queries_active[3];
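
   /* Illustrative sketch (an assumed pattern, not copied from the driver) of
    * the start-on-first / stop-on-last rule described above.  'idx' selects
    * primitive/fragment/compute, and emit_start_cntrs()/emit_stop_cntrs()
    * are hypothetical helpers:
    *
    *    if (batch->pipeline_stats_queries_active[idx]++ == 0)
    *       emit_start_cntrs(batch);
    *    ...
    *    if (--batch->pipeline_stats_queries_active[idx] == 0)
    *       emit_stop_cntrs(batch);
    */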

   /* cached samples (in case multiple queries need to reference
    * the same sample snapshot)
    */
   struct fd_hw_sample *sample_cache[MAX_HW_SAMPLE_PROVIDERS];

   /* which sample providers were used in the current batch: */
   uint32_t query_providers_used;

   /* which sample providers are currently enabled in the batch: */
   uint32_t query_providers_active;

   /* list of samples in current batch: */
   struct util_dynarray samples;

   /* current query result bo and tile stride: */
   struct pipe_resource *query_buf;
   uint32_t query_tile_stride;
   /*@}*/

   /* Set of resources used by currently-unsubmitted batch (read or
    * write).. does not hold a reference to the resource.
    */
   struct set *resources;

   /** key in batch-cache (if not null): */
   struct fd_batch_key *key;
   uint32_t hash;

   /** set of dependent batches.. holds refs to dependent batches: */
   uint32_t dependents_mask;
};

struct fd_batch *fd_batch_create(struct fd_context *ctx, bool nondraw);

struct fd_batch_subpass *fd_batch_create_subpass(struct fd_batch *batch) assert_dt;

void fd_batch_set_fb(struct fd_batch *batch, const struct pipe_framebuffer_state *pfb) assert_dt;

void fd_batch_flush(struct fd_batch *batch) assert_dt;
bool fd_batch_has_dep(struct fd_batch *batch, struct fd_batch *dep) assert_dt;
void fd_batch_add_dep(struct fd_batch *batch, struct fd_batch *dep) assert_dt;
void fd_batch_resource_write(struct fd_batch *batch,
                             struct fd_resource *rsc) assert_dt;
void fd_batch_resource_read_slowpath(struct fd_batch *batch,
                                     struct fd_resource *rsc) assert_dt;
void fd_batch_check_size(struct fd_batch *batch) assert_dt;

uint32_t fd_batch_key_hash(const void *_key);
bool fd_batch_key_equals(const void *_a, const void *_b);
struct fd_batch_key *fd_batch_key_clone(void *mem_ctx,
                                        const struct fd_batch_key *key);

/* not called directly: */
void __fd_batch_describe(char *buf, const struct fd_batch *batch) assert_dt;
void __fd_batch_destroy_locked(struct fd_batch *batch);
void __fd_batch_destroy(struct fd_batch *batch);

/*
 * NOTE the rule is, you need to hold the screen->lock when destroying
 * a batch..  so either use fd_batch_reference() (which grabs the lock
 * for you) if you don't hold the lock, or fd_batch_reference_locked()
 * if you do hold the lock.
 *
 * WARNING the _locked() version can briefly drop the lock.  Without
 * recursive mutexes, I'm not sure there is much else we can do (since
 * __fd_batch_destroy() needs to unref resources).
 *
 * WARNING you must acquire the screen->lock and use the _locked()
 * version if the batch being ref'd can disappear under you.
 */

static inline void
fd_batch_reference_locked(struct fd_batch **ptr, struct fd_batch *batch)
{
   struct fd_batch *old_batch = *ptr;

   /* only need lock if a reference is dropped: */
   if (old_batch)
      fd_screen_assert_locked(old_batch->ctx->screen);

   if (pipe_reference_described(
          &(*ptr)->reference, &batch->reference,
          (debug_reference_descriptor)__fd_batch_describe))
      __fd_batch_destroy_locked(old_batch);

   *ptr = batch;
}

static inline void
fd_batch_reference(struct fd_batch **ptr, struct fd_batch *batch)
{
   struct fd_batch *old_batch = *ptr;

   if (pipe_reference_described(
          &(*ptr)->reference, &batch->reference,
          (debug_reference_descriptor)__fd_batch_describe))
      __fd_batch_destroy(old_batch);

   *ptr = batch;
}
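
/*
 * Example usage (illustrative): taking and then dropping a reference when
 * screen->lock is not held.  When the rules above require it, hold the lock
 * and use fd_batch_reference_locked() instead:
 *
 *    struct fd_batch *batch = NULL;
 *    fd_batch_reference(&batch, ctx->batch);   // take a reference
 *    ...
 *    fd_batch_reference(&batch, NULL);         // drop it (may destroy)
 */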

/**
 * Mark the batch as having something worth flushing (rendering, blit, query,
 * etc)
 */
static inline void
fd_batch_needs_flush(struct fd_batch *batch)
{
   batch->needs_flush = true;
   batch->update_seqno = ++batch->ctx->update_count;
   fd_pipe_fence_ref(&batch->ctx->last_fence, NULL);
}

/* Since we reorder batches and can pause/resume queries (notably for disabling
 * queries during some meta operations), we update the current query state for
 * the batch before each draw.
 */
static inline void
fd_batch_update_queries(struct fd_batch *batch) assert_dt
{
   struct fd_context *ctx = batch->ctx;

   if (!(ctx->dirty & FD_DIRTY_QUERY))
      return;

   ctx->query_update_batch(batch, false);
}

static inline void
fd_batch_finish_queries(struct fd_batch *batch) assert_dt
{
   struct fd_context *ctx = batch->ctx;

   ctx->query_update_batch(batch, true);
}

static inline void
fd_reset_wfi(struct fd_batch *batch)
{
   batch->needs_wfi = true;
}

void fd_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt;

/* emit a CP_EVENT_WRITE:
 * (a6xx+ cannot use this, use fd6_event_write<chip>.)
 */
#ifndef __cplusplus
static inline void
fd_event_write(struct fd_batch *batch, struct fd_ringbuffer *ring,
               enum vgt_event_type evt)
{
   OUT_PKT3(ring, CP_EVENT_WRITE, 1);
   OUT_RING(ring, evt);
   fd_reset_wfi(batch);
}
#endif
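
/*
 * Example usage (illustrative, pre-a6xx only; HLSQ_FLUSH is just one
 * plausible vgt_event_type value to emit):
 *
 *    fd_event_write(batch, batch->draw, HLSQ_FLUSH);
 */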

/* Get per-tile epilogue */
static inline struct fd_ringbuffer *
fd_batch_get_tile_epilogue(struct fd_batch *batch)
{
   if (batch->tile_epilogue == NULL) {
      batch->tile_epilogue = fd_submit_new_ringbuffer(batch->submit, 0x1000,
                                                      FD_RINGBUFFER_GROWABLE);
   }

   return batch->tile_epilogue;
}

/* Get epilogue run after all tiles */
static inline struct fd_ringbuffer *
fd_batch_get_epilogue(struct fd_batch *batch)
{
   if (batch->epilogue == NULL) {
      batch->epilogue = fd_submit_new_ringbuffer(batch->submit, 0x1000,
                                                 FD_RINGBUFFER_GROWABLE);
   }

   return batch->epilogue;
}

struct fd_ringbuffer *fd_batch_get_prologue(struct fd_batch *batch);

#ifdef __cplusplus
}
#endif

#endif /* FREEDRENO_BATCH_H_ */