/*
 * Copyright © 2016 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#ifndef FREEDRENO_BATCH_H_
#define FREEDRENO_BATCH_H_

#include "util/list.h"
#include "util/simple_mtx.h"
#include "util/u_inlines.h"
#include "util/u_queue.h"
#include "util/perf/u_trace.h"

#include "freedreno_context.h"
#include "freedreno_fence.h"
#include "freedreno_util.h"

#ifdef __cplusplus
extern "C" {
#endif

struct fd_resource;
struct fd_batch_key;
struct fd_batch_result;

/**
 * A subpass is a fragment of a batch potentially starting with a clear.
 * If the app does a mid-batch clear, that clear and subsequent draws
 * can be split out into another sub-pass.  At gmem time, the appropriate
 * sysmem or gmem clears can be interleaved with the CP_INDIRECT_BUFFER
 * to the subpass's draw cmdstream.
 *
 * For depth clears, a replacement LRZ buffer can be allocated (the clear
 * is still inserted into the prologue cmdstream since it needs to be
 * executed even in sysmem or if we aren't binning, since later batches
 * could depend on the LRZ state).  The alternative would be to invalidate
 * LRZ for draws after the start of the new subpass.
 */
struct fd_batch_subpass {
   struct list_head node;

   /** draw pass cmdstream: */
   struct fd_ringbuffer *draw;

   /** for the gmem code to stash per tile per subpass clears */
   struct fd_ringbuffer *subpass_clears;

   BITMASK_ENUM(fd_buffer_mask) fast_cleared;

   union pipe_color_union clear_color[MAX_RENDER_TARGETS];
   double clear_depth;
   unsigned clear_stencil;

   /**
    * The number of draws emitted to this subpass.  If it is greater than
    * zero, a clear triggers creating a new subpass (because clears must
    * always come at the start of a subpass).
    */
   unsigned num_draws;

   /**
    * If a subpass starts with an LRZ clear, it gets a new LRZ buffer.
    * The fd_resource::lrz always tracks the current lrz buffer, but at
    * binning/gmem time we need to know which lrz buffer was current at
    * the time draws were emitted to the subpass.  That is what is
    * tracked here.
    */
   struct fd_bo *lrz;
};
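
/* Illustrative sketch (not the actual clear path): when a clear arrives
 * after draws have already been emitted, it cannot go into the current
 * subpass (clears must come at the start of a subpass), so a new one is
 * started, roughly:
 *
 *    if (batch->subpass->num_draws > 0)
 *       fd_batch_create_subpass(batch);       // assumed to update batch->subpass
 *    batch->subpass->fast_cleared |= buffers;  // 'buffers' mask from the clear
 *
 * See the clear handling in the draw code for the real logic.
 */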

/**
 * A batch tracks everything about a cmdstream batch/submit, including the
 * ringbuffers used for binning, draw, and gmem cmds, list of associated
 * fd_resource-s, etc.
 */
struct fd_batch {
   struct pipe_reference reference;
   unsigned seqno;
   unsigned idx; /* index into cache->batches[] */

   struct u_trace trace;

   /* To detect cases where we can skip emitting cmdstream to record a timestamp: */
   uint32_t *last_timestamp_cmd;

   int in_fence_fd;
   struct pipe_fence_handle *fence;

   struct fd_context *ctx;

   /* update seqno of the most recent draw/etc added to the batch. */
   uint32_t update_seqno;

   /* Do we need to mem2gmem before rendering?  We don't if, for example,
    * there was a glClear() that invalidated the entire previous buffer
    * contents.  Keep track of which buffer(s) are cleared, or need
    * restore.  Masks of PIPE_CLEAR_*
    *
    * The 'cleared' bits will be set for buffers which are *entirely*
    * cleared.
    *
    * The 'invalidated' bits are set for cleared buffers, and buffers
    * where the contents are undefined, ie. what we don't need to restore
    * to gmem.
    */
   BITMASK_ENUM(fd_buffer_mask) invalidated, cleared, restore, resolve;
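
   /* Illustrative sketch (not the actual gmem code): per the semantics
    * above, restore/no-restore decisions boil down to checks like:
    *
    *    // FD_BUFFER_DEPTH used only as an example fd_buffer_mask bit
    *    bool needs_mem2gmem = !!(batch->restore & FD_BUFFER_DEPTH);
    *    bool skip_restore   = !!(batch->invalidated & FD_BUFFER_DEPTH);
    */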

   /* is this a non-draw batch (ie compute/blit which has no pfb state)? */
   bool nondraw : 1;
   bool needs_flush : 1;
   bool flushed : 1;
   bool tessellation : 1; /* tessellation used in batch */

   /* Keep track of whether WAIT_FOR_IDLE is needed for registers we need
    * to update via RMW:
    */
   bool needs_wfi : 1;

   /* To decide whether to render to system memory, keep track of the
    * number of draws, and whether any of them require multisample,
    * depth_test (or depth write), stencil_test, blending, and
    * color_logic_op (since those features are disabled when
    * bypassing GMEM).
    */
   BITMASK_ENUM(fd_gmem_reason) gmem_reason;

   /* At submit time, once we've decided that this batch will use GMEM
    * rendering, the appropriate gmem state is looked up:
    */
   const struct fd_gmem_stateobj *gmem_state;

   /* Driver specific barrier/flush flags: */
   unsigned barrier;

   /* A calculated "draw cost" value for the batch, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *       cost += num_mrt;
    *       if (blend_enabled)
    *          cost += num_mrt;
    *       if (depth_test_enabled)
    *          cost++;
    *       if (depth_write_enabled)
    *          cost++;
    *    }
    *
    * The idea is that each sample-passed minimally does one write
    * per MRT.  If blend is enabled, the hw will additionally do
    * a framebuffer read per sample-passed (for each MRT with blend
    * enabled).  If depth-test is enabled, the hw will additionally
    * do a depth buffer read.  If depth-write is enabled, the hw will
    * additionally do a depth buffer write.
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details.  But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   unsigned cost;

   /* Tells the gen specific backend where to write stats used for
    * the autotune module.
    *
    * Pointer only valid during gmem emit code.
    */
   struct fd_batch_result *autotune_result;

   unsigned num_draws;    /* number of draws in current batch */
   unsigned num_vertices; /* number of vertices in current batch */

   /* Currently only used on a6xx, to calculate vsc prim/draw stream
    * sizes:
    */
   unsigned num_bins_per_pipe;
   unsigned prim_strm_bits;
   unsigned draw_strm_bits;

   /* Track the maximal bounds of the scissor of all the draws within a
    * batch.  Used at the tile rendering step (fd_gmem_render_tiles(),
    * mem2gmem/gmem2mem) to avoid needlessly moving data in/out of gmem.
    *
    * Note that unlike gallium state, maxx/maxy are inclusive (for
    * fully covered 512x512 the scissor would be 0,0+511,511)
    */
   struct pipe_scissor_state max_scissor;
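
   /* Illustrative sketch (not the actual draw path): each draw's scissor
    * is expected to be folded into the batch bounds roughly like:
    *
    *    batch->max_scissor.minx = MIN2(batch->max_scissor.minx, scissor->minx);
    *    batch->max_scissor.miny = MIN2(batch->max_scissor.miny, scissor->miny);
    *    batch->max_scissor.maxx = MAX2(batch->max_scissor.maxx, scissor->maxx);
    *    batch->max_scissor.maxy = MAX2(batch->max_scissor.maxy, scissor->maxy);
    */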

   /* Keep track of DRAW initiators that need to be patched up depending
    * on whether we are using binning or not:
    */
   struct util_dynarray draw_patches;

   /* texture state that needs patching for fb_read: */
   struct util_dynarray fb_read_patches;

   /* Keep track of writes to RB_RENDER_CONTROL which need to be patched
    * once we know whether or not to use GMEM, and the GMEM tile pitch.
    *
    * (only for a3xx.. but having gen specific subclasses of fd_batch
    * seemed overkill for now)
    */
   struct util_dynarray rbrc_patches;

   /* Keep track of GMEM related values that need to be patched up once we
    * know the gmem layout:
    */
   struct util_dynarray gmem_patches;

   /* Keep track of pointers to the start of MEM exports for a20x binning
    * shaders.  This is so the end of the shader can be cut off at the
    * right point, depending on the GMEM configuration.
    */
   struct util_dynarray shader_patches;

   struct pipe_framebuffer_state framebuffer;

   struct fd_submit *submit;

   /**
    * List of fd_batch_subpass.
    */
   struct list_head subpasses;

#define foreach_subpass(subpass, batch) \
   list_for_each_entry (struct fd_batch_subpass, subpass, &batch->subpasses, node)
#define foreach_subpass_safe(subpass, batch) \
   list_for_each_entry_safe (struct fd_batch_subpass, subpass, &batch->subpasses, node)
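
   /* For example (illustrative only), the gmem code can walk the subpasses
    * of a batch with:
    *
    *    foreach_subpass (subpass, batch) {
    *       // emit subpass->subpass_clears, then IB to subpass->draw
    *    }
    */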

   /**
    * The current subpass.
    */
   struct fd_batch_subpass *subpass;

   /**
    * Just a reference to the current subpass's draw cmds, for backwards
    * compat.
    */
   struct fd_ringbuffer *draw;
   /** binning pass cmdstream: */
   struct fd_ringbuffer *binning;
   /** tiling/gmem (IB0) cmdstream: */
   struct fd_ringbuffer *gmem;

   /** preamble cmdstream (executed once before first tile): */
   struct fd_ringbuffer *prologue;

   /** epilogue cmdstream (executed after each tile): */
   struct fd_ringbuffer *tile_epilogue;

   /** epilogue cmdstream (executed after all tiles): */
   struct fd_ringbuffer *epilogue;

   struct fd_ringbuffer *tile_loads;
   struct fd_ringbuffer *tile_store;

   /**
    * hw query related state:
    */
   /*@{*/
   /* next sample offset.. incremented for each sample in the batch/
    * submit, reset to zero on next submit.
    */
   uint32_t next_sample_offset;

   /* The # of pipeline-stats queries running.  In case of nested
    * queries using {START/STOP}_{PRIMITIVE,FRAGMENT,COMPUTE}_CNTRS,
    * we need to start only on the first one and stop only on the
    * last one.
    */
   uint8_t pipeline_stats_queries_active[3];

   /* cached samples (in case multiple queries need to reference
    * the same sample snapshot)
    */
   struct fd_hw_sample *sample_cache[MAX_HW_SAMPLE_PROVIDERS];

   /* which sample providers were used in the current batch: */
   uint32_t query_providers_used;

   /* which sample providers are currently enabled in the batch: */
   uint32_t query_providers_active;

   /* list of samples in current batch: */
   struct util_dynarray samples;

   /* current query result bo and tile stride: */
   struct pipe_resource *query_buf;
   uint32_t query_tile_stride;
   /*@}*/

   /* Set of resources used by currently-unsubmitted batch (read or
    * write).. does not hold a reference to the resource.
    */
   struct set *resources;

   /** key in batch-cache (if not null): */
   struct fd_batch_key *key;
   uint32_t hash;

   /** set of dependent batches.. holds refs to dependent batches: */
   uint32_t dependents_mask;
};

struct fd_batch *fd_batch_create(struct fd_context *ctx, bool nondraw);

struct fd_batch_subpass *fd_batch_create_subpass(struct fd_batch *batch) assert_dt;

void fd_batch_set_fb(struct fd_batch *batch, const struct pipe_framebuffer_state *pfb) assert_dt;

void fd_batch_flush(struct fd_batch *batch) assert_dt;
bool fd_batch_has_dep(struct fd_batch *batch, struct fd_batch *dep) assert_dt;
void fd_batch_add_dep(struct fd_batch *batch, struct fd_batch *dep) assert_dt;
void fd_batch_resource_write(struct fd_batch *batch,
                             struct fd_resource *rsc) assert_dt;
void fd_batch_resource_read_slowpath(struct fd_batch *batch,
                                     struct fd_resource *rsc) assert_dt;
void fd_batch_check_size(struct fd_batch *batch) assert_dt;

uint32_t fd_batch_key_hash(const void *_key);
bool fd_batch_key_equals(const void *_a, const void *_b);
struct fd_batch_key *fd_batch_key_clone(void *mem_ctx,
                                        const struct fd_batch_key *key);
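
/* The hash/equals callbacks above are shaped so they can plug straight into
 * a mesa hash table; illustrative sketch (roughly what the batch-cache does):
 *
 *    struct hash_table *ht =
 *       _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals);
 */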

/* not called directly: */
void __fd_batch_describe(char *buf, const struct fd_batch *batch) assert_dt;
void __fd_batch_destroy_locked(struct fd_batch *batch);
void __fd_batch_destroy(struct fd_batch *batch);

/*
 * NOTE the rule is, you need to hold the screen->lock when destroying
 * a batch..  so either use fd_batch_reference() (which grabs the lock
 * for you) if you don't hold the lock, or fd_batch_reference_locked()
 * if you do hold the lock.
 *
 * WARNING the _locked() version can briefly drop the lock.  Without
 * recursive mutexes, I'm not sure there is much else we can do (since
 * __fd_batch_destroy() needs to unref resources)
 *
 * WARNING you must acquire the screen->lock and use the _locked()
 * version in case the batch being ref'd can disappear out from under
 * you.
 */

static inline void
fd_batch_reference_locked(struct fd_batch **ptr, struct fd_batch *batch)
{
   struct fd_batch *old_batch = *ptr;

   /* only need lock if a reference is dropped: */
   if (old_batch)
      fd_screen_assert_locked(old_batch->ctx->screen);

   if (pipe_reference_described(
          &(*ptr)->reference, &batch->reference,
          (debug_reference_descriptor)__fd_batch_describe))
      __fd_batch_destroy_locked(old_batch);

   *ptr = batch;
}

static inline void
fd_batch_reference(struct fd_batch **ptr, struct fd_batch *batch)
{
   struct fd_batch *old_batch = *ptr;

   if (pipe_reference_described(
          &(*ptr)->reference, &batch->reference,
          (debug_reference_descriptor)__fd_batch_describe))
      __fd_batch_destroy(old_batch);

   *ptr = batch;
}
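
/* Typical usage (illustrative): take a ref when stashing a batch pointer,
 * and drop it by ref'ing NULL over it:
 *
 *    struct fd_batch *batch = NULL;
 *    fd_batch_reference(&batch, ctx->batch);   // ref++
 *    ...
 *    fd_batch_reference(&batch, NULL);         // ref--, possibly destroys
 */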

/**
 * Mark the batch as having something worth flushing (rendering, blit, query,
 * etc)
 */
static inline void
fd_batch_needs_flush(struct fd_batch *batch)
{
   batch->needs_flush = true;
   batch->update_seqno = ++batch->ctx->update_count;
   fd_pipe_fence_ref(&batch->ctx->last_fence, NULL);
}

/* Since we reorder batches and can pause/resume queries (notably for disabling
 * queries during some meta operations), we update the current query state for
 * the batch before each draw.
 */
static inline void
fd_batch_update_queries(struct fd_batch *batch) assert_dt
{
   struct fd_context *ctx = batch->ctx;

   if (!(ctx->dirty & FD_DIRTY_QUERY))
      return;

   ctx->query_update_batch(batch, false);
}

static inline void
fd_batch_finish_queries(struct fd_batch *batch) assert_dt
{
   struct fd_context *ctx = batch->ctx;

   ctx->query_update_batch(batch, true);
}

static inline void
fd_reset_wfi(struct fd_batch *batch)
{
   batch->needs_wfi = true;
}

void fd_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt;

/* emit a CP_EVENT_WRITE:
 * (a6xx+ cannot use this, use fd6_event_write<chip>.)
 */
#ifndef __cplusplus
static inline void
fd_event_write(struct fd_batch *batch, struct fd_ringbuffer *ring,
               enum vgt_event_type evt)
{
   OUT_PKT3(ring, CP_EVENT_WRITE, 1);
   OUT_RING(ring, evt);
   fd_reset_wfi(batch);
}
#endif
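
/* For example (illustrative, pre-a6xx only), emitting an event from the
 * draw cmdstream looks like:
 *
 *    fd_event_write(batch, ring, HLSQ_FLUSH);
 *
 * (HLSQ_FLUSH is used here just as an example vgt_event_type token.)
 */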

/* Get per-tile epilogue */
static inline struct fd_ringbuffer *
fd_batch_get_tile_epilogue(struct fd_batch *batch)
{
   if (batch->tile_epilogue == NULL) {
      batch->tile_epilogue = fd_submit_new_ringbuffer(batch->submit, 0x1000,
                                                      FD_RINGBUFFER_GROWABLE);
   }

   return batch->tile_epilogue;
}

/* Get epilogue run after all tiles */
static inline struct fd_ringbuffer *
fd_batch_get_epilogue(struct fd_batch *batch)
{
   if (batch->epilogue == NULL) {
      batch->epilogue = fd_submit_new_ringbuffer(batch->submit, 0x1000,
                                                 FD_RINGBUFFER_GROWABLE);
   }

   return batch->epilogue;
}
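
/* Both getters allocate the ring lazily, so gen specific code can simply do
 * (illustrative):
 *
 *    struct fd_ringbuffer *ring = fd_batch_get_epilogue(batch);
 *    // ... emit cmds that should run once after all tiles ...
 */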

struct fd_ringbuffer *fd_batch_get_prologue(struct fd_batch *batch);

#ifdef __cplusplus
}
#endif

#endif /* FREEDRENO_BATCH_H_ */