/*
 * Copyright 2022 Alyssa Rosenzweig
 * Copyright 2019-2020 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#include <xf86drm.h>
#include "asahi/lib/agx_device_virtio.h"
#include "asahi/lib/decode.h"
#include "util/bitset.h"
#include "util/u_dynarray.h"
#include "util/u_range.h"
#include "agx_state.h"
#include "vdrm.h"

#define foreach_active(ctx, idx)                                               \
   BITSET_FOREACH_SET(idx, ctx->batches.active, AGX_MAX_BATCHES)

#define foreach_submitted(ctx, idx)                                            \
   BITSET_FOREACH_SET(idx, ctx->batches.submitted, AGX_MAX_BATCHES)

#define batch_debug(batch, fmt, ...)                                           \
   do {                                                                        \
      if (unlikely(agx_device(batch->ctx->base.screen)->debug &                \
                   AGX_DBG_BATCH))                                             \
         agx_msg("[Queue %u Batch %u] " fmt "\n", batch->ctx->queue_id,        \
                 agx_batch_idx(batch), ##__VA_ARGS__);                         \
   } while (0)

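/*
 * Batch lifecycle tracking. A batch lives in exactly one of three states,
 * recorded in per-context bitsets: "active" while commands are being recorded,
 * "submitted" once it has been handed to the kernel, and neither once it has
 * completed and been cleaned up.
 */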
bool
agx_batch_is_active(struct agx_batch *batch)
{
   return BITSET_TEST(batch->ctx->batches.active, agx_batch_idx(batch));
}

bool
agx_batch_is_submitted(struct agx_batch *batch)
{
   return BITSET_TEST(batch->ctx->batches.submitted, agx_batch_idx(batch));
}

static void
agx_batch_mark_active(struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   batch_debug(batch, "ACTIVE");

   assert(!BITSET_TEST(batch->ctx->batches.submitted, batch_idx));
   assert(!BITSET_TEST(batch->ctx->batches.active, batch_idx));
   BITSET_SET(batch->ctx->batches.active, batch_idx);
}

static void
agx_batch_mark_submitted(struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   batch_debug(batch, "SUBMIT");

   assert(BITSET_TEST(batch->ctx->batches.active, batch_idx));
   assert(!BITSET_TEST(batch->ctx->batches.submitted, batch_idx));
   BITSET_CLEAR(batch->ctx->batches.active, batch_idx);
   BITSET_SET(batch->ctx->batches.submitted, batch_idx);
}

static void
agx_batch_mark_complete(struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   batch_debug(batch, "COMPLETE");

   assert(!BITSET_TEST(batch->ctx->batches.active, batch_idx));
   assert(BITSET_TEST(batch->ctx->batches.submitted, batch_idx));
   BITSET_CLEAR(batch->ctx->batches.submitted, batch_idx);
}

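/*
 * Allocate the BO backing a hardware command encoder (the batch's VDM stream
 * for render work, or CDM stream for compute work, per agx_batch_init below)
 * and point the encoder at its mapped range.
 */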
struct agx_encoder
agx_encoder_allocate(struct agx_batch *batch, struct agx_device *dev)
{
   struct agx_bo *bo = agx_bo_create(dev, 0x80000, 0, 0, "Encoder");

   return (struct agx_encoder){
      .bo = bo,
      .current = bo->map,
      .end = (uint8_t *)bo->map + bo->size,
   };
}

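/*
 * (Re)initialize a batch slot for recording: copy the framebuffer key, bump
 * the LRU seqnum, set up the memory pools and the appropriate encoder, and
 * reset all per-batch state before marking the batch active.
 */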
static void
agx_batch_init(struct agx_context *ctx,
               const struct pipe_framebuffer_state *key,
               struct agx_batch *batch)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   batch->ctx = ctx;
   util_copy_framebuffer_state(&batch->key, key);
   batch->seqnum = ++ctx->batches.seqnum;

   agx_pool_init(&batch->pool, dev, 0, true);
   agx_pool_init(&batch->pipeline_pool, dev, AGX_BO_LOW_VA, true);

   /* These allocations can happen only once and will just be zeroed (not
    * freed) during batch clean up. The memory is owned by the context.
    */
   if (!batch->bo_list.set) {
      batch->bo_list.set = rzalloc_array(ctx, BITSET_WORD, 128);
      batch->bo_list.bit_count = 128 * sizeof(BITSET_WORD) * 8;
   } else {
      memset(batch->bo_list.set, 0, batch->bo_list.bit_count / 8);
   }

   if (agx_batch_is_compute(batch)) {
      batch->cdm = agx_encoder_allocate(batch, dev);
      memset(&batch->vdm, 0, sizeof(batch->vdm));
   } else {
      batch->vdm = agx_encoder_allocate(batch, dev);
      memset(&batch->cdm, 0, sizeof(batch->cdm));
   }

   util_dynarray_init(&batch->scissor, ctx);
   util_dynarray_init(&batch->depth_bias, ctx);
   util_dynarray_init(&batch->timestamps, ctx);

   batch->clear = 0;
   batch->draw = 0;
   batch->load = 0;
   batch->resolve = 0;
   memset(batch->uploaded_clear_color, 0, sizeof(batch->uploaded_clear_color));
   batch->clear_depth = 0;
   batch->clear_stencil = 0;
   batch->varyings = 0;
   batch->geometry_state = 0;
   batch->initialized = false;
   batch->draws = 0;
   batch->incoherent_writes = false;
   agx_bo_unreference(dev, batch->sampler_heap.bo);
   batch->sampler_heap.bo = NULL;
   batch->sampler_heap.count = 0;
   batch->vs_scratch = false;
   batch->fs_scratch = false;
   batch->cs_scratch = false;
   batch->vs_preamble_scratch = 0;
   batch->fs_preamble_scratch = 0;
   batch->cs_preamble_scratch = 0;

   /* May get read before write, need to initialize to 0 to avoid GPU-side UAF
    * conditions.
    */
   batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] = 0;

   /* We need to emit prim state at the start. Max collides with all. */
   batch->reduced_prim = MESA_PRIM_COUNT;

   if (!batch->syncobj) {
      int ret = drmSyncobjCreate(dev->fd, 0, &batch->syncobj);
      assert(!ret && batch->syncobj);
   }

   batch->result_off =
      (2 * sizeof(union agx_batch_result)) * agx_batch_idx(batch);
   batch->result =
      (void *)(((uint8_t *)ctx->result_buf->map) + batch->result_off);
   memset(batch->result, 0, sizeof(union agx_batch_result) * 2);

   agx_batch_mark_active(batch);
}

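/*
 * Human-readable decodes for the result and fault information returned by the
 * drm_asahi UAPI, used by the stats/fault logging below.
 */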
const char *status_str[] = {
   [DRM_ASAHI_STATUS_PENDING] = "(pending)",
   [DRM_ASAHI_STATUS_COMPLETE] = "Complete",
   [DRM_ASAHI_STATUS_UNKNOWN_ERROR] = "UNKNOWN ERROR",
   [DRM_ASAHI_STATUS_TIMEOUT] = "TIMEOUT",
   [DRM_ASAHI_STATUS_FAULT] = "FAULT",
   [DRM_ASAHI_STATUS_KILLED] = "KILLED",
   [DRM_ASAHI_STATUS_NO_DEVICE] = "NO DEVICE",
};

const char *fault_type_str[] = {
   [DRM_ASAHI_FAULT_NONE] = "(none)",
   [DRM_ASAHI_FAULT_UNKNOWN] = "Unknown",
   [DRM_ASAHI_FAULT_UNMAPPED] = "Unmapped",
   [DRM_ASAHI_FAULT_AF_FAULT] = "AF Fault",
   [DRM_ASAHI_FAULT_WRITE_ONLY] = "Write Only",
   [DRM_ASAHI_FAULT_READ_ONLY] = "Read Only",
   [DRM_ASAHI_FAULT_NO_ACCESS] = "No Access",
};

const char *low_unit_str[16] = {
   "DCMP", "UL1C", "CMP", "GSL1", "IAP", "VCE",    "TE",  "RAS",
   "VDM",  "PPP",  "IPF", "IPF_CPF", "VF", "VF_CPF", "ZLS", "UNK",
};

const char *mid_unit_str[16] = {
   "UNK",     "dPM",      "dCDM_KS0", "dCDM_KS1", "dCDM_KS2", "dIPP",
   "dIPP_CS", "dVDM_CSD", "dVDM_SSD", "dVDM_ILF", "dVDM_ILD", "dRDE0",
   "dRDE1",   "FC",       "GSL2",     "UNK",
};

const char *high_unit_str[16] = {
   "gPM_SP",         "gVDM_CSD_SP",  "gVDM_SSD_SP",    "gVDM_ILF_SP",
   "gVDM_TFP_SP",    "gVDM_MMB_SP",  "gCDM_CS_KS0_SP", "gCDM_CS_KS1_SP",
   "gCDM_CS_KS2_SP", "gCDM_KS0_SP",  "gCDM_KS1_SP",    "gCDM_KS2_SP",
   "gIPP_SP",        "gIPP_CS_SP",   "gRDE0_SP",       "gRDE1_SP",
};

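/*
 * Log the result of a submitted command: timings when stats are enabled, plus
 * a decoded fault record if the GPU faulted. Faults are tolerated (the
 * robustness parts of the CTS require it) unless debugging is enabled, in
 * which case we assert.
 */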
static void
agx_print_result(struct agx_device *dev, struct agx_context *ctx,
                 struct drm_asahi_result_info *info, unsigned batch_idx,
                 bool is_compute)
{
   if (unlikely(info->status != DRM_ASAHI_STATUS_COMPLETE)) {
      ctx->any_faults = true;
   }

   if (likely(info->status == DRM_ASAHI_STATUS_COMPLETE &&
              !((dev)->debug & AGX_DBG_STATS)))
      return;

   if (is_compute) {
      struct drm_asahi_result_compute *r = (void *)info;

      /* Divide in floating point, matching the render path below; integer
       * division would truncate sub-second timings to zero.
       */
      float time = (r->ts_end - r->ts_start) /
                   (float)dev->params.timer_frequency_hz;

      mesa_logw(
         "[Batch %d] Compute %s: %.06f\n", batch_idx,
         info->status < ARRAY_SIZE(status_str) ? status_str[info->status] : "?",
         time);
   } else {
      struct drm_asahi_result_render *r = (void *)info;
      float time_vtx = (r->vertex_ts_end - r->vertex_ts_start) /
                       (float)dev->params.timer_frequency_hz;
      float time_frag = (r->fragment_ts_end - r->fragment_ts_start) /
                        (float)dev->params.timer_frequency_hz;
      mesa_logw(
         "[Batch %d] Render %s: TVB %9ld/%9ld bytes (%d ovf) %c%c%c | vtx %.06f frag %.06f\n",
         batch_idx,
         info->status < ARRAY_SIZE(status_str) ? status_str[info->status] : "?",
         (long)r->tvb_usage_bytes, (long)r->tvb_size_bytes,
         (int)r->num_tvb_overflows,
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_GROW_OVF ? 'G' : ' ',
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_GROW_MIN ? 'M' : ' ',
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_OVERFLOWED ? 'O' : ' ',
         time_vtx, time_frag);
   }

   if (info->fault_type != DRM_ASAHI_FAULT_NONE) {
      const char *unit_name;
      int unit_index;

      switch (info->unit) {
      case 0x00 ... 0x9f:
         unit_name = low_unit_str[info->unit & 0xf];
         unit_index = info->unit >> 4;
         break;
      case 0xa0 ... 0xaf:
         unit_name = mid_unit_str[info->unit & 0xf];
         unit_index = 0;
         break;
      case 0xb0 ... 0xb7:
         unit_name = "GL2CC_META";
         unit_index = info->unit & 0x7;
         break;
      case 0xb8:
         unit_name = "GL2CC_MB";
         unit_index = 0;
         break;
      case 0xe0 ... 0xff:
         unit_name = high_unit_str[info->unit & 0xf];
         unit_index = (info->unit >> 4) & 1;
         break;
      default:
         unit_name = "UNK";
         unit_index = 0;
         break;
      }

      mesa_logw(
         "[Batch %d] Fault: %s : Addr 0x%llx %c Unit %02x (%s/%d) SB 0x%02x L%d Extra 0x%x\n",
         batch_idx,
         info->fault_type < ARRAY_SIZE(fault_type_str)
            ? fault_type_str[info->fault_type]
            : "?",
         (long long)info->address, info->is_read ? 'r' : 'W', info->unit,
         unit_name, unit_index, info->sideband, info->level, info->extra);

      agx_debug_fault(dev, info->address);
   }

   /* Obscurely, we need to tolerate faults to pass the robustness parts of the
    * CTS, so we can't assert that we don't fault. But it's helpful for any sort
    * of debugging to crash on fault.
    */
   if (dev->debug) {
      assert(info->status == DRM_ASAHI_STATUS_COMPLETE ||
             info->status == DRM_ASAHI_STATUS_KILLED);
   }
}

static void
agx_batch_print_stats(struct agx_device *dev, struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   if (!batch->result)
      return;

   if (batch->cdm.bo) {
      agx_print_result(dev, batch->ctx, &batch->result[0].compute.info,
                       batch_idx, true);
   }

   if (batch->vdm.bo) {
      agx_print_result(dev, batch->ctx, &batch->result[1].render.info,
                       batch_idx, false);
   }
}

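/*
 * Tear down a submitted batch: harvest timestamps for queries, drop BO
 * references and writer tracking, free the per-batch pools and arrays, and
 * mark the slot complete so it can be reused. "reset" is set for empty
 * batches that were retired without ever being submitted to the kernel.
 */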
static void
agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
{
   struct agx_device *dev = agx_device(ctx->base.screen);
   assert(batch->ctx == ctx);
   assert(agx_batch_is_submitted(batch));

   assert(ctx->batch != batch);

   uint64_t begin_ts = ~0, end_ts = 0;
   if (batch->result) {
      if (batch->cdm.bo) {
         begin_ts = MIN2(begin_ts, batch->result[0].compute.ts_start);
         end_ts = MAX2(end_ts, batch->result[0].compute.ts_end);
      }

      if (batch->vdm.bo) {
         begin_ts = MIN2(begin_ts, batch->result[1].render.vertex_ts_start);
         end_ts = MAX2(end_ts, batch->result[1].render.fragment_ts_end);
      }
   }

   agx_finish_batch_queries(batch, begin_ts, end_ts);

   if (reset) {
      int handle;
      AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
         /* We should write no buffers if this is an empty batch */
         assert(agx_writer_get(ctx, handle) != batch);

         agx_bo_unreference(dev, agx_lookup_bo(dev, handle));
      }
   } else {
      int handle;
      AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
         struct agx_bo *bo = agx_lookup_bo(dev, handle);

         /* There is no more writer on this context for anything we wrote */
         struct agx_batch *writer = agx_writer_get(ctx, handle);

         if (writer == batch)
            agx_writer_remove(ctx, handle);

         p_atomic_cmpxchg(&bo->writer,
                          agx_bo_writer(ctx->queue_id, batch->syncobj), 0);

         agx_bo_unreference(dev, agx_lookup_bo(dev, handle));
      }
   }

   agx_bo_unreference(dev, batch->vdm.bo);
   agx_bo_unreference(dev, batch->cdm.bo);
   agx_pool_cleanup(&batch->pool);
   agx_pool_cleanup(&batch->pipeline_pool);

   util_dynarray_fini(&batch->scissor);
   util_dynarray_fini(&batch->depth_bias);
   util_dynarray_fini(&batch->timestamps);

   if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) {
      agx_batch_print_stats(dev, batch);
   }

   util_unreference_framebuffer_state(&batch->key);
   agx_batch_mark_complete(batch);
}

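/*
 * Opportunistically clean up one submitted batch that has already signalled
 * its syncobj, without blocking. Returns the freed slot index, or -1 if no
 * submitted batch has completed yet.
 */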
int
agx_cleanup_batches(struct agx_context *ctx)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   unsigned i;
   unsigned count = 0;
   struct agx_batch *batches[AGX_MAX_BATCHES];
   uint32_t syncobjs[AGX_MAX_BATCHES];
   uint32_t first = 0;

   foreach_submitted(ctx, i) {
      batches[count] = &ctx->batches.slots[i];
      syncobjs[count++] = ctx->batches.slots[i].syncobj;
   }

   if (!count)
      return -1;

   int ret = drmSyncobjWait(dev->fd, syncobjs, count, 0, 0, &first);
   assert(!ret || ret == -ETIME);
   if (ret)
      return -1;

   assert(first < AGX_MAX_BATCHES);
   agx_batch_cleanup(ctx, batches[first], false);
   return agx_batch_idx(batches[first]);
}

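/*
 * Find or create a batch for the given framebuffer key. Preference order:
 * reuse an active batch with a matching key, initialize a free slot, clean up
 * an already-completed submitted batch, or as a last resort synchronously
 * evict the least-recently-used batch (preferring submitted ones).
 */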
static struct agx_batch *
agx_get_batch_for_framebuffer(struct agx_context *ctx,
                              const struct pipe_framebuffer_state *state)
{
   /* Look for a matching batch first */
   unsigned i;
   foreach_active(ctx, i) {
      struct agx_batch *candidate = &ctx->batches.slots[i];

      if (util_framebuffer_state_equal(&candidate->key, state)) {
         /* We found a match, increase the seqnum for the LRU
          * eviction logic.
          */
         candidate->seqnum = ++ctx->batches.seqnum;
         return candidate;
      }
   }

   /* Look for a free batch */
   for (i = 0; i < AGX_MAX_BATCHES; ++i) {
      if (!BITSET_TEST(ctx->batches.active, i) &&
          !BITSET_TEST(ctx->batches.submitted, i)) {
         struct agx_batch *batch = &ctx->batches.slots[i];
         agx_batch_init(ctx, state, batch);
         return batch;
      }
   }

   /* Try to clean up one batch */
   int freed = agx_cleanup_batches(ctx);
   if (freed >= 0) {
      struct agx_batch *batch = &ctx->batches.slots[freed];
      agx_batch_init(ctx, state, batch);
      return batch;
   }

   /* Else, evict something */
   struct agx_batch *batch = NULL;
   bool submitted = false;
   for (i = 0; i < AGX_MAX_BATCHES; ++i) {
      struct agx_batch *candidate = &ctx->batches.slots[i];
      bool cand_submitted = BITSET_TEST(ctx->batches.submitted, i);

      /* Prefer submitted batches first */
      if (!cand_submitted && submitted)
         continue;

      if (!batch || batch->seqnum > candidate->seqnum) {
         batch = candidate;
         submitted = cand_submitted;
      }
   }
   assert(batch);

   agx_sync_batch_for_reason(ctx, batch, "Too many batches");

   /* Batch is now free */
   agx_batch_init(ctx, state, batch);
   return batch;
}

struct agx_batch *
agx_get_batch(struct agx_context *ctx)
{
   if (!ctx->batch || agx_batch_is_compute(ctx->batch)) {
      ctx->batch = agx_get_batch_for_framebuffer(ctx, &ctx->framebuffer);
      agx_dirty_all(ctx);
   }

   assert(util_framebuffer_state_equal(&ctx->framebuffer, &ctx->batch->key));
   return ctx->batch;
}

struct agx_batch *
agx_get_compute_batch(struct agx_context *ctx)
{
   agx_dirty_all(ctx);

   struct pipe_framebuffer_state key = {.width = AGX_COMPUTE_BATCH_WIDTH};
   ctx->batch = agx_get_batch_for_framebuffer(ctx, &key);
   return ctx->batch;
}

void
agx_flush_all(struct agx_context *ctx, const char *reason)
{
   unsigned idx;
   foreach_active(ctx, idx) {
      if (reason)
         perf_debug_ctx(ctx, "Flushing due to: %s\n", reason);

      agx_flush_batch(ctx, &ctx->batches.slots[idx]);
   }
}

void
agx_flush_batch_for_reason(struct agx_context *ctx, struct agx_batch *batch,
                           const char *reason)
{
   if (reason)
      perf_debug_ctx(ctx, "Flushing due to: %s\n", reason);

   if (agx_batch_is_active(batch))
      agx_flush_batch(ctx, batch);
}

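/*
 * Cross-batch hazard resolution. These helpers flush (and optionally wait on)
 * every other batch in the context that reads or writes a resource's BO. The
 * public agx_flush_* / agx_sync_* wrappers below pass except = NULL.
 */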
static void
agx_flush_readers_except(struct agx_context *ctx, struct agx_resource *rsrc,
                         struct agx_batch *except, const char *reason,
                         bool sync)
{
   unsigned idx;

   /* Flush everything to the hardware first */
   foreach_active(ctx, idx) {
      struct agx_batch *batch = &ctx->batches.slots[idx];

      if (batch == except)
         continue;

      if (agx_batch_uses_bo(batch, rsrc->bo)) {
         perf_debug_ctx(ctx, "Flush reader due to: %s\n", reason);
         agx_flush_batch(ctx, batch);
      }
   }

   /* Then wait on everything if necessary */
   if (sync) {
      foreach_submitted(ctx, idx) {
         struct agx_batch *batch = &ctx->batches.slots[idx];

         if (batch == except)
            continue;

         if (agx_batch_uses_bo(batch, rsrc->bo)) {
            perf_debug_ctx(ctx, "Sync reader due to: %s\n", reason);
            agx_sync_batch(ctx, batch);
         }
      }
   }
}

static void
agx_flush_writer_except(struct agx_context *ctx, struct agx_resource *rsrc,
                        struct agx_batch *except, const char *reason, bool sync)
{
   struct agx_batch *writer = agx_writer_get(ctx, rsrc->bo->handle);

   if (writer && writer != except &&
       (agx_batch_is_active(writer) || agx_batch_is_submitted(writer))) {
      if (agx_batch_is_active(writer) || sync) {
         perf_debug_ctx(ctx, "%s writer due to: %s\n", sync ? "Sync" : "Flush",
                        reason);
      }
      if (agx_batch_is_active(writer))
         agx_flush_batch(ctx, writer);
      /* Check for submitted state, because if the batch was a no-op it'll
       * already be cleaned up */
      if (sync && agx_batch_is_submitted(writer))
         agx_sync_batch(ctx, writer);
   }
}

bool
agx_any_batch_uses_resource(struct agx_context *ctx, struct agx_resource *rsrc)
{
   unsigned idx;
   foreach_active(ctx, idx) {
      struct agx_batch *batch = &ctx->batches.slots[idx];

      if (agx_batch_uses_bo(batch, rsrc->bo))
         return true;
   }

   foreach_submitted(ctx, idx) {
      struct agx_batch *batch = &ctx->batches.slots[idx];

      if (agx_batch_uses_bo(batch, rsrc->bo))
         return true;
   }

   return false;
}

void
agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                  const char *reason)
{
   agx_flush_readers_except(ctx, rsrc, NULL, reason, false);
}

void
agx_sync_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                 const char *reason)
{
   agx_flush_readers_except(ctx, rsrc, NULL, reason, true);
}

void
agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                 const char *reason)
{
   agx_flush_writer_except(ctx, rsrc, NULL, reason, false);
}

void
agx_sync_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                const char *reason)
{
   agx_flush_writer_except(ctx, rsrc, NULL, reason, true);
}

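/*
 * Resource dependency tracking. Draw/dispatch code registers every resource a
 * batch touches, e.g. (sketch, with a hypothetical bound resource rsrc):
 *
 *    agx_batch_reads(batch, rsrc);         // sampled / read-only use
 *    agx_batch_writes(batch, rsrc, level); // render target or image store
 *
 * A read flushes any other batch writing the BO (read-after-write); a write
 * additionally flushes other readers and takes over as the BO's sole writer.
 */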
void
agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc)
{
   agx_batch_add_bo(batch, rsrc->bo);

   if (rsrc->separate_stencil)
      agx_batch_add_bo(batch, rsrc->separate_stencil->bo);

   /* Don't hazard track fake resources internally created for meta */
   if (!rsrc->base.screen)
      return;

   /* Hazard: read-after-write */
   agx_flush_writer_except(batch->ctx, rsrc, batch, "Read from another batch",
                           false);
}

static void
agx_batch_writes_internal(struct agx_batch *batch, struct agx_resource *rsrc,
                          unsigned level)
{
   struct agx_context *ctx = batch->ctx;
   struct agx_batch *writer = agx_writer_get(ctx, rsrc->bo->handle);

   assert(batch->initialized);

   agx_flush_readers_except(ctx, rsrc, batch, "Write from other batch", false);

   BITSET_SET(rsrc->data_valid, level);

   /* Nothing to do if we're already writing */
   if (writer == batch)
      return;

   /* Hazard: write-after-write, write-after-read */
   if (writer)
      agx_flush_writer(ctx, rsrc, "Multiple writers");

   /* Write is strictly stronger than a read */
   agx_batch_reads(batch, rsrc);

   writer = agx_writer_get(ctx, rsrc->bo->handle);
   assert(!writer || agx_batch_is_submitted(writer));

   /* We are now the new writer. Disregard the previous writer -- anything that
    * needs to wait for the writer going forward needs to wait for us.
    */
   agx_writer_remove(ctx, rsrc->bo->handle);
   agx_writer_add(ctx, agx_batch_idx(batch), rsrc->bo->handle);
   assert(agx_batch_is_active(batch));
}

void
agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc,
                 unsigned level)
{
   agx_batch_writes_internal(batch, rsrc, level);

   if (rsrc->base.target == PIPE_BUFFER) {
      /* Assume BOs written by the GPU are fully valid */
      rsrc->valid_buffer_range.start = 0;
      rsrc->valid_buffer_range.end = ~0;
   }
}

void
agx_batch_writes_range(struct agx_batch *batch, struct agx_resource *rsrc,
                       unsigned offset, unsigned size)
{
   assert(rsrc->base.target == PIPE_BUFFER);
   agx_batch_writes_internal(batch, rsrc, 0);
   util_range_add(&rsrc->base, &rsrc->valid_buffer_range, offset,
                  offset + size);
}

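/*
 * Consume the context's pending in-fence, if any: import the sync file fd
 * into the context's reusable syncobj and return that syncobj handle, or 0
 * when there is nothing to wait on.
 */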
static int
agx_get_in_sync(struct agx_context *ctx)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   if (ctx->in_sync_fd >= 0) {
      int ret =
         drmSyncobjImportSyncFile(dev->fd, ctx->in_sync_obj, ctx->in_sync_fd);
      assert(!ret);

      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;

      return ctx->in_sync_obj;
   } else {
      return 0;
   }
}

static void
agx_add_sync(struct drm_asahi_sync *syncs, unsigned *count, uint32_t handle)
{
   if (!handle)
      return;

   syncs[(*count)++] = (struct drm_asahi_sync){
      .sync_type = DRM_ASAHI_SYNC_SYNCOBJ,
      .handle = handle,
   };
}

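/*
 * Submit a batch to the kernel. In rough order this: gathers in-fences
 * (shared BOs, inter-context writers, the gallium in-fence and the screen-wide
 * flush timeline), builds the compute and/or render commands, calls the
 * drm_asahi submit ioctl, propagates our out-fence to shared and written BOs,
 * and finally marks the batch submitted.
 */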
void
agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
                 struct drm_asahi_cmd_compute *compute,
                 struct drm_asahi_cmd_render *render)
{
   struct agx_device *dev = agx_device(ctx->base.screen);
   struct agx_screen *screen = agx_screen(ctx->base.screen);

   bool feedback = dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC | AGX_DBG_STATS);

#ifndef NDEBUG
   /* Debug builds always get feedback (for fault checks) */
   feedback = true;
#endif

   /* Timer queries use the feedback timestamping */
   feedback |= (batch->timestamps.size > 0);

   if (!feedback)
      batch->result = NULL;

   /* We allocate the worst-case sync array size since this won't be excessive
    * for most workloads
    */
   unsigned max_syncs = batch->bo_list.bit_count + 2;
   unsigned in_sync_count = 0;
   unsigned shared_bo_count = 0;
   struct drm_asahi_sync *in_syncs =
      malloc(max_syncs * sizeof(struct drm_asahi_sync));
   struct agx_bo **shared_bos = malloc(max_syncs * sizeof(struct agx_bo *));

   uint64_t wait_seqid = p_atomic_read(&screen->flush_wait_seqid);

   /* Elide syncing against our own queue */
   if (wait_seqid && wait_seqid == ctx->flush_my_seqid) {
      batch_debug(batch,
                  "Wait sync point %" PRIu64 " is ours, waiting on %" PRIu64
                  " instead",
                  wait_seqid, ctx->flush_other_seqid);
      wait_seqid = ctx->flush_other_seqid;
   }

   uint64_t seqid = p_atomic_inc_return(&screen->flush_cur_seqid);
   assert(seqid > wait_seqid);

   batch_debug(batch, "Sync point is %" PRIu64, seqid);

   /* Subtle concurrency note: Since we assign seqids atomically and do
    * not lock submission across contexts, it is possible for two threads
    * to submit timeline syncobj updates out of order. As far as I can
    * tell, this case is handled in the kernel conservatively: it triggers
    * a fence context bump and effectively "splits" the timeline at the
    * larger point, causing future lookups for earlier points to return a
    * later point, waiting more. The signaling code still makes sure all
    * prior fences have to be signaled before considering a given point
    * signaled, regardless of order. That's good enough for us.
    *
    * (Note: this case breaks drm_syncobj_query_ioctl and for this reason
    * triggers a DRM_DEBUG message on submission, but we don't use that
    * so we don't care.)
    *
    * This case can be tested by setting seqid = 1 unconditionally here,
    * causing every single syncobj update to reuse the same timeline point.
    * Everything still works (but over-synchronizes because this effectively
    * serializes all submissions once any context flushes once).
    */
   struct drm_asahi_sync out_syncs[2] = {
      {
         .sync_type = DRM_ASAHI_SYNC_SYNCOBJ,
         .handle = batch->syncobj,
      },
      {
         .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ,
         .handle = screen->flush_syncobj,
         .timeline_value = seqid,
      },
   };

   /* This lock protects against a subtle race scenario:
    * - Context 1 submits and registers itself as writer for a BO
    * - Context 2 runs the below loop, and finds the writer syncobj
    * - Context 1 is destroyed,
    *   - flushing all batches, unregistering itself as a writer, and
    *   - Destroying syncobjs for all batches
    * - Context 2 submits, with a now invalid syncobj ID
    *
    * Since batch syncobjs are only destroyed on context destruction, we can
    * protect against this scenario with a screen-wide rwlock to ensure that
    * the syncobj destroy code cannot run concurrently with any other
    * submission. If a submit runs before the wrlock is taken, the syncobjs
    * must still exist (even if the batch was flushed and no longer a writer).
    * If it runs after the wrlock is released, then by definition the
    * just-destroyed syncobjs cannot be writers for any BO at that point.
    *
    * A screen-wide (not device-wide) rwlock is sufficient because by definition
    * resources can only be implicitly shared within a screen. Any shared
    * resources across screens must have been imported and will go through the
    * AGX_BO_SHARED path instead, which has no race (but is slower).
    */
   u_rwlock_rdlock(&screen->destroy_lock);

   int handle;
   AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
      struct agx_bo *bo = agx_lookup_bo(dev, handle);

      if (bo->flags & AGX_BO_SHARED) {
         batch_debug(batch, "Waits on shared BO @ 0x%" PRIx64, bo->va->addr);

         /* Get a sync file fd from the buffer */
         int in_sync_fd = agx_export_sync_file(dev, bo);
         assert(in_sync_fd >= 0);

         /* Create a new syncobj */
         uint32_t sync_handle;
         int ret = drmSyncobjCreate(dev->fd, 0, &sync_handle);
         assert(ret >= 0);

         /* Import the sync file into it */
         ret = drmSyncobjImportSyncFile(dev->fd, sync_handle, in_sync_fd);
         assert(ret >= 0);
         assert(sync_handle);
         close(in_sync_fd);

         /* Add it to our wait list */
         agx_add_sync(in_syncs, &in_sync_count, sync_handle);

         /* And keep track of the BO for cloning the out_sync */
         shared_bos[shared_bo_count++] = bo;
      } else {
         /* Deal with BOs which are not externally shared, but which have been
          * written from another context within the same screen. We also need
          * to wait on these using their syncobj.
          */
         uint64_t writer = p_atomic_read_relaxed(&bo->writer);
         uint32_t queue_id = agx_bo_writer_queue(writer);
         if (writer && queue_id != ctx->queue_id) {
            batch_debug(
               batch, "Waits on inter-context BO @ 0x%" PRIx64 " from queue %u",
               bo->va->addr, queue_id);

            agx_add_sync(in_syncs, &in_sync_count,
                         agx_bo_writer_syncobj(writer));
            shared_bos[shared_bo_count++] = NULL;
         }
      }
   }

   if (dev->debug & AGX_DBG_SCRATCH) {
      if (compute)
         agx_scratch_debug_pre(&ctx->scratch_cs);
      if (render) {
         agx_scratch_debug_pre(&ctx->scratch_vs);
         agx_scratch_debug_pre(&ctx->scratch_fs);
      }
   }

   /* Add an explicit fence from gallium, if any */
   agx_add_sync(in_syncs, &in_sync_count, agx_get_in_sync(ctx));

   /* Add an implicit cross-context flush sync point, if any */
   if (wait_seqid) {
      batch_debug(batch, "Waits on inter-context sync point %" PRIu64,
                  wait_seqid);
      in_syncs[in_sync_count++] = (struct drm_asahi_sync){
         .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ,
         .handle = screen->flush_syncobj,
         .timeline_value = wait_seqid,
      };
   }

   /* Submit! */
   struct drm_asahi_command commands[2];
   unsigned command_count = 0;

   if (compute) {
      commands[command_count++] = (struct drm_asahi_command){
         .cmd_type = DRM_ASAHI_CMD_COMPUTE,
         .flags = 0,
         .cmd_buffer = (uint64_t)(uintptr_t)compute,
         .cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute),
         .result_offset = feedback ? batch->result_off : 0,
         .result_size = feedback ? sizeof(union agx_batch_result) : 0,
         /* Barrier on previous submission */
         .barriers = {0, 0},
      };
   }

   if (render) {
      commands[command_count++] = (struct drm_asahi_command){
         .cmd_type = DRM_ASAHI_CMD_RENDER,
         .flags = 0,
         .cmd_buffer = (uint64_t)(uintptr_t)render,
         .cmd_buffer_size = sizeof(struct drm_asahi_cmd_render),
         .result_offset =
            feedback ? (batch->result_off + sizeof(union agx_batch_result)) : 0,
         .result_size = feedback ? sizeof(union agx_batch_result) : 0,
         /* Barrier on previous submission */
         .barriers = {compute ? DRM_ASAHI_BARRIER_NONE : 0, compute ? 1 : 0},
      };
   }

   struct drm_asahi_submit submit = {
      .flags = 0,
      .queue_id = ctx->queue_id,
      .result_handle = feedback ? ctx->result_buf->handle : 0,
      .in_sync_count = in_sync_count,
      .out_sync_count = 2,
      .command_count = command_count,
      .in_syncs = (uint64_t)(uintptr_t)(in_syncs),
      .out_syncs = (uint64_t)(uintptr_t)(out_syncs),
      .commands = (uint64_t)(uintptr_t)(&commands[0]),
   };

   int ret = dev->ops.submit(dev, &submit, ctx->result_buf->vbo_res_id);

   u_rwlock_rdunlock(&screen->destroy_lock);

   if (ret) {
      if (compute) {
         fprintf(stderr, "DRM_IOCTL_ASAHI_SUBMIT compute failed: %m\n");
      }

      if (render) {
         struct drm_asahi_cmd_render *c = render;
         fprintf(
            stderr,
            "DRM_IOCTL_ASAHI_SUBMIT render failed: %m (%dx%d tile %dx%d layers %d samples %d)\n",
            c->fb_width, c->fb_height, c->utile_width, c->utile_height,
            c->layers, c->samples);
      }

      assert(0);
   }

   if (ret == ENODEV)
      abort();

   /* Now stash our batch fence into any shared BOs. */
   if (shared_bo_count) {
      /* Convert our handle to a sync file */
      int out_sync_fd = -1;
      int ret = drmSyncobjExportSyncFile(dev->fd, batch->syncobj, &out_sync_fd);
      assert(ret >= 0);
      assert(out_sync_fd >= 0);

      for (unsigned i = 0; i < shared_bo_count; i++) {
         if (!shared_bos[i])
            continue;

         batch_debug(batch, "Signals shared BO @ 0x%" PRIx64,
                     shared_bos[i]->va->addr);

         /* Free the in_sync handle we just acquired */
         ret = drmSyncobjDestroy(dev->fd, in_syncs[i].handle);
         assert(ret >= 0);
         /* And then import the out_sync sync file into it */
         ret = agx_import_sync_file(dev, shared_bos[i], out_sync_fd);
         assert(ret >= 0);
      }

      close(out_sync_fd);
   }

   /* Record the syncobj on each BO we write, so it can be added post-facto as
    * a fence if the BO is exported later...
    */
   AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
      struct agx_bo *bo = agx_lookup_bo(dev, handle);
      struct agx_batch *writer = agx_writer_get(ctx, handle);

      if (!writer)
         continue;

      /* Skip BOs that are written by submitted batches, they're not ours */
      if (agx_batch_is_submitted(writer))
         continue;

      /* But any BOs written by active batches are ours */
      assert(writer == batch && "exclusive writer");
      p_atomic_set(&bo->writer, agx_bo_writer(ctx->queue_id, batch->syncobj));
      batch_debug(batch, "Writes to BO @ 0x%" PRIx64, bo->va->addr);
   }

   free(in_syncs);
   free(shared_bos);

   if (dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC | AGX_DBG_SCRATCH)) {
      if (dev->debug & AGX_DBG_TRACE) {
         if (compute) {
            agxdecode_drm_cmd_compute(dev->agxdecode, &dev->params, compute,
                                      true);
         }

         if (render) {
            agxdecode_drm_cmd_render(dev->agxdecode, &dev->params, render,
                                     true);
         }

         agxdecode_next_frame();
      }

      /* Wait so we can get errors reported back */
      int ret = drmSyncobjWait(dev->fd, &batch->syncobj, 1, INT64_MAX, 0, NULL);
      assert(!ret);

      agx_batch_print_stats(dev, batch);

      if (dev->debug & AGX_DBG_SCRATCH) {
         if (compute) {
            fprintf(stderr, "CS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_cs);
         }
         if (render) {
            fprintf(stderr, "VS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_vs);
            fprintf(stderr, "FS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_fs);
         }
      }
   }

   agx_batch_mark_submitted(batch);

   /* Record the last syncobj for fence creation */
   ctx->syncobj = batch->syncobj;

   /* Update the last seqid in the context (must only happen if the submit
    * succeeded, otherwise the timeline point would not be valid).
    */
   ctx->flush_last_seqid = seqid;

   if (ctx->batch == batch)
      ctx->batch = NULL;

   /* Try to clean up up to two batches, to keep memory usage down */
   if (agx_cleanup_batches(ctx) >= 0)
      agx_cleanup_batches(ctx);
}

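/*
 * Flush a batch if it is still being recorded, then wait for its syncobj to
 * signal before cleaning it up. Empty batches that were reset rather than
 * submitted have already been cleaned up and are skipped.
 */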
void
agx_sync_batch(struct agx_context *ctx, struct agx_batch *batch)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   if (agx_batch_is_active(batch))
      agx_flush_batch(ctx, batch);

   /* Empty batch case, already cleaned up */
   if (!agx_batch_is_submitted(batch))
      return;

   assert(batch->syncobj);
   int ret = drmSyncobjWait(dev->fd, &batch->syncobj, 1, INT64_MAX, 0, NULL);
   assert(!ret);
   agx_batch_cleanup(ctx, batch, false);
}

void
agx_sync_batch_for_reason(struct agx_context *ctx, struct agx_batch *batch,
                          const char *reason)
{
   if (reason)
      perf_debug_ctx(ctx, "Syncing due to: %s\n", reason);

   agx_sync_batch(ctx, batch);
}

void
agx_sync_all(struct agx_context *ctx, const char *reason)
{
   if (reason)
      perf_debug_ctx(ctx, "Syncing all due to: %s\n", reason);

   unsigned idx;
   foreach_active(ctx, idx) {
      agx_flush_batch(ctx, &ctx->batches.slots[idx]);
   }

   foreach_submitted(ctx, idx) {
      agx_sync_batch(ctx, &ctx->batches.slots[idx]);
   }
}

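/*
 * Retire a batch that never recorded any work, without submitting anything to
 * the kernel: mark it submitted and immediately clean it up with reset = true.
 */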
void
agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch)
{
   batch_debug(batch, "RESET");

   assert(!batch->initialized);

   /* Reset an empty batch. Like submit, but does nothing. */
   agx_batch_mark_submitted(batch);

   if (ctx->batch == batch)
      ctx->batch = NULL;

   /* Elide printing stats */
   batch->result = NULL;

   agx_batch_cleanup(ctx, batch, true);
}

/*
 * Timestamp queries record the time after all current work is finished,
 * which we handle as the time after all current batches finish (since we're a
 * tiler and would rather not split the batch). So add a query to all active
 * batches.
 */
void
agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q)
{
   unsigned idx;
   foreach_active(ctx, idx) {
      agx_batch_add_timestamp_query(&ctx->batches.slots[idx], q);
   }
}

/*
 * To implement a memory barrier conservatively, flush any batch that contains
 * an incoherent memory write (requiring a memory barrier to synchronize). This
 * could be further optimized.
 */
void
agx_memory_barrier(struct pipe_context *pctx, unsigned flags)
{
   struct agx_context *ctx = agx_context(pctx);

   unsigned i;
   foreach_active(ctx, i) {
      struct agx_batch *batch = &ctx->batches.slots[i];

      if (batch->incoherent_writes)
         agx_flush_batch_for_reason(ctx, batch, "Memory barrier");
   }
}