/*
 * Copyright © 2024 Collabora Ltd.
 *
 * SPDX-License-Identifier: MIT
 */

#include "drm-uapi/panthor_drm.h"

#include "genxml/cs_builder.h"
#include "genxml/decode.h"

#include "panvk_cmd_buffer.h"
#include "panvk_macros.h"
#include "panvk_queue.h"

#include "vk_drm_syncobj.h"
#include "vk_log.h"

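/* Tear down the render descriptor ring buffer: release its syncobj, drop the
 * pandecode mappings if tracing is enabled, unmap the double GPU mapping and
 * the CPU mapping, then put the BO reference. This also serves as the error
 * path of init_render_desc_ringbuf(), which is why the unmap steps are
 * guarded by the corresponding address being set. */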
static void
finish_render_desc_ringbuf(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;

   panvk_pool_free_mem(&dev->mempools.rw, ringbuf->syncobj);

   if (dev->debug.decode_ctx && ringbuf->addr.dev) {
      pandecode_inject_free(dev->debug.decode_ctx, ringbuf->addr.dev,
                            RENDER_DESC_RINGBUF_SIZE);
      pandecode_inject_free(dev->debug.decode_ctx,
                            ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
                            RENDER_DESC_RINGBUF_SIZE);
   }

   if (ringbuf->addr.dev) {
      struct pan_kmod_vm_op op = {
         .type = PAN_KMOD_VM_OP_TYPE_UNMAP,
         .va = {
            .start = ringbuf->addr.dev,
            .size = RENDER_DESC_RINGBUF_SIZE * 2,
         },
      };

      ASSERTED int ret =
         pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1);
      assert(!ret);
   }

   if (ringbuf->addr.host) {
      ASSERTED int ret =
         os_munmap(ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE);
      assert(!ret);
   }

   pan_kmod_bo_put(ringbuf->bo);
}

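/* Set up the render descriptor ring buffer. The BO is mapped twice,
 * back-to-back, in the GPU address space, so descriptor writes that wrap
 * around the end of the buffer still hit contiguous virtual addresses. The
 * VMA allocation is aligned on twice the ring buffer size to keep the whole
 * double mapping inside a single 4G block (see the comment on the
 * util_vma_heap_alloc() call below). */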
static VkResult
init_render_desc_ringbuf(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);
   uint32_t flags = panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP);
   struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
   const size_t size = RENDER_DESC_RINGBUF_SIZE;
   VkResult result;
   int ret;

   ringbuf->bo = pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, size, flags);
   if (!ringbuf->bo)
      return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Failed to create a descriptor ring buffer context");

   if (!(flags & PAN_KMOD_BO_FLAG_NO_MMAP)) {
      ringbuf->addr.host = pan_kmod_bo_mmap(
         ringbuf->bo, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
      if (ringbuf->addr.host == MAP_FAILED) {
         result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                            "Failed to CPU map ringbuf BO");
         goto err_finish_ringbuf;
      }
   }

   /* We choose the alignment to guarantee that we won't ever cross a 4G
    * boundary when accessing the mapping. This way we can encode the
    * wraparound using 32-bit operations. */
   uint64_t dev_addr = util_vma_heap_alloc(&dev->as.heap, size * 2, size * 2);

   struct pan_kmod_vm_op vm_ops[] = {
      {
         .type = PAN_KMOD_VM_OP_TYPE_MAP,
         .va = {
            .start = dev_addr,
            .size = RENDER_DESC_RINGBUF_SIZE,
         },
         .map = {
            .bo = ringbuf->bo,
            .bo_offset = 0,
         },
      },
      {
         .type = PAN_KMOD_VM_OP_TYPE_MAP,
         .va = {
            .start = dev_addr + RENDER_DESC_RINGBUF_SIZE,
            .size = RENDER_DESC_RINGBUF_SIZE,
         },
         .map = {
            .bo = ringbuf->bo,
            .bo_offset = 0,
         },
      },
   };

   ret = pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, vm_ops,
                          ARRAY_SIZE(vm_ops));
   if (ret) {
      result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                         "Failed to GPU map ringbuf BO");
      goto err_finish_ringbuf;
   }

   ringbuf->addr.dev = dev_addr;

   if (dev->debug.decode_ctx) {
      pandecode_inject_mmap(dev->debug.decode_ctx, ringbuf->addr.dev,
                            ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
      pandecode_inject_mmap(dev->debug.decode_ctx,
                            ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
                            ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
   }

   struct panvk_pool_alloc_info alloc_info = {
      .size = sizeof(struct panvk_cs_sync32),
      .alignment = 64,
   };

   ringbuf->syncobj = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);

   struct panvk_cs_sync32 *syncobj = panvk_priv_mem_host_addr(ringbuf->syncobj);

   if (!syncobj) {
      result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                         "Failed to create the render desc ringbuf context");
      goto err_finish_ringbuf;
   }

   *syncobj = (struct panvk_cs_sync32){
      .seqno = RENDER_DESC_RINGBUF_SIZE,
   };

   return VK_SUCCESS;

err_finish_ringbuf:
   finish_render_desc_ringbuf(queue);
   return result;
}

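/* Initialize one subqueue: allocate its panvk_cs_subqueue_context, build a
 * small init command stream in the geometry buffer (context register setup,
 * scoreboard selection, and, for the vertex-tiler/fragment subqueues, the
 * tiler heap binding), then submit it through the group and wait for it to
 * complete on the internal syncobj. */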
static VkResult
init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   struct panvk_subqueue *subq = &queue->subqueues[subqueue];
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   unsigned debug = instance->debug_flags;
   struct panvk_cs_sync64 *syncobjs = panvk_priv_mem_host_addr(queue->syncobjs);

   struct panvk_pool_alloc_info alloc_info = {
      .size = sizeof(struct panvk_cs_subqueue_context),
      .alignment = 64,
   };

   subq->context = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
   if (!panvk_priv_mem_host_addr(subq->context))
      return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Failed to create a queue context");

   struct panvk_cs_subqueue_context *cs_ctx =
      panvk_priv_mem_host_addr(subq->context);

   *cs_ctx = (struct panvk_cs_subqueue_context){
      .syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs),
      .debug_syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs),
      .iter_sb = 0,
   };

   /* We use the geometry buffer for our temporary CS buffer. */
   struct cs_buffer root_cs = {
      .cpu = panvk_priv_mem_host_addr(queue->tiler_heap.desc) + 4096,
      .gpu = panvk_priv_mem_dev_addr(queue->tiler_heap.desc) + 4096,
      .capacity = 64 * 1024 / sizeof(uint64_t),
   };
   const struct cs_builder_conf conf = {
      .nr_registers = 96,
      .nr_kernel_registers = 4,
   };
   struct cs_builder b;

   assert(panvk_priv_mem_dev_addr(queue->tiler_heap.desc) != 0);

   cs_builder_init(&b, &conf, root_cs);
   /* Pass the context. */
   cs_move64_to(&b, cs_subqueue_ctx_reg(&b),
                panvk_priv_mem_dev_addr(subq->context));

   /* Initialize scoreboard slots used for asynchronous operations. */
   cs_set_scoreboard_entry(&b, SB_ITER(0), SB_ID(LS));

   /* We do a greater-than test on sync objects, and given that the reference
    * seqno registers are all zero at init time, we need to initialize all
    * syncobjs with a seqno of one. */
   syncobjs[subqueue].seqno = 1;

   if (subqueue != PANVK_SUBQUEUE_COMPUTE) {
      cs_ctx->render.tiler_heap =
         panvk_priv_mem_dev_addr(queue->tiler_heap.desc);
      /* Our geometry buffer comes 4k after the tiler heap, and we encode the
       * size in the lower 12 bits so the address can be copied directly
       * to the tiler descriptors. */
      cs_ctx->render.geom_buf =
         (cs_ctx->render.tiler_heap + 4096) | ((64 * 1024) >> 12);

      /* Initialize the ringbuf */
      cs_ctx->render.desc_ringbuf = (struct panvk_cs_desc_ringbuf){
         .syncobj = panvk_priv_mem_dev_addr(queue->render_desc_ringbuf.syncobj),
         .ptr = queue->render_desc_ringbuf.addr.dev,
         .pos = 0,
      };

      struct cs_index heap_ctx_addr = cs_scratch_reg64(&b, 0);

      /* Pre-set the heap context on the vertex-tiler/fragment queues. */
      cs_move64_to(&b, heap_ctx_addr, queue->tiler_heap.context.dev_addr);
      cs_heap_set(&b, heap_ctx_addr);
   }

   cs_finish(&b);

   assert(cs_is_valid(&b));

   struct drm_panthor_sync_op syncop = {
      .flags =
         DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ | DRM_PANTHOR_SYNC_OP_SIGNAL,
      .handle = queue->syncobj_handle,
      .timeline_value = 0,
   };
   struct drm_panthor_queue_submit qsubmit = {
      .queue_index = subqueue,
      .stream_size = cs_root_chunk_size(&b),
      .stream_addr = cs_root_chunk_gpu_addr(&b),
      .latest_flush = 0,
      .syncs = DRM_PANTHOR_OBJ_ARRAY(1, &syncop),
   };
   struct drm_panthor_group_submit gsubmit = {
      .group_handle = queue->group_handle,
      .queue_submits = DRM_PANTHOR_OBJ_ARRAY(1, &qsubmit),
   };

   int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
   if (ret)
      return vk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
                       "Failed to initialize subqueue: %m");

   ret = drmSyncobjWait(dev->vk.drm_fd, &queue->syncobj_handle, 1, INT64_MAX, 0,
                        NULL);
   if (ret)
      return vk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
                       "SyncobjWait failed: %m");

   if (debug & PANVK_DEBUG_TRACE) {
      uint32_t regs[256] = {0};

      pandecode_cs(dev->debug.decode_ctx, qsubmit.stream_addr,
                   qsubmit.stream_size, phys_dev->kmod.props.gpu_prod_id, regs);
      pandecode_next_frame(dev->debug.decode_ctx);
   }

   return VK_SUCCESS;
}

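/* Release the per-subqueue contexts and the queue-wide allocations made by
 * init_queue(). Also used as the init_queue() error path. */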
static void
cleanup_queue(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
      panvk_pool_free_mem(&dev->mempools.rw, queue->subqueues[i].context);

   finish_render_desc_ringbuf(queue);

   panvk_pool_free_mem(&dev->mempools.rw, queue->debug_syncobjs);
   panvk_pool_free_mem(&dev->mempools.rw, queue->syncobjs);
}

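/* Allocate the per-subqueue sync objects (plus the debug sync objects when
 * sync/trace debugging is enabled), set up the render descriptor ring buffer,
 * and initialize each subqueue. */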
static VkResult
init_queue(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   VkResult result;

   struct panvk_pool_alloc_info alloc_info = {
      .size =
         ALIGN_POT(sizeof(struct panvk_cs_sync64), 64) * PANVK_SUBQUEUE_COUNT,
      .alignment = 64,
   };

   queue->syncobjs = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
   if (!panvk_priv_mem_host_addr(queue->syncobjs))
      return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Failed to allocate subqueue sync objects");

   if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
      alloc_info.size =
         ALIGN_POT(sizeof(struct panvk_cs_sync32), 64) * PANVK_SUBQUEUE_COUNT;
      queue->debug_syncobjs =
         panvk_pool_alloc_mem(&dev->mempools.rw_nc, alloc_info);
      if (!panvk_priv_mem_host_addr(queue->debug_syncobjs)) {
         result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                            "Failed to allocate subqueue sync objects");
         goto err_cleanup_queue;
      }
   }

   result = init_render_desc_ringbuf(queue);
   if (result != VK_SUCCESS)
      goto err_cleanup_queue;

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      result = init_subqueue(queue, i);
      if (result != VK_SUCCESS)
         goto err_cleanup_queue;
   }

   return VK_SUCCESS;

err_cleanup_queue:
   cleanup_queue(queue);
   return result;
}

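/* Create the Panthor scheduling group backing this Vulkan queue, with one
 * hardware queue per panvk subqueue (vertex-tiler, fragment, compute). */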
static VkResult
create_group(struct panvk_queue *queue)
{
   const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);

   struct drm_panthor_queue_create qc[] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] =
         {
            .priority = 1,
            .ringbuf_size = 64 * 1024,
         },
      [PANVK_SUBQUEUE_FRAGMENT] =
         {
            .priority = 1,
            .ringbuf_size = 64 * 1024,
         },
      [PANVK_SUBQUEUE_COMPUTE] =
         {
            .priority = 1,
            .ringbuf_size = 64 * 1024,
         },
   };

   struct drm_panthor_group_create gc = {
      .compute_core_mask = phys_dev->kmod.props.shader_present,
      .fragment_core_mask = phys_dev->kmod.props.shader_present,
      .tiler_core_mask = 1,
      .max_compute_cores = util_bitcount64(phys_dev->kmod.props.shader_present),
      .max_fragment_cores =
         util_bitcount64(phys_dev->kmod.props.shader_present),
      .max_tiler_cores = 1,
      .priority = PANTHOR_GROUP_PRIORITY_MEDIUM,
      .queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc),
      .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
   };

   int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);
   if (ret)
      return vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
                       "Failed to create a scheduling group");

   queue->group_handle = gc.group_handle;
   return VK_SUCCESS;
}

static void
destroy_group(struct panvk_queue *queue)
{
   const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   struct drm_panthor_group_destroy gd = {
      .group_handle = queue->group_handle,
   };

   ASSERTED int ret =
      drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
   assert(!ret);
}

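/* Create the kernel tiler heap context and fill the TILER_HEAP descriptor
 * that command streams will point to. The descriptor and the geometry buffer
 * share a single allocation (see the comment below). */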
static VkResult
init_tiler(struct panvk_queue *queue)
{
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
   VkResult result;

   /* We allocate the tiler heap descriptor and geometry buffer in one go,
    * so we can pass it through a single 64-bit register to the VERTEX_TILER
    * command streams. */
   struct panvk_pool_alloc_info alloc_info = {
      .size = (64 * 1024) + 4096,
      .alignment = 4096,
   };

   tiler_heap->desc = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
   if (!panvk_priv_mem_host_addr(tiler_heap->desc)) {
      result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                         "Failed to create a tiler heap context");
      goto err_free_desc;
   }

   tiler_heap->chunk_size = 2 * 1024 * 1024;

   struct drm_panthor_tiler_heap_create thc = {
      .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
      .chunk_size = tiler_heap->chunk_size,
      .initial_chunk_count = 5,
      .max_chunks = 64,
      .target_in_flight = 65535,
   };

   int ret =
      drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE, &thc);
   if (ret) {
      result = vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to create a tiler heap context");
      goto err_free_desc;
   }

   tiler_heap->context.handle = thc.handle;
   tiler_heap->context.dev_addr = thc.tiler_heap_ctx_gpu_va;

   pan_pack(panvk_priv_mem_host_addr(tiler_heap->desc), TILER_HEAP, cfg) {
      cfg.size = tiler_heap->chunk_size;
      cfg.base = thc.first_heap_chunk_gpu_va;
      cfg.bottom = cfg.base + 64;
      cfg.top = cfg.base + cfg.size;
   }

   return VK_SUCCESS;

err_free_desc:
   panvk_pool_free_mem(&dev->mempools.rw, tiler_heap->desc);
   return result;
}

static void
cleanup_tiler(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
   struct drm_panthor_tiler_heap_destroy thd = {
      .handle = tiler_heap->context.handle,
   };
   ASSERTED int ret =
      drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY, &thd);
   assert(!ret);

   panvk_pool_free_mem(&dev->mempools.rw, tiler_heap->desc);
}

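/* vk_queue::driver_submit implementation. Command buffers are translated into
 * one drm_panthor_queue_submit per non-empty per-subqueue command stream, with
 * extra sync-only submits prepended/appended to carry the wait and signal
 * operations of the vk_queue_submit. */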
static VkResult
panvk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit)
{
   struct panvk_queue *queue = container_of(vk_queue, struct panvk_queue, vk);
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);
   VkResult result = VK_SUCCESS;
   int ret;

   if (vk_queue_is_lost(&queue->vk))
      return VK_ERROR_DEVICE_LOST;

   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   unsigned debug = instance->debug_flags;
   bool force_sync = debug & (PANVK_DEBUG_TRACE | PANVK_DEBUG_SYNC);
   uint32_t qsubmit_count = 0;
   uint32_t used_queue_mask = 0;
   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
      struct panvk_cmd_buffer *cmdbuf =
         container_of(submit->command_buffers[i], struct panvk_cmd_buffer, vk);

      for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
         assert(cs_is_valid(&cmdbuf->state.cs[j].builder));
         if (!cs_is_empty(&cmdbuf->state.cs[j].builder)) {
            used_queue_mask |= BITFIELD_BIT(j);
            qsubmit_count++;
         }
      }
   }

   /* Synchronize all subqueues if we have no command buffer submitted. */
   if (!qsubmit_count)
      used_queue_mask = BITFIELD_MASK(PANVK_SUBQUEUE_COUNT);

   /* We add sync-only queue submits to place our wait/signal operations. */
   if (submit->wait_count > 0)
      qsubmit_count += util_bitcount(used_queue_mask);

   if (submit->signal_count > 0)
      qsubmit_count += util_bitcount(used_queue_mask);

   uint32_t syncop_count = submit->wait_count + util_bitcount(used_queue_mask);

   STACK_ARRAY(struct drm_panthor_queue_submit, qsubmits, qsubmit_count);
   STACK_ARRAY(struct drm_panthor_sync_op, syncops, syncop_count);
   struct drm_panthor_sync_op *wait_ops = syncops;
   struct drm_panthor_sync_op *signal_ops = syncops + submit->wait_count;

   qsubmit_count = 0;
   if (submit->wait_count) {
      for (uint32_t i = 0; i < submit->wait_count; i++) {
         assert(vk_sync_type_is_drm_syncobj(submit->waits[i].sync->type));
         struct vk_drm_syncobj *syncobj =
            vk_sync_as_drm_syncobj(submit->waits[i].sync);

         wait_ops[i] = (struct drm_panthor_sync_op){
            .flags = (submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE
                         ? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
                         : DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ) |
                     DRM_PANTHOR_SYNC_OP_WAIT,
            .handle = syncobj->syncobj,
            .timeline_value = submit->waits[i].wait_value,
         };
      }

      for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
         if (used_queue_mask & BITFIELD_BIT(i)) {
            qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
               .queue_index = i,
               .syncs = DRM_PANTHOR_OBJ_ARRAY(submit->wait_count, wait_ops),
            };
         }
      }
   }

   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
      struct panvk_cmd_buffer *cmdbuf =
         container_of(submit->command_buffers[i], struct panvk_cmd_buffer, vk);

      for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
         if (cs_is_empty(&cmdbuf->state.cs[j].builder))
            continue;

         qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
            .queue_index = j,
            .stream_size = cs_root_chunk_size(&cmdbuf->state.cs[j].builder),
            .stream_addr = cs_root_chunk_gpu_addr(&cmdbuf->state.cs[j].builder),
            .latest_flush = 0,
         };
      }
   }

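   /* Append one sync-only submit per used subqueue to signal our internal
    * timeline syncobj. Used subqueues signal increasing timeline points
    * (1, 2, ...), so waiting on point util_bitcount(used_queue_mask) below
    * covers all of them. */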
   if (submit->signal_count || force_sync) {
      uint32_t signal_op = 0;
      for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
         if (used_queue_mask & BITFIELD_BIT(i)) {
            signal_ops[signal_op] = (struct drm_panthor_sync_op){
               .flags = DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ |
                        DRM_PANTHOR_SYNC_OP_SIGNAL,
               .handle = queue->syncobj_handle,
               .timeline_value = signal_op + 1,
            };

            qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
               .queue_index = i,
               .syncs = DRM_PANTHOR_OBJ_ARRAY(1, &signal_ops[signal_op++]),
            };
         }
      }
   }

   if (force_sync) {
      struct panvk_cs_sync32 *debug_syncs =
         panvk_priv_mem_host_addr(queue->debug_syncobjs);

      assert(debug_syncs);
      memset(debug_syncs, 0, sizeof(*debug_syncs) * PANVK_SUBQUEUE_COUNT);
   }

   struct drm_panthor_group_submit gsubmit = {
      .group_handle = queue->group_handle,
      .queue_submits = DRM_PANTHOR_OBJ_ARRAY(qsubmit_count, qsubmits),
   };

   ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
   if (ret) {
      result = vk_queue_set_lost(&queue->vk, "GROUP_SUBMIT: %m");
      goto out;
   }

   if (submit->signal_count || force_sync) {
      if (force_sync) {
         uint64_t point = util_bitcount(used_queue_mask);
         ret = drmSyncobjTimelineWait(dev->vk.drm_fd, &queue->syncobj_handle,
                                      &point, 1, INT64_MAX,
                                      DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
         assert(!ret);
      }

      for (uint32_t i = 0; i < submit->signal_count; i++) {
         assert(vk_sync_type_is_drm_syncobj(submit->signals[i].sync->type));
         struct vk_drm_syncobj *syncobj =
            vk_sync_as_drm_syncobj(submit->signals[i].sync);

         drmSyncobjTransfer(dev->vk.drm_fd, syncobj->syncobj,
                            submit->signals[i].signal_value,
                            queue->syncobj_handle, 0, 0);
      }

      drmSyncobjReset(dev->vk.drm_fd, &queue->syncobj_handle, 1);
   }

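   /* When tracing, replay the submitted command streams through pandecode.
    * The register file is pre-seeded with the subqueue context address so
    * indirect accesses through that register pair can be resolved. */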
   if (debug & PANVK_DEBUG_TRACE) {
      for (uint32_t i = 0; i < qsubmit_count; i++) {
         if (!qsubmits[i].stream_size)
            continue;

         uint32_t subqueue = qsubmits[i].queue_index;
         uint32_t regs[256] = {0};
         uint64_t ctx =
            panvk_priv_mem_dev_addr(queue->subqueues[subqueue].context);

         regs[PANVK_CS_REG_SUBQUEUE_CTX_START] = ctx;
         regs[PANVK_CS_REG_SUBQUEUE_CTX_START + 1] = ctx >> 32;

         simple_mtx_lock(&dev->debug.decode_ctx->lock);
         pandecode_dump_file_open(dev->debug.decode_ctx);
         pandecode_log(dev->debug.decode_ctx, "CS%d\n",
                       qsubmits[i].queue_index);
         simple_mtx_unlock(&dev->debug.decode_ctx->lock);
         pandecode_cs(dev->debug.decode_ctx, qsubmits[i].stream_addr,
                      qsubmits[i].stream_size, phys_dev->kmod.props.gpu_prod_id,
                      regs);
      }
   }

   if (debug & PANVK_DEBUG_DUMP)
      pandecode_dump_mappings(dev->debug.decode_ctx);

   if (force_sync) {
      struct panvk_cs_sync32 *debug_syncs =
         panvk_priv_mem_host_addr(queue->debug_syncobjs);
      uint32_t debug_sync_points[PANVK_SUBQUEUE_COUNT] = {0};

      for (uint32_t i = 0; i < qsubmit_count; i++) {
         if (qsubmits[i].stream_size)
            debug_sync_points[qsubmits[i].queue_index]++;
      }

      for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
         if (debug_syncs[i].seqno != debug_sync_points[i] ||
             debug_syncs[i].error != 0)
            assert(!"Incomplete job or timeout\n");
      }
   }

   if (debug & PANVK_DEBUG_TRACE)
      pandecode_next_frame(dev->debug.decode_ctx);

out:
   STACK_ARRAY_FINISH(syncops);
   STACK_ARRAY_FINISH(qsubmits);
   return result;
}

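/* Queue creation entry point: create the internal syncobj, the tiler heap,
 * the scheduling group, and the per-subqueue state, unwinding in reverse
 * order on failure. */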
VkResult
panvk_per_arch(queue_init)(struct panvk_device *dev, struct panvk_queue *queue,
                           int idx, const VkDeviceQueueCreateInfo *create_info)
{
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);

   VkResult result = vk_queue_init(&queue->vk, &dev->vk, create_info, idx);
   if (result != VK_SUCCESS)
      return result;

   int ret = drmSyncobjCreate(dev->vk.drm_fd, 0, &queue->syncobj_handle);
   if (ret) {
      result = vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to create our internal sync object");
      goto err_finish_queue;
   }

   result = init_tiler(queue);
   if (result != VK_SUCCESS)
      goto err_destroy_syncobj;

   result = create_group(queue);
   if (result != VK_SUCCESS)
      goto err_cleanup_tiler;

   result = init_queue(queue);
   if (result != VK_SUCCESS)
      goto err_destroy_group;

   queue->vk.driver_submit = panvk_queue_submit;
   return VK_SUCCESS;

err_destroy_group:
   destroy_group(queue);

err_cleanup_tiler:
   cleanup_tiler(queue);

err_destroy_syncobj:
   drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);

err_finish_queue:
   vk_queue_finish(&queue->vk);
   return result;
}

void
panvk_per_arch(queue_finish)(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);

   destroy_group(queue);
   cleanup_tiler(queue);
   drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);
   vk_queue_finish(&queue->vk);
}