/*
 * Copyright © 2024 Collabora Ltd.
 *
 * SPDX-License-Identifier: MIT
 */

#include "drm-uapi/panthor_drm.h"

#include "genxml/cs_builder.h"
#include "genxml/decode.h"

#include "panvk_cmd_buffer.h"
#include "panvk_macros.h"
#include "panvk_queue.h"

#include "vk_drm_syncobj.h"
#include "vk_log.h"

static void
finish_render_desc_ringbuf(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;

   panvk_pool_free_mem(&dev->mempools.rw, ringbuf->syncobj);

   if (dev->debug.decode_ctx && ringbuf->addr.dev) {
      pandecode_inject_free(dev->debug.decode_ctx, ringbuf->addr.dev,
                            RENDER_DESC_RINGBUF_SIZE);
      pandecode_inject_free(dev->debug.decode_ctx,
                            ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
                            RENDER_DESC_RINGBUF_SIZE);
   }

   if (ringbuf->addr.dev) {
      struct pan_kmod_vm_op op = {
         .type = PAN_KMOD_VM_OP_TYPE_UNMAP,
         .va = {
            .start = ringbuf->addr.dev,
            .size = RENDER_DESC_RINGBUF_SIZE * 2,
         },
      };

      ASSERTED int ret =
         pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1);
      assert(!ret);
   }

   if (ringbuf->addr.host) {
      ASSERTED int ret =
         os_munmap(ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE);
      assert(!ret);
   }

   pan_kmod_bo_put(ringbuf->bo);
}

static VkResult
init_render_desc_ringbuf(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);
   uint32_t flags = panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP);
   struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
   const size_t size = RENDER_DESC_RINGBUF_SIZE;
   VkResult result;
   int ret;

   ringbuf->bo = pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, size, flags);
   if (!ringbuf->bo)
      return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Failed to create a descriptor ring buffer context");

   if (!(flags & PAN_KMOD_BO_FLAG_NO_MMAP)) {
      ringbuf->addr.host = pan_kmod_bo_mmap(
         ringbuf->bo, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
      if (ringbuf->addr.host == MAP_FAILED) {
         result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                            "Failed to CPU map ringbuf BO");
         goto err_finish_ringbuf;
      }
   }

   /* We choose the alignment to guarantee that we won't ever cross a 4G
    * boundary when accessing the mapping. This way we can encode the wraparound
    * using 32-bit operations. */
   uint64_t dev_addr = util_vma_heap_alloc(&dev->as.heap, size * 2, size * 2);

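   /* The two VM ops below map the same BO twice, back to back, at dev_addr
    * and dev_addr + RENDER_DESC_RINGBUF_SIZE. Presumably this is the usual
    * double-mapped ring buffer trick: an access that starts near the end of
    * the ring simply continues into the second mapping, so readers/writers
    * never have to split an access across the wrap point. */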
   struct pan_kmod_vm_op vm_ops[] = {
      {
         .type = PAN_KMOD_VM_OP_TYPE_MAP,
         .va = {
            .start = dev_addr,
            .size = RENDER_DESC_RINGBUF_SIZE,
         },
         .map = {
            .bo = ringbuf->bo,
            .bo_offset = 0,
         },
      },
      {
         .type = PAN_KMOD_VM_OP_TYPE_MAP,
         .va = {
            .start = dev_addr + RENDER_DESC_RINGBUF_SIZE,
            .size = RENDER_DESC_RINGBUF_SIZE,
         },
         .map = {
            .bo = ringbuf->bo,
            .bo_offset = 0,
         },
      },
   };

   ret = pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, vm_ops,
                          ARRAY_SIZE(vm_ops));
   if (ret) {
      result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                         "Failed to GPU map ringbuf BO");
      goto err_finish_ringbuf;
   }

   ringbuf->addr.dev = dev_addr;

   if (dev->debug.decode_ctx) {
      pandecode_inject_mmap(dev->debug.decode_ctx, ringbuf->addr.dev,
                            ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
      pandecode_inject_mmap(dev->debug.decode_ctx,
                            ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
                            ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
   }

   struct panvk_pool_alloc_info alloc_info = {
      .size = sizeof(struct panvk_cs_sync32),
      .alignment = 64,
   };

   ringbuf->syncobj = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);

   struct panvk_cs_sync32 *syncobj = panvk_priv_mem_host_addr(ringbuf->syncobj);

   if (!syncobj) {
      result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                         "Failed to create the render desc ringbuf context");
      goto err_finish_ringbuf;
   }

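   /* The seqno is seeded with the full ring buffer size, which suggests this
    * sync object tracks how many bytes are currently free in the ring
    * (an inference from the initial value; the consuming command streams
    * live elsewhere). */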
   *syncobj = (struct panvk_cs_sync32){
      .seqno = RENDER_DESC_RINGBUF_SIZE,
   };

   return VK_SUCCESS;

err_finish_ringbuf:
   finish_render_desc_ringbuf(queue);
   return result;
}

static VkResult
init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   struct panvk_subqueue *subq = &queue->subqueues[subqueue];
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   unsigned debug = instance->debug_flags;
   struct panvk_cs_sync64 *syncobjs = panvk_priv_mem_host_addr(queue->syncobjs);

   struct panvk_pool_alloc_info alloc_info = {
      .size = sizeof(struct panvk_cs_subqueue_context),
      .alignment = 64,
   };

   subq->context = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
   if (!panvk_priv_mem_host_addr(subq->context))
      return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Failed to create a queue context");

   struct panvk_cs_subqueue_context *cs_ctx =
      panvk_priv_mem_host_addr(subq->context);

   *cs_ctx = (struct panvk_cs_subqueue_context){
      .syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs),
      .debug_syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs),
      .iter_sb = 0,
   };

   /* We use the geometry buffer for our temporary CS buffer. */
   struct cs_buffer root_cs = {
      .cpu = panvk_priv_mem_host_addr(queue->tiler_heap.desc) + 4096,
      .gpu = panvk_priv_mem_dev_addr(queue->tiler_heap.desc) + 4096,
      .capacity = 64 * 1024 / sizeof(uint64_t),
   };
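   /* The capacity is expressed in 64-bit CS instruction words (inferred from
    * the division by sizeof(uint64_t)), so the 64 KiB geometry buffer leaves
    * room for 8192 instructions, plenty for this one-shot init stream. */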
   const struct cs_builder_conf conf = {
      .nr_registers = 96,
      .nr_kernel_registers = 4,
   };
   struct cs_builder b;

   assert(panvk_priv_mem_dev_addr(queue->tiler_heap.desc) != 0);

   cs_builder_init(&b, &conf, root_cs);
   /* Pass the context. */
   cs_move64_to(&b, cs_subqueue_ctx_reg(&b),
                panvk_priv_mem_dev_addr(subq->context));

   /* Initialize scoreboard slots used for asynchronous operations. */
   cs_set_scoreboard_entry(&b, SB_ITER(0), SB_ID(LS));

   /* We do a greater-than test on sync objects, and since the reference seqno
    * registers are all zero at init time, we need to initialize all syncobjs
    * with a seqno of one. */
   syncobjs[subqueue].seqno = 1;

   if (subqueue != PANVK_SUBQUEUE_COMPUTE) {
      cs_ctx->render.tiler_heap =
         panvk_priv_mem_dev_addr(queue->tiler_heap.desc);
      /* Our geometry buffer comes 4KB after the tiler heap descriptor, and we
       * encode its size in the lower 12 bits so the address can be copied
       * directly to the tiler descriptors. */
      cs_ctx->render.geom_buf =
         (cs_ctx->render.tiler_heap + 4096) | ((64 * 1024) >> 12);
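      /* Worked out: (64 * 1024) >> 12 = 16, i.e. the size in 4KB pages. The
       * geometry buffer address is 4KB-aligned, so its low 12 bits are free to
       * carry that page count; a consumer would recover the address with
       * geom_buf & ~0xfffULL and the size with (geom_buf & 0xfff) << 12
       * (the decode side is an assumption, only the encoding is shown here). */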

      /* Initialize the ringbuf */
      cs_ctx->render.desc_ringbuf = (struct panvk_cs_desc_ringbuf){
         .syncobj = panvk_priv_mem_dev_addr(queue->render_desc_ringbuf.syncobj),
         .ptr = queue->render_desc_ringbuf.addr.dev,
         .pos = 0,
      };

      struct cs_index heap_ctx_addr = cs_scratch_reg64(&b, 0);

      /* Pre-set the heap context on the vertex-tiler/fragment queues. */
      cs_move64_to(&b, heap_ctx_addr, queue->tiler_heap.context.dev_addr);
      cs_heap_set(&b, heap_ctx_addr);
   }

   cs_finish(&b);

   assert(cs_is_valid(&b));

   struct drm_panthor_sync_op syncop = {
      .flags =
         DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ | DRM_PANTHOR_SYNC_OP_SIGNAL,
      .handle = queue->syncobj_handle,
      .timeline_value = 0,
   };
   struct drm_panthor_queue_submit qsubmit = {
      .queue_index = subqueue,
      .stream_size = cs_root_chunk_size(&b),
      .stream_addr = cs_root_chunk_gpu_addr(&b),
      .latest_flush = 0,
      .syncs = DRM_PANTHOR_OBJ_ARRAY(1, &syncop),
   };
   struct drm_panthor_group_submit gsubmit = {
      .group_handle = queue->group_handle,
      .queue_submits = DRM_PANTHOR_OBJ_ARRAY(1, &qsubmit),
   };

   int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
   if (ret)
      return vk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
                       "Failed to initialize subqueue: %m");

   ret = drmSyncobjWait(dev->vk.drm_fd, &queue->syncobj_handle, 1, INT64_MAX, 0,
                        NULL);
   if (ret)
      return vk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
                       "SyncobjWait failed: %m");

   if (debug & PANVK_DEBUG_TRACE) {
      uint32_t regs[256] = {0};

      pandecode_cs(dev->debug.decode_ctx, qsubmit.stream_addr,
                   qsubmit.stream_size, phys_dev->kmod.props.gpu_prod_id, regs);
      pandecode_next_frame(dev->debug.decode_ctx);
   }

   return VK_SUCCESS;
}

static void
cleanup_queue(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
      panvk_pool_free_mem(&dev->mempools.rw, queue->subqueues[i].context);

   finish_render_desc_ringbuf(queue);

   panvk_pool_free_mem(&dev->mempools.rw, queue->debug_syncobjs);
   panvk_pool_free_mem(&dev->mempools.rw, queue->syncobjs);
}

static VkResult
init_queue(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   VkResult result;

   struct panvk_pool_alloc_info alloc_info = {
      .size =
         ALIGN_POT(sizeof(struct panvk_cs_sync64), 64) * PANVK_SUBQUEUE_COUNT,
      .alignment = 64,
   };

   queue->syncobjs = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
   if (!panvk_priv_mem_host_addr(queue->syncobjs))
      return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Failed to allocate subqueue sync objects");

   if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
      alloc_info.size =
         ALIGN_POT(sizeof(struct panvk_cs_sync32), 64) * PANVK_SUBQUEUE_COUNT;
      queue->debug_syncobjs =
         panvk_pool_alloc_mem(&dev->mempools.rw_nc, alloc_info);
      if (!panvk_priv_mem_host_addr(queue->debug_syncobjs)) {
         result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                            "Failed to allocate subqueue sync objects");
         goto err_cleanup_queue;
      }
   }

   result = init_render_desc_ringbuf(queue);
   if (result != VK_SUCCESS)
      goto err_cleanup_queue;

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      result = init_subqueue(queue, i);
      if (result != VK_SUCCESS)
         goto err_cleanup_queue;
   }

   return VK_SUCCESS;

err_cleanup_queue:
   cleanup_queue(queue);
   return result;
}

static VkResult
create_group(struct panvk_queue *queue)
{
   const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);

   struct drm_panthor_queue_create qc[] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] =
         {
            .priority = 1,
            .ringbuf_size = 64 * 1024,
         },
      [PANVK_SUBQUEUE_FRAGMENT] =
         {
            .priority = 1,
            .ringbuf_size = 64 * 1024,
         },
      [PANVK_SUBQUEUE_COMPUTE] =
         {
            .priority = 1,
            .ringbuf_size = 64 * 1024,
         },
   };

   struct drm_panthor_group_create gc = {
      .compute_core_mask = phys_dev->kmod.props.shader_present,
      .fragment_core_mask = phys_dev->kmod.props.shader_present,
      .tiler_core_mask = 1,
      .max_compute_cores = util_bitcount64(phys_dev->kmod.props.shader_present),
      .max_fragment_cores =
         util_bitcount64(phys_dev->kmod.props.shader_present),
      .max_tiler_cores = 1,
      .priority = PANTHOR_GROUP_PRIORITY_MEDIUM,
      .queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc),
      .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
   };

   int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);
   if (ret)
      return vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
                       "Failed to create a scheduling group");

   queue->group_handle = gc.group_handle;
   return VK_SUCCESS;
}

static void
destroy_group(struct panvk_queue *queue)
{
   const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   struct drm_panthor_group_destroy gd = {
      .group_handle = queue->group_handle,
   };

   ASSERTED int ret =
      drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
   assert(!ret);
}

static VkResult
init_tiler(struct panvk_queue *queue)
{
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
   VkResult result;

   /* We allocate the tiler heap descriptor and geometry buffer in one go,
    * so we can pass it through a single 64-bit register to the VERTEX_TILER
    * command streams. */
   struct panvk_pool_alloc_info alloc_info = {
      .size = (64 * 1024) + 4096,
      .alignment = 4096,
   };

   tiler_heap->desc = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
   if (!panvk_priv_mem_host_addr(tiler_heap->desc)) {
      result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                         "Failed to create a tiler heap context");
      goto err_free_desc;
   }

   tiler_heap->chunk_size = 2 * 1024 * 1024;

   struct drm_panthor_tiler_heap_create thc = {
      .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
      .chunk_size = tiler_heap->chunk_size,
      .initial_chunk_count = 5,
      .max_chunks = 64,
      .target_in_flight = 65535,
   };
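   /* With 2 MiB chunks this starts the heap at 5 * 2 MiB = 10 MiB and allows
    * the kernel to grow it on demand up to 64 * 2 MiB = 128 MiB. */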

   int ret =
      drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE, &thc);
   if (ret) {
      result = vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to create a tiler heap context");
      goto err_free_desc;
   }

   tiler_heap->context.handle = thc.handle;
   tiler_heap->context.dev_addr = thc.tiler_heap_ctx_gpu_va;

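   /* Seed the heap descriptor with the first kernel-allocated chunk: bottom
    * starts 64 bytes past the base, presumably to skip the chunk header, and
    * top marks the end of the chunk (the header-size interpretation is an
    * assumption). */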
   pan_pack(panvk_priv_mem_host_addr(tiler_heap->desc), TILER_HEAP, cfg) {
      cfg.size = tiler_heap->chunk_size;
      cfg.base = thc.first_heap_chunk_gpu_va;
      cfg.bottom = cfg.base + 64;
      cfg.top = cfg.base + cfg.size;
   }

   return VK_SUCCESS;

err_free_desc:
   panvk_pool_free_mem(&dev->mempools.rw, tiler_heap->desc);
   return result;
}

static void
cleanup_tiler(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
   struct drm_panthor_tiler_heap_destroy thd = {
      .handle = tiler_heap->context.handle,
   };
   ASSERTED int ret =
      drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY, &thd);
   assert(!ret);

   panvk_pool_free_mem(&dev->mempools.rw, tiler_heap->desc);
}

static VkResult
panvk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit)
{
   struct panvk_queue *queue = container_of(vk_queue, struct panvk_queue, vk);
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(queue->vk.base.device->physical);
   VkResult result = VK_SUCCESS;
   int ret;

   if (vk_queue_is_lost(&queue->vk))
      return VK_ERROR_DEVICE_LOST;

   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   unsigned debug = instance->debug_flags;
   bool force_sync = debug & (PANVK_DEBUG_TRACE | PANVK_DEBUG_SYNC);
   uint32_t qsubmit_count = 0;
   uint32_t used_queue_mask = 0;
   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
      struct panvk_cmd_buffer *cmdbuf =
         container_of(submit->command_buffers[i], struct panvk_cmd_buffer, vk);

      for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
         assert(cs_is_valid(&cmdbuf->state.cs[j].builder));
         if (!cs_is_empty(&cmdbuf->state.cs[j].builder)) {
            used_queue_mask |= BITFIELD_BIT(j);
            qsubmit_count++;
         }
      }
   }

   /* Synchronize all subqueues if we have no command buffer submitted. */
   if (!qsubmit_count)
      used_queue_mask = BITFIELD_MASK(PANVK_SUBQUEUE_COUNT);

   /* We add sync-only queue submits to place our wait/signal operations. */
   if (submit->wait_count > 0)
      qsubmit_count += util_bitcount(used_queue_mask);

   if (submit->signal_count > 0 || force_sync)
      qsubmit_count += util_bitcount(used_queue_mask);
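   /* Example of the accounting above (illustrative numbers): one command
    * buffer with non-empty vertex-tiler and fragment streams, submitted with
    * both waits and signals, yields 2 CS submits + 2 sync-only wait submits +
    * 2 sync-only signal submits = 6 queue submits. The signal term also
    * covers force_sync, which appends the same per-subqueue signal submits
    * even when the application asked for no signals. */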

   uint32_t syncop_count = submit->wait_count + util_bitcount(used_queue_mask);

   STACK_ARRAY(struct drm_panthor_queue_submit, qsubmits, qsubmit_count);
   STACK_ARRAY(struct drm_panthor_sync_op, syncops, syncop_count);
   struct drm_panthor_sync_op *wait_ops = syncops;
   struct drm_panthor_sync_op *signal_ops = syncops + submit->wait_count;

   qsubmit_count = 0;
   if (submit->wait_count) {
      for (uint32_t i = 0; i < submit->wait_count; i++) {
         assert(vk_sync_type_is_drm_syncobj(submit->waits[i].sync->type));
         struct vk_drm_syncobj *syncobj =
            vk_sync_as_drm_syncobj(submit->waits[i].sync);

         wait_ops[i] = (struct drm_panthor_sync_op){
            .flags = (submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE
                         ? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
                         : DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ) |
                     DRM_PANTHOR_SYNC_OP_WAIT,
            .handle = syncobj->syncobj,
            .timeline_value = submit->waits[i].wait_value,
         };
      }

      for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
         if (used_queue_mask & BITFIELD_BIT(i)) {
            qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
               .queue_index = i,
               .syncs = DRM_PANTHOR_OBJ_ARRAY(submit->wait_count, wait_ops),
            };
         }
      }
   }

   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
      struct panvk_cmd_buffer *cmdbuf =
         container_of(submit->command_buffers[i], struct panvk_cmd_buffer, vk);

      for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
         if (cs_is_empty(&cmdbuf->state.cs[j].builder))
            continue;

         qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
            .queue_index = j,
            .stream_size = cs_root_chunk_size(&cmdbuf->state.cs[j].builder),
            .stream_addr = cs_root_chunk_gpu_addr(&cmdbuf->state.cs[j].builder),
            .latest_flush = 0,
         };
      }
   }

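   /* Each used subqueue gets its own sync-only submit below, signalling
    * successive points 1..N (N = number of used subqueues) on the queue's
    * internal timeline syncobj. The force_sync path further down waits on
    * point N, and the user-visible signal syncobjs are then satisfied by
    * transferring from this internal timeline. */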
   if (submit->signal_count || force_sync) {
      uint32_t signal_op = 0;
      for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
         if (used_queue_mask & BITFIELD_BIT(i)) {
            signal_ops[signal_op] = (struct drm_panthor_sync_op){
               .flags = DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ |
                        DRM_PANTHOR_SYNC_OP_SIGNAL,
               .handle = queue->syncobj_handle,
               .timeline_value = signal_op + 1,
            };

            qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
               .queue_index = i,
               .syncs = DRM_PANTHOR_OBJ_ARRAY(1, &signal_ops[signal_op++]),
            };
         }
      }
   }

   if (force_sync) {
      struct panvk_cs_sync32 *debug_syncs =
         panvk_priv_mem_host_addr(queue->debug_syncobjs);

      assert(debug_syncs);
      memset(debug_syncs, 0, sizeof(*debug_syncs) * PANVK_SUBQUEUE_COUNT);
   }

   struct drm_panthor_group_submit gsubmit = {
      .group_handle = queue->group_handle,
      .queue_submits = DRM_PANTHOR_OBJ_ARRAY(qsubmit_count, qsubmits),
   };

   ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
   if (ret) {
      result = vk_queue_set_lost(&queue->vk, "GROUP_SUBMIT: %m");
      goto out;
   }

   if (submit->signal_count || force_sync) {
      if (force_sync) {
         uint64_t point = util_bitcount(used_queue_mask);
         ret = drmSyncobjTimelineWait(dev->vk.drm_fd, &queue->syncobj_handle,
                                      &point, 1, INT64_MAX,
                                      DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
         assert(!ret);
      }

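      /* Propagate the result to the user-visible sync objects: each signal is
       * satisfied by copying the internal syncobj's payload into the user
       * syncobj at the requested signal value, then the internal timeline is
       * reset for the next submission (source point 0 in drmSyncobjTransfer
       * is taken to mean "the latest fence"; that reading is an assumption). */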
      for (uint32_t i = 0; i < submit->signal_count; i++) {
         assert(vk_sync_type_is_drm_syncobj(submit->signals[i].sync->type));
         struct vk_drm_syncobj *syncobj =
            vk_sync_as_drm_syncobj(submit->signals[i].sync);

         drmSyncobjTransfer(dev->vk.drm_fd, syncobj->syncobj,
                            submit->signals[i].signal_value,
                            queue->syncobj_handle, 0, 0);
      }

      drmSyncobjReset(dev->vk.drm_fd, &queue->syncobj_handle, 1);
   }

   if (debug & PANVK_DEBUG_TRACE) {
      for (uint32_t i = 0; i < qsubmit_count; i++) {
         if (!qsubmits[i].stream_size)
            continue;

         uint32_t subqueue = qsubmits[i].queue_index;
         uint32_t regs[256] = {0};
         uint64_t ctx =
            panvk_priv_mem_dev_addr(queue->subqueues[subqueue].context);

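         /* Mirror what init_subqueue() sets up on the GPU side: pre-load the
          * subqueue context address into the register pair the CS expects, so
          * the decoder sees the same state the hardware would. */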
         regs[PANVK_CS_REG_SUBQUEUE_CTX_START] = ctx;
         regs[PANVK_CS_REG_SUBQUEUE_CTX_START + 1] = ctx >> 32;

         simple_mtx_lock(&dev->debug.decode_ctx->lock);
         pandecode_dump_file_open(dev->debug.decode_ctx);
         pandecode_log(dev->debug.decode_ctx, "CS%d\n",
                       qsubmits[i].queue_index);
         simple_mtx_unlock(&dev->debug.decode_ctx->lock);
         pandecode_cs(dev->debug.decode_ctx, qsubmits[i].stream_addr,
                      qsubmits[i].stream_size, phys_dev->kmod.props.gpu_prod_id,
                      regs);
      }
   }

   if (debug & PANVK_DEBUG_DUMP)
      pandecode_dump_mappings(dev->debug.decode_ctx);

   if (force_sync) {
      struct panvk_cs_sync32 *debug_syncs =
         panvk_priv_mem_host_addr(queue->debug_syncobjs);
      uint32_t debug_sync_points[PANVK_SUBQUEUE_COUNT] = {0};

      for (uint32_t i = 0; i < qsubmit_count; i++) {
         if (qsubmits[i].stream_size)
            debug_sync_points[qsubmits[i].queue_index]++;
      }

      for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
         if (debug_syncs[i].seqno != debug_sync_points[i] ||
             debug_syncs[i].error != 0)
            assert(!"Incomplete job or timeout\n");
      }
   }

   if (debug & PANVK_DEBUG_TRACE)
      pandecode_next_frame(dev->debug.decode_ctx);

out:
   STACK_ARRAY_FINISH(syncops);
   STACK_ARRAY_FINISH(qsubmits);
   return result;
}

VkResult
panvk_per_arch(queue_init)(struct panvk_device *dev, struct panvk_queue *queue,
                           int idx, const VkDeviceQueueCreateInfo *create_info)
{
   const struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);

   VkResult result = vk_queue_init(&queue->vk, &dev->vk, create_info, idx);
   if (result != VK_SUCCESS)
      return result;

   int ret = drmSyncobjCreate(dev->vk.drm_fd, 0, &queue->syncobj_handle);
   if (ret) {
      result = vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to create our internal sync object");
      goto err_finish_queue;
   }

   result = init_tiler(queue);
   if (result != VK_SUCCESS)
      goto err_destroy_syncobj;

   result = create_group(queue);
   if (result != VK_SUCCESS)
      goto err_cleanup_tiler;

   result = init_queue(queue);
   if (result != VK_SUCCESS)
      goto err_destroy_group;

   queue->vk.driver_submit = panvk_queue_submit;
   return VK_SUCCESS;

err_destroy_group:
   destroy_group(queue);

err_cleanup_tiler:
   cleanup_tiler(queue);

err_destroy_syncobj:
   drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);

err_finish_queue:
   vk_queue_finish(&queue->vk);
   return result;
}

void
panvk_per_arch(queue_finish)(struct panvk_queue *queue)
{
   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);

   destroy_group(queue);
   cleanup_tiler(queue);
   drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);
   vk_queue_finish(&queue->vk);
}