xref: /aosp_15_r20/external/mesa3d/src/freedreno/vulkan/tu_knl_drm_virtio.cc (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2018 Google, Inc.
3  * Copyright © 2015 Intel Corporation
4  * SPDX-License-Identifier: MIT
5  *
6  * Kernel interface layer for turnip running on virtio_gpu (aka virtgpu)
7  */
8 
9 #include "tu_knl.h"
10 
11 #include <errno.h>
12 #include <fcntl.h>
13 #include <sys/ioctl.h>
14 #include <sys/mman.h>
15 #include <xf86drm.h>
16 
17 #include "vk_util.h"
18 
19 #include "drm-uapi/msm_drm.h"
20 #include "drm-uapi/virtgpu_drm.h"
21 #include "util/u_debug.h"
22 #include "util/hash_table.h"
23 #include "util/libsync.h"
24 #include "util/u_process.h"
25 
26 #include "tu_cmd_buffer.h"
27 #include "tu_cs.h"
28 #include "tu_device.h"
29 #include "tu_dynamic_rendering.h"
30 #include "tu_knl_drm.h"
31 
32 #include "virglrenderer_hw.h"
33 #include "msm_proto.h"
34 
35 #include "vdrm.h"
36 
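/* Recycled table of pre-built GPU commands that write the userspace fence
 * seqno; one slot is patched with the new fence value and appended to each
 * submit (see tu_queue_build_msm_gem_submit_cmds()).
 */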
37 struct tu_userspace_fence_cmd {
38    uint32_t pkt[4];    /* first 4 dwords of packet */
39    uint32_t fence;     /* fifth dword is fence value which is plugged in at runtime */
40    uint32_t _pad[11];
41 };
42 
43 struct tu_userspace_fence_cmds {
44    struct tu_userspace_fence_cmd cmds[64];
45 };
46 
47 struct tu_virtio_queue_submit {
48    struct vk_queue_submit *vk_submit;
49    struct tu_u_trace_submission_data *u_trace_submission_data;
50 
51    struct tu_cmd_buffer **cmd_buffers;
52    struct drm_msm_gem_submit_cmd *cmds;
53    struct drm_virtgpu_execbuffer_syncobj *in_syncobjs;
54    struct drm_virtgpu_execbuffer_syncobj *out_syncobjs;
55 
56    uint32_t nr_cmd_buffers;
57    uint32_t nr_in_syncobjs;
58    uint32_t nr_out_syncobjs;
59    uint32_t entry_count;
60    uint32_t perf_pass_index;
61 
62    bool     autotune_fence;
63 };
64 
65 struct tu_u_trace_syncobj {
66    uint32_t msm_queue_id;
67    uint32_t fence;
68 };
69 
70 struct tu_virtio_device {
71    struct vdrm_device *vdrm;
72    struct msm_shmem *shmem;
73    uint32_t next_blob_id;
74 
75    struct tu_userspace_fence_cmds *fence_cmds;
76    struct tu_bo *fence_cmds_mem;
77 
78    /**
79     * Processing zombie VMAs is a two-step process: first we clear the iova
80     * and then we close the handles.  But to minimize waste of virtqueue
81     * space (and associated stalling and ping-ponging between guest and host)
82     * we want to batch up all the GEM_SET_IOVA ccmds before we flush them to
83     * the host and start closing handles.
84     *
85     * This gives us a place to stash the VMAs between the two steps.
86     */
87    struct u_vector zombie_vmas_stage_2;
88 };
89 
90 static int tu_drm_get_param(struct tu_device *dev, uint32_t param, uint64_t *value);
91 
92 /**
93  * Helper for simple pass-thru ioctls
94  */
95 static int
96 virtio_simple_ioctl(struct tu_device *dev, unsigned cmd, void *_req)
97 {
98    MESA_TRACE_FUNC();
99    struct vdrm_device *vdrm = dev->vdev->vdrm;
100    unsigned req_len = sizeof(struct msm_ccmd_ioctl_simple_req);
101    unsigned rsp_len = sizeof(struct msm_ccmd_ioctl_simple_rsp);
102 
103    req_len += _IOC_SIZE(cmd);
104    if (cmd & IOC_OUT)
105       rsp_len += _IOC_SIZE(cmd);
106 
107    uint8_t buf[req_len];
108    struct msm_ccmd_ioctl_simple_req *req = (struct msm_ccmd_ioctl_simple_req *)buf;
109    struct msm_ccmd_ioctl_simple_rsp *rsp;
110 
111    req->hdr = MSM_CCMD(IOCTL_SIMPLE, req_len);
112    req->cmd = cmd;
113    memcpy(req->payload, _req, _IOC_SIZE(cmd));
114 
115    rsp = (struct msm_ccmd_ioctl_simple_rsp *)
116          vdrm_alloc_rsp(vdrm, &req->hdr, rsp_len);
117 
118    int ret = vdrm_send_req(vdrm, &req->hdr, true);
119 
120    if (cmd & IOC_OUT)
121       memcpy(_req, rsp->payload, _IOC_SIZE(cmd));
122 
123    ret = rsp->ret;
124 
125    return ret;
126 }
127 
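/* Bind or unbind the GPU VA of a host GEM object via a GEM_SET_IOVA ccmd;
 * an iova of zero clears the mapping (used by the zombie VMA cleanup below).
 */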
128 static int
129 set_iova(struct tu_device *device, uint32_t res_id, uint64_t iova)
130 {
131    struct msm_ccmd_gem_set_iova_req req = {
132          .hdr = MSM_CCMD(GEM_SET_IOVA, sizeof(req)),
133          .iova = iova,
134          .res_id = res_id,
135    };
136 
137    return vdrm_send_req(device->vdev->vdrm, &req.hdr, false);
138 }
139 
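/* Read the accumulated GPU fault count, preferring the values mirrored into
 * the shared shmem page (async_error + global_faults) and falling back to a
 * GET_PARAM round-trip when the host does not expose those fields.
 */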
140 static int
141 query_faults(struct tu_device *dev, uint64_t *value)
142 {
143    struct tu_virtio_device *vdev = dev->vdev;
144    uint32_t async_error = 0;
145    uint64_t global_faults;
146 
147    if (vdrm_shmem_has_field(vdev->shmem, async_error))
148       async_error = vdev->shmem->async_error;
149 
150    if (vdrm_shmem_has_field(vdev->shmem, global_faults)) {
151       global_faults = vdev->shmem->global_faults;
152    } else {
153       int ret = tu_drm_get_param(dev, MSM_PARAM_FAULTS, &global_faults);
154       if (ret)
155          return ret;
156    }
157 
158    *value = global_faults + async_error;
159 
160    return 0;
161 }
162 
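/* Send the guest process name and cmdline to the host via a SET_DEBUGINFO
 * ccmd.
 */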
163 static void
164 set_debuginfo(struct tu_device *dev)
165 {
166    const char *comm = util_get_process_name();
167    static char cmdline[0x1000+1];
168    int fd = open("/proc/self/cmdline", O_RDONLY);
169    if (fd < 0)
170       return;
171 
172    int n = read(fd, cmdline, sizeof(cmdline) - 1);
173    if (n < 0)
174       return;
175 
176    /* arguments are separated by NUL bytes, convert to spaces: */
177    for (int i = 0; i < n; i++) {
178       if (cmdline[i] == '\0') {
179          cmdline[i] = ' ';
180       }
181    }
182 
183    cmdline[n] = '\0';
184 
185    unsigned comm_len = strlen(comm) + 1;
186    unsigned cmdline_len = strlen(cmdline) + 1;
187 
188    struct msm_ccmd_set_debuginfo_req *req;
189 
190    unsigned req_len = align(sizeof(*req) + comm_len + cmdline_len, 4);
191 
192    req = (struct msm_ccmd_set_debuginfo_req *)malloc(req_len);
193 
194    req->hdr         = MSM_CCMD(SET_DEBUGINFO, req_len);
195    req->comm_len    = comm_len;
196    req->cmdline_len = cmdline_len;
197 
198    memcpy(&req->payload[0], comm, comm_len);
199    memcpy(&req->payload[comm_len], cmdline, cmdline_len);
200 
201    vdrm_send_req(dev->vdev->vdrm, &req->hdr, false);
202 
203    free(req);
204 }
205 
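/* Logical device init: open the DRM node, connect vdrm with the MSM context
 * type, snapshot the initial fault count and report guest debuginfo.
 */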
206 static VkResult
207 virtio_device_init(struct tu_device *dev)
208 {
209    struct tu_instance *instance = dev->physical_device->instance;
210    int fd;
211 
212    fd = open(dev->physical_device->fd_path, O_RDWR | O_CLOEXEC);
213    if (fd < 0) {
214       return vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
215                                "failed to open device %s", dev->physical_device->fd_path);
216    }
217 
218    struct tu_virtio_device *vdev = (struct tu_virtio_device *)
219             vk_zalloc(&instance->vk.alloc, sizeof(*vdev), 8,
220                       VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
221    if (!vdev) {
222       close(fd);
223       return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
224    }
225 
226    u_vector_init(&vdev->zombie_vmas_stage_2, 64, sizeof(struct tu_zombie_vma));
227 
228    dev->vdev = vdev;
229    dev->fd = fd;
230 
231    vdev->vdrm = vdrm_device_connect(fd, VIRTGPU_DRM_CONTEXT_MSM);
232 
233    p_atomic_set(&vdev->next_blob_id, 1);
234    vdev->shmem = to_msm_shmem(vdev->vdrm->shmem);
235 
236    query_faults(dev, &dev->fault_count);
237 
238    set_debuginfo(dev);
239 
240    return VK_SUCCESS;
241 }
242 
243 static void
244 virtio_device_finish(struct tu_device *dev)
245 {
246    struct tu_instance *instance = dev->physical_device->instance;
247    struct tu_virtio_device *vdev = dev->vdev;
248 
249    u_vector_finish(&vdev->zombie_vmas_stage_2);
250 
251    vdrm_device_close(vdev->vdrm);
252 
253    vk_free(&instance->vk.alloc, vdev);
254    dev->vdev = NULL;
255 
256    close(dev->fd);
257 }
258 
259 static int
260 tu_drm_get_param(struct tu_device *dev, uint32_t param, uint64_t *value)
261 {
262    /* Technically this requires a pipe, but the kernel only supports one pipe
263     * anyway at the time of writing and most of these are clearly pipe
264     * independent. */
265    struct drm_msm_param req = {
266       .pipe = MSM_PIPE_3D0,
267       .param = param,
268    };
269 
270    int ret = virtio_simple_ioctl(dev, DRM_IOCTL_MSM_GET_PARAM, &req);
271    if (ret)
272       return ret;
273 
274    *value = req.value;
275 
276    return 0;
277 }
278 
279 static int
280 virtio_device_get_gpu_timestamp(struct tu_device *dev, uint64_t *ts)
281 {
282    return tu_drm_get_param(dev, MSM_PARAM_TIMESTAMP, ts);
283 }
284 
285 static int
286 virtio_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count)
287 {
288    int ret = tu_drm_get_param(dev, MSM_PARAM_SUSPENDS, suspend_count);
289    return ret;
290 }
291 
292 static VkResult
293 virtio_device_check_status(struct tu_device *device)
294 {
295    uint64_t last_fault_count = device->fault_count;
296 
297    query_faults(device, &device->fault_count);
298 
299    if (last_fault_count != device->fault_count)
300       return vk_device_set_lost(&device->vk, "GPU faulted or hung");
301 
302    return VK_SUCCESS;
303 }
304 
305 static int
306 virtio_submitqueue_new(struct tu_device *dev,
307                        int priority,
308                        uint32_t *queue_id)
309 {
310    assert(priority >= 0 &&
311           priority < dev->physical_device->submitqueue_priority_count);
312 
313    struct drm_msm_submitqueue req = {
314       .flags = 0,
315       .prio = priority,
316    };
317 
318    int ret = virtio_simple_ioctl(dev, DRM_IOCTL_MSM_SUBMITQUEUE_NEW, &req);
319    if (ret)
320       return ret;
321 
322    *queue_id = req.id;
323    return 0;
324 }
325 
326 static void
327 virtio_submitqueue_close(struct tu_device *dev, uint32_t queue_id)
328 {
329    virtio_simple_ioctl(dev, DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE, &queue_id);
330 }
331 
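/* Wait for a userspace fence seqno.  Fast path: check the fence value the
 * GPU writes into the global bo.  Otherwise poll the host with WAIT_FENCE
 * ccmds until the timeout expires.
 */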
332 static VkResult
333 tu_wait_fence(struct tu_device *dev,
334               uint32_t queue_id,
335               int fence,
336               uint64_t timeout_ns)
337 {
338    struct vdrm_device *vdrm = dev->vdev->vdrm;
339 
340    if (!fence_before(dev->global_bo_map->userspace_fence, fence))
341       return VK_SUCCESS;
342 
343    if (!timeout_ns)
344       return VK_TIMEOUT;
345 
346    MESA_TRACE_FUNC();
347 
348    struct msm_ccmd_wait_fence_req req = {
349          .hdr = MSM_CCMD(WAIT_FENCE, sizeof(req)),
350          .queue_id = queue_id,
351          .fence = fence,
352    };
353    struct msm_ccmd_submitqueue_query_rsp *rsp;
354    int64_t end_time = os_time_get_nano() + timeout_ns;
355    int ret;
356 
357    do {
358       rsp = (struct msm_ccmd_submitqueue_query_rsp *)
359             vdrm_alloc_rsp(vdrm, &req.hdr, sizeof(*rsp));
360 
361       ret = vdrm_send_req(vdrm, &req.hdr, true);
362       if (ret)
363          goto out;
364 
365       if (os_time_get_nano() >= end_time)
366          break;
367 
368       ret = rsp->ret;
369    } while (ret == -ETIMEDOUT);
370 
371 out:
372    if (!ret) return VK_SUCCESS;
373    if (ret == -ETIMEDOUT) return VK_TIMEOUT;
374    return VK_ERROR_UNKNOWN;
375 }
376 
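/* Release finished zombie VMAs.  With wait=true, block (up to 3s) on the
 * oldest zombie's fence first; otherwise only objects whose fence has
 * already signaled are freed.
 */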
377 static VkResult
378 tu_free_zombie_vma_locked(struct tu_device *dev, bool wait)
379 {
380    struct tu_virtio_device *vdev = dev->vdev;
381 
382    if (!u_vector_length(&dev->zombie_vmas))
383       return VK_SUCCESS;
384 
385    if (wait) {
386       struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
387             u_vector_head(&dev->zombie_vmas);
388       /* Wait for 3s (arbitrary timeout) */
389       VkResult ret = tu_wait_fence(dev, dev->queues[0]->msm_queue_id,
390                                    vma->fence, 3000000000);
391 
392       if (ret != VK_SUCCESS)
393          return ret;
394    }
395 
396    /* Clear the iova of all finished objects in first pass so the SET_IOVA
397     * ccmd's can be buffered and sent together to the host.  *Then* delete
398     * the handles.  This avoids filling up the virtqueue with tiny messages,
399     * since each execbuf ends up needing to be page aligned.
400     */
401    int last_signaled_fence = -1;
402    while (u_vector_length(&dev->zombie_vmas) > 0) {
403       struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
404             u_vector_tail(&dev->zombie_vmas);
405       if (vma->fence > last_signaled_fence) {
406          VkResult ret =
407             tu_wait_fence(dev, dev->queues[0]->msm_queue_id, vma->fence, 0);
408          if (ret != VK_SUCCESS)
409             break;
410 
411          last_signaled_fence = vma->fence;
412       }
413 
414       u_vector_remove(&dev->zombie_vmas);
415 
416       if (vma->gem_handle) {
417          set_iova(dev, vma->res_id, 0);
418 
419          struct tu_zombie_vma *vma2 =
420             (struct tu_zombie_vma *) u_vector_add(&vdev->zombie_vmas_stage_2);
421 
422          *vma2 = *vma;
423       }
424    }
425 
426    /* And _then_ close the GEM handles: */
427    while (u_vector_length(&vdev->zombie_vmas_stage_2) > 0) {
428       struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
429             u_vector_remove(&vdev->zombie_vmas_stage_2);
430 
431       util_vma_heap_free(&dev->vma, vma->iova, vma->size);
432       vdrm_bo_close(dev->vdev->vdrm, vma->gem_handle);
433    }
434 
435    return VK_SUCCESS;
436 }
437 
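/* If a dmabuf import matches a pending zombie VMA, re-use its iova and mark
 * the zombie so the later vdrm bo and iova cleanup is skipped.
 */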
438 static bool
439 tu_restore_from_zombie_vma_locked(struct tu_device *dev,
440                                   uint32_t gem_handle,
441                                   uint64_t *iova)
442 {
443    struct tu_zombie_vma *vma;
444    u_vector_foreach (vma, &dev->zombie_vmas) {
445       if (vma->gem_handle == gem_handle) {
446          *iova = vma->iova;
447 
448          /* mark to skip later vdrm bo and iova cleanup */
449          vma->gem_handle = 0;
450          return true;
451       }
452    }
453 
454    return false;
455 }
456 
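/* Allocate a GPU VA from the userspace-managed heap.  Dmabuf imports first
 * try to re-use the iova of a matching zombie VMA; if a requested
 * client_iova is still tied up by zombie VMAs, wait for them and retry.
 */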
457 static VkResult
458 virtio_allocate_userspace_iova_locked(struct tu_device *dev,
459                                       uint32_t gem_handle,
460                                       uint64_t size,
461                                       uint64_t client_iova,
462                                       enum tu_bo_alloc_flags flags,
463                                       uint64_t *iova)
464 {
465    VkResult result;
466 
467    *iova = 0;
468 
469    if (flags & TU_BO_ALLOC_DMABUF) {
470       assert(gem_handle);
471 
472       if (tu_restore_from_zombie_vma_locked(dev, gem_handle, iova))
473          return VK_SUCCESS;
474    }
475 
476    tu_free_zombie_vma_locked(dev, false);
477 
478    result = tu_allocate_userspace_iova(dev, size, client_iova, flags, iova);
479    if (result == VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS) {
480       /* Address may be already freed by us, but not considered as
481        * freed by the kernel. We have to wait until all work that
482        * may hold the address is done. Since addresses are meant to
483        * be replayed only by debug tooling, it should be ok to wait.
484        */
485       tu_free_zombie_vma_locked(dev, true);
486       result = tu_allocate_userspace_iova(dev, size, client_iova, flags, iova);
487    }
488 
489    return result;
490 }
491 
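/* Common tail of BO creation: bind the iova on the host, then add the BO to
 * the device-global bo_list used at submit time, growing the list as needed.
 */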
492 static VkResult
493 tu_bo_init(struct tu_device *dev,
494            struct vk_object_base *base,
495            struct tu_bo *bo,
496            uint32_t gem_handle,
497            uint64_t size,
498            uint64_t iova,
499            enum tu_bo_alloc_flags flags,
500            const char *name)
501 {
502    assert(dev->physical_device->has_set_iova);
503 
504    set_iova(dev, bo->res_id, iova);
505 
506    name = tu_debug_bos_add(dev, size, name);
507 
508    mtx_lock(&dev->bo_mutex);
509    uint32_t idx = dev->bo_count++;
510 
511    /* grow the bo list if needed */
512    if (idx >= dev->bo_list_size) {
513       uint32_t new_len = idx + 64;
514       struct drm_msm_gem_submit_bo *new_ptr = (struct drm_msm_gem_submit_bo *)
515          vk_realloc(&dev->vk.alloc, dev->bo_list, new_len * sizeof(*dev->bo_list),
516                     8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
517       if (!new_ptr) {
518          dev->bo_count--;
519          mtx_unlock(&dev->bo_mutex);
520          vdrm_bo_close(dev->vdev->vdrm, bo->gem_handle);
521          return VK_ERROR_OUT_OF_HOST_MEMORY;
522       }
523 
524       dev->bo_list = new_ptr;
525       dev->bo_list_size = new_len;
526    }
527 
528    bool dump = flags & TU_BO_ALLOC_ALLOW_DUMP;
529    dev->bo_list[idx] = (struct drm_msm_gem_submit_bo) {
530       .flags = MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE |
531                COND(dump, MSM_SUBMIT_BO_DUMP),
532       .handle = bo->res_id,
533       .presumed = iova,
534    };
535 
536    *bo = (struct tu_bo) {
537       .gem_handle = gem_handle,
538       .res_id = bo->res_id,
539       .size = size,
540       .iova = iova,
541       .name = name,
542       .refcnt = 1,
543       .bo_list_idx = idx,
544       .base = base,
545    };
546 
547    mtx_unlock(&dev->bo_mutex);
548 
549    return VK_SUCCESS;
550 }
551 
552 /**
553  * Sets the name in the kernel so that the contents of /debug/dri/0/gem are more
554  * useful.
555  *
556  * We skip this on release builds (when we're also not doing BO debugging) to
557  * reduce overhead.
558  */
559 static void
560 tu_bo_set_kernel_name(struct tu_device *dev, struct tu_bo *bo, const char *name)
561 {
562    bool kernel_bo_names = dev->bo_sizes != NULL;
563 #if MESA_DEBUG
564    kernel_bo_names = true;
565 #endif
566    if (!kernel_bo_names)
567       return;
568 
569    size_t sz = strlen(name);
570 
571    unsigned req_len = sizeof(struct msm_ccmd_gem_set_name_req) + align(sz, 4);
572 
573    uint8_t buf[req_len];
574    struct msm_ccmd_gem_set_name_req *req = (struct msm_ccmd_gem_set_name_req *)buf;
575 
576    req->hdr = MSM_CCMD(GEM_SET_NAME, req_len);
577    req->res_id = bo->res_id;
578    req->len = sz;
579 
580    memcpy(req->payload, name, sz);
581 
582    vdrm_send_req(dev->vdev->vdrm, &req->hdr, false);
583 }
584 
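/* Allocate a new GEM buffer: derive caching and blob flags from the
 * requested memory properties, reserve an iova, then create the host BO via
 * a tunneled GEM_NEW ccmd linked to the virtgpu blob by blob_id.
 */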
585 static VkResult
586 virtio_bo_init(struct tu_device *dev,
587                struct vk_object_base *base,
588                struct tu_bo **out_bo,
589                uint64_t size,
590                uint64_t client_iova,
591                VkMemoryPropertyFlags mem_property,
592                enum tu_bo_alloc_flags flags,
593                const char *name)
594 {
595    struct tu_virtio_device *vdev = dev->vdev;
596    struct msm_ccmd_gem_new_req req = {
597          .hdr = MSM_CCMD(GEM_NEW, sizeof(req)),
598          .size = size,
599    };
600    VkResult result;
601    uint32_t res_id;
602    struct tu_bo *bo;
603 
604    if (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) {
605       if (mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) {
606          req.flags |= MSM_BO_CACHED_COHERENT;
607       } else {
608          req.flags |= MSM_BO_CACHED;
609       }
610    } else {
611       req.flags |= MSM_BO_WC;
612    }
613 
614    uint32_t blob_flags = 0;
615    if (mem_property & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
616       blob_flags |= VIRTGPU_BLOB_FLAG_USE_MAPPABLE;
617    }
618 
619    if (!(mem_property & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT)) {
620       blob_flags |= VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE |
621             VIRTGPU_BLOB_FLAG_USE_SHAREABLE;
622    }
623 
624    if (flags & TU_BO_ALLOC_GPU_READ_ONLY)
625       req.flags |= MSM_BO_GPU_READONLY;
626 
627    assert(!(flags & TU_BO_ALLOC_DMABUF));
628 
629    mtx_lock(&dev->vma_mutex);
630    result = virtio_allocate_userspace_iova_locked(dev, 0, size, client_iova,
631                                                   flags, &req.iova);
632    mtx_unlock(&dev->vma_mutex);
633 
634    if (result != VK_SUCCESS)
635       return result;
636 
637    /* Tunneled cmds are processed separately on the host side, before the
638     * renderer->get_blob() callback; the blob_id is used to link the created
639     * bo to the get_blob() call.
640     */
641    req.blob_id = p_atomic_inc_return(&vdev->next_blob_id);
642 
643    uint32_t handle =
644       vdrm_bo_create(vdev->vdrm, size, blob_flags, req.blob_id, &req.hdr);
645 
646    if (!handle) {
647       result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
648       goto fail;
649    }
650 
651    res_id = vdrm_handle_to_res_id(vdev->vdrm, handle);
652    bo = tu_device_lookup_bo(dev, res_id);
653    assert(bo && bo->gem_handle == 0);
654 
655    bo->res_id = res_id;
656 
657    result = tu_bo_init(dev, base, bo, handle, size, req.iova, flags, name);
658    if (result != VK_SUCCESS) {
659       memset(bo, 0, sizeof(*bo));
660       goto fail;
661    }
662 
663    *out_bo = bo;
664 
665    /* We don't use bo->name here because for the !TU_DEBUG=bo case bo->name is NULL. */
666    tu_bo_set_kernel_name(dev, bo, name);
667 
668    if ((mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) &&
669        !(mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
670       tu_bo_map(dev, bo, NULL);
671 
672       /* Cached non-coherent memory may already have dirty cache lines,
673        * so we should clean them before the GPU gets a chance to write
674        * into this memory.
675        *
676        * MSM already does this automatically for uncached (MSM_BO_WC) memory.
677        */
678       tu_bo_sync_cache(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
679    }
680 
681    return VK_SUCCESS;
682 
683 fail:
684    mtx_lock(&dev->vma_mutex);
685    util_vma_heap_free(&dev->vma, req.iova, size);
686    mtx_unlock(&dev->vma_mutex);
687    return result;
688 }
689 
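/* Import a dmabuf.  Importing an already-known resource just bumps the
 * refcount; otherwise allocate an iova for the object's real size and
 * initialize the BO, all under dma_bo_lock to avoid racing with BO release.
 */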
690 static VkResult
691 virtio_bo_init_dmabuf(struct tu_device *dev,
692                    struct tu_bo **out_bo,
693                    uint64_t size,
694                    int prime_fd)
695 {
696    struct vdrm_device *vdrm = dev->vdev->vdrm;
697    VkResult result;
698    struct tu_bo* bo = NULL;
699 
700    /* lseek() to get the real size */
701    off_t real_size = lseek(prime_fd, 0, SEEK_END);
702    lseek(prime_fd, 0, SEEK_SET);
703    if (real_size < 0 || (uint64_t) real_size < size)
704       return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
705 
706    /* iova allocation needs to consider the object's *real* size: */
707    size = real_size;
708 
709    /* Importing the same dmabuf several times would yield the same
710     * gem_handle. Thus there could be a race when destroying
711     * BO and importing the same dmabuf from different threads.
712     * We must not permit the creation of dmabuf BO and its release
713     * to happen in parallel.
714     */
715    u_rwlock_wrlock(&dev->dma_bo_lock);
716    mtx_lock(&dev->vma_mutex);
717 
718    uint32_t handle, res_id;
719    uint64_t iova;
720 
721    handle = vdrm_dmabuf_to_handle(vdrm, prime_fd);
722    if (!handle) {
723       result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
724       goto out_unlock;
725    }
726 
727    res_id = vdrm_handle_to_res_id(vdrm, handle);
728    if (!res_id) {
729       /* XXX gem_handle potentially leaked here since no refcnt */
730       result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
731       goto out_unlock;
732    }
733 
734    bo = tu_device_lookup_bo(dev, res_id);
735 
736    if (bo->refcnt != 0) {
737       p_atomic_inc(&bo->refcnt);
738       assert(bo->res_id == res_id);
739       *out_bo = bo;
740       result = VK_SUCCESS;
741       goto out_unlock;
742    }
743 
744    bo->res_id = res_id;
745 
746    result = virtio_allocate_userspace_iova_locked(dev, handle, size, 0,
747                                                   TU_BO_ALLOC_DMABUF, &iova);
748    if (result != VK_SUCCESS) {
749       vdrm_bo_close(dev->vdev->vdrm, handle);
750       goto out_unlock;
751    }
752 
753    result =
754       tu_bo_init(dev, NULL, bo, handle, size, iova, TU_BO_ALLOC_NO_FLAGS, "dmabuf");
755    if (result != VK_SUCCESS) {
756       util_vma_heap_free(&dev->vma, iova, size);
757       memset(bo, 0, sizeof(*bo));
758    } else {
759       *out_bo = bo;
760    }
761 
762 out_unlock:
763    mtx_unlock(&dev->vma_mutex);
764    u_rwlock_wrunlock(&dev->dma_bo_lock);
765    return result;
766 }
767 
768 static VkResult
769 virtio_bo_map(struct tu_device *dev, struct tu_bo *bo, void *placed_addr)
770 {
771    bo->map = vdrm_bo_map(dev->vdev->vdrm, bo->gem_handle, bo->size, placed_addr);
772    if (bo->map == MAP_FAILED)
773       return vk_error(dev, VK_ERROR_MEMORY_MAP_FAILED);
774 
775    return VK_SUCCESS;
776 }
777 
778 static void
779 virtio_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo)
780 {
781    mtx_lock(&dev->bo_mutex);
782    dev->bo_list[bo->bo_list_idx].flags |= MSM_SUBMIT_BO_DUMP;
783    mtx_unlock(&dev->bo_mutex);
784 }
785 
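/* Build the CPU-side submit description: count the cmdstream entries needed
 * (perf pass, per-cmdbuf IBs, u_trace timestamp copies, autotune fence and
 * the trailing userspace fence cmd) and allocate the cmd and syncobj arrays.
 */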
786 static VkResult
787 tu_queue_submit_create_locked(struct tu_queue *queue,
788                               struct vk_queue_submit *vk_submit,
789                               const uint32_t nr_in_syncobjs,
790                               const uint32_t nr_out_syncobjs,
791                               uint32_t perf_pass_index,
792                               struct tu_virtio_queue_submit *new_submit)
793 {
794    VkResult result;
795 
796    bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
797    bool has_trace_points = false;
798 
799    struct vk_command_buffer **vk_cmd_buffers = vk_submit->command_buffers;
800 
801    memset(new_submit, 0, sizeof(struct tu_virtio_queue_submit));
802 
803    new_submit->cmd_buffers = (struct tu_cmd_buffer **) vk_cmd_buffers;
804    new_submit->nr_cmd_buffers = vk_submit->command_buffer_count;
805    tu_insert_dynamic_cmdbufs(queue->device, &new_submit->cmd_buffers,
806                              &new_submit->nr_cmd_buffers);
807 
808    uint32_t entry_count = 0;
809    for (uint32_t j = 0; j < new_submit->nr_cmd_buffers; ++j) {
810       struct tu_cmd_buffer *cmdbuf = new_submit->cmd_buffers[j];
811 
812       if (perf_pass_index != ~0)
813          entry_count++;
814 
815       entry_count += cmdbuf->cs.entry_count;
816 
817       if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) {
818          if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
819             entry_count++;
820 
821          has_trace_points = true;
822       }
823    }
824 
825    new_submit->autotune_fence =
826       tu_autotune_submit_requires_fence(new_submit->cmd_buffers, new_submit->nr_cmd_buffers);
827    if (new_submit->autotune_fence)
828       entry_count++;
829 
830    /* Add one for the userspace fence cmd: */
831    entry_count += 1;
832 
833    new_submit->cmds = (struct drm_msm_gem_submit_cmd *) vk_zalloc(
834       &queue->device->vk.alloc, entry_count * sizeof(*new_submit->cmds), 8,
835       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
836 
837    if (new_submit->cmds == NULL) {
838       result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
839       goto fail_cmds;
840    }
841 
842    if (has_trace_points) {
843       result =
844          tu_u_trace_submission_data_create(
845             queue->device, new_submit->cmd_buffers,
846             new_submit->nr_cmd_buffers,
847             &new_submit->u_trace_submission_data);
848 
849       if (result != VK_SUCCESS) {
850          goto fail_u_trace_submission_data;
851       }
852    }
853 
854    /* Allocate without wait timeline semaphores */
855    new_submit->in_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc(
856       &queue->device->vk.alloc,
857       nr_in_syncobjs * sizeof(*new_submit->in_syncobjs), 8,
858       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
859 
860    if (new_submit->in_syncobjs == NULL) {
861       result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
862       goto fail_in_syncobjs;
863    }
864 
865    /* Allocate with signal timeline semaphores considered */
866    new_submit->out_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc(
867       &queue->device->vk.alloc,
868       nr_out_syncobjs * sizeof(*new_submit->out_syncobjs), 8,
869       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
870 
871    if (new_submit->out_syncobjs == NULL) {
872       result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
873       goto fail_out_syncobjs;
874    }
875 
876    new_submit->entry_count = entry_count;
877    new_submit->nr_in_syncobjs = nr_in_syncobjs;
878    new_submit->nr_out_syncobjs = nr_out_syncobjs;
879    new_submit->perf_pass_index = perf_pass_index;
880    new_submit->vk_submit = vk_submit;
881 
882    return VK_SUCCESS;
883 
884 fail_out_syncobjs:
885    vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs);
886 fail_in_syncobjs:
887    if (new_submit->u_trace_submission_data)
888       tu_u_trace_submission_data_finish(queue->device,
889                                         new_submit->u_trace_submission_data);
890 fail_u_trace_submission_data:
891    vk_free(&queue->device->vk.alloc, new_submit->cmds);
892 fail_cmds:
893    return result;
894 }
895 
896 static void
897 tu_queue_submit_finish(struct tu_queue *queue, struct tu_virtio_queue_submit *submit)
898 {
899    vk_free(&queue->device->vk.alloc, submit->cmds);
900    vk_free(&queue->device->vk.alloc, submit->in_syncobjs);
901    vk_free(&queue->device->vk.alloc, submit->out_syncobjs);
902    if (submit->cmd_buffers != (void *) submit->vk_submit->command_buffers)
903       vk_free(&queue->device->vk.alloc, submit->cmd_buffers);
904 }
905 
906 static void
907 tu_fill_msm_gem_submit(struct tu_device *dev,
908                        struct drm_msm_gem_submit_cmd *cmd,
909                        struct tu_cs_entry *cs_entry)
910 {
911    cmd->type = MSM_SUBMIT_CMD_BUF;
912    cmd->submit_idx = cs_entry->bo->bo_list_idx;
913    cmd->submit_offset = cs_entry->offset;
914    cmd->size = cs_entry->size;
915    cmd->pad = 0;
916    cmd->nr_relocs = 0;
917    cmd->relocs = 0;
918 }
919 
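/* Fill the drm_msm_gem_submit_cmd array in submission order, ending with
 * the recycled userspace fence cmd patched with the new fence seqno.
 */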
920 static void
921 tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
922                                    struct tu_virtio_queue_submit *submit,
923                                    struct tu_cs *autotune_cs)
924 {
925    struct tu_device *dev = queue->device;
926    struct tu_virtio_device *vdev = dev->vdev;
927    struct drm_msm_gem_submit_cmd *cmds = submit->cmds;
928 
929    uint32_t entry_idx = 0;
930    for (uint32_t j = 0; j < submit->nr_cmd_buffers; ++j) {
931       struct tu_device *dev = queue->device;
932       struct tu_cmd_buffer *cmdbuf = submit->cmd_buffers[j];
933       struct tu_cs *cs = &cmdbuf->cs;
934 
935       if (submit->perf_pass_index != ~0) {
936          struct tu_cs_entry *perf_cs_entry =
937             &dev->perfcntrs_pass_cs_entries[submit->perf_pass_index];
938 
939          tu_fill_msm_gem_submit(dev, &cmds[entry_idx], perf_cs_entry);
940          entry_idx++;
941       }
942 
943       for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) {
944          tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]);
945       }
946 
947       if (submit->u_trace_submission_data) {
948          struct tu_cs *ts_cs =
949             submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs;
950          if (ts_cs) {
951             tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &ts_cs->entries[0]);
952             entry_idx++;
953          }
954       }
955    }
956 
957    if (autotune_cs) {
958       assert(autotune_cs->entry_count == 1);
959       tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]);
960       entry_idx++;
961    }
962 
963    /* Last, add the userspace fence cmd: */
964    struct tu_userspace_fence_cmds *fcmds = vdev->fence_cmds;
965    if (queue->fence <= 0)
966       queue->fence = 0;
967    uint32_t fence = ++queue->fence;
968    int idx = fence % ARRAY_SIZE(fcmds->cmds);
969 
970    /* Wait for the previous use of this fence cmd slot to be idle.  In practice
971     * the table of recycled cmds should be big enough that we never stall here:
972     */
973    tu_wait_fence(dev, dev->queues[0]->msm_queue_id, fcmds->cmds[idx].fence, 3000000000);
974 
975    fcmds->cmds[idx].fence = fence;
976 
977    cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
978    cmds[entry_idx].submit_idx = vdev->fence_cmds_mem->bo_list_idx;
979    cmds[entry_idx].submit_offset = ((intptr_t)&fcmds->cmds[idx]) - (intptr_t)fcmds;
980    cmds[entry_idx].size = 5 * 4;
981    cmds[entry_idx].pad = 0;
982    cmds[entry_idx].nr_relocs = 0;
983    cmds[entry_idx].relocs = 0;
984 }
985 
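/* Lazily allocate the table of recycled userspace fence cmds.  Each slot is
 * a pre-baked CACHE_FLUSH_TS event write that stores its fence seqno to the
 * userspace_fence dword in the global bo.
 */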
986 static VkResult
987 setup_fence_cmds(struct tu_device *dev)
988 {
989    struct tu_virtio_device *vdev = dev->vdev;
990    VkResult result;
991 
992    result = tu_bo_init_new(dev, NULL, &vdev->fence_cmds_mem,
993                            sizeof(*vdev->fence_cmds), (enum tu_bo_alloc_flags)
994                               (TU_BO_ALLOC_ALLOW_DUMP | TU_BO_ALLOC_GPU_READ_ONLY),
995                            "fence_cmds");
996    if (result != VK_SUCCESS)
997       return result;
998 
999    result = tu_bo_map(dev, vdev->fence_cmds_mem, NULL);
1000    if (result != VK_SUCCESS)
1001       return result;
1002 
1003    vdev->fence_cmds = (struct tu_userspace_fence_cmds *)vdev->fence_cmds_mem->map;
1004 
1005    uint64_t fence_iova = dev->global_bo->iova + gb_offset(userspace_fence);
1006    for (int i = 0; i < ARRAY_SIZE(vdev->fence_cmds->cmds); i++) {
1007       struct tu_userspace_fence_cmd *c = &vdev->fence_cmds->cmds[i];
1008 
1009       memset(c, 0, sizeof(*c));
1010 
1011       if (fd_dev_gen(&dev->physical_device->dev_id) >= A7XX) {
1012          c->pkt[0] = pm4_pkt7_hdr((uint8_t)CP_EVENT_WRITE7, 4);
1013          c->pkt[1] = CP_EVENT_WRITE7_0(.event = CACHE_FLUSH_TS,
1014                            .write_src = EV_WRITE_USER_32B,
1015                            .write_dst = EV_DST_RAM,
1016                            .write_enabled = true).value;
1017       } else {
1018          c->pkt[0] = pm4_pkt7_hdr((uint8_t)CP_EVENT_WRITE, 4);
1019          c->pkt[1] = CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS);
1020       }
1021       c->pkt[2] = fence_iova;
1022       c->pkt[3] = fence_iova >> 32;
1023    }
1024 
1025    return result;
1026 }
1027 
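/* Perform the actual submit: build a GEM_SUBMIT ccmd from the device bo
 * list and the prepared cmds, execbuffer it through vdrm with the in/out
 * syncobjs, then flush u_trace data and update timeline sync states.
 */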
1028 static VkResult
1029 tu_queue_submit_locked(struct tu_queue *queue, struct tu_virtio_queue_submit *submit)
1030 {
1031    struct tu_virtio_device *vdev = queue->device->vdev;
1032 
1033    queue->device->submit_count++;
1034 
1035    /* It would be nice to not need to defer this, but virtio_device_init()
1036     * happens before the device is initialized enough to allocate normal
1037     * GEM buffers
1038     */
1039    if (!vdev->fence_cmds) {
1040       VkResult result = setup_fence_cmds(queue->device);
1041       if (result != VK_SUCCESS)
1042          return result;
1043    }
1044 
1045    struct tu_cs *autotune_cs = NULL;
1046    if (submit->autotune_fence) {
1047       autotune_cs = tu_autotune_on_submit(queue->device,
1048                                           &queue->device->autotune,
1049                                           submit->cmd_buffers,
1050                                           submit->nr_cmd_buffers);
1051    }
1052 
1053    uint32_t flags = MSM_PIPE_3D0;
1054 
1055    if (submit->vk_submit->wait_count)
1056       flags |= MSM_SUBMIT_SYNCOBJ_IN;
1057 
1058    if (submit->vk_submit->signal_count)
1059       flags |= MSM_SUBMIT_SYNCOBJ_OUT;
1060 
1061    mtx_lock(&queue->device->bo_mutex);
1062 
1063    if (queue->device->implicit_sync_bo_count == 0)
1064       flags |= MSM_SUBMIT_NO_IMPLICIT;
1065 
1066    /* drm_msm_gem_submit_cmd requires the bo's index, which can change at any
1067     * time while bo_mutex is not held.  So build the submit cmds here, at the
1068     * actual point of submission.
1069     */
1070    tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs);
1071 
1072    /* TODO avoid extra memcpy, and populate bo's and cmds directly
1073     * into the req msg
1074     */
1075    unsigned nr_cmds = submit->entry_count;
1076    unsigned nr_bos = nr_cmds ? queue->device->bo_count : 0;
1077    unsigned bos_len = nr_bos * sizeof(struct drm_msm_gem_submit_bo);
1078    unsigned cmd_len = nr_cmds * sizeof(struct drm_msm_gem_submit_cmd);
1079    unsigned req_len = sizeof(struct msm_ccmd_gem_submit_req) + bos_len + cmd_len;
1080    struct msm_ccmd_gem_submit_req *req = (struct msm_ccmd_gem_submit_req *)vk_alloc(
1081          &queue->device->vk.alloc, req_len, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1082 
1083    if (!req) {
1084       mtx_unlock(&queue->device->bo_mutex);
1085       return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
1086    }
1087 
1088    req->hdr      = MSM_CCMD(GEM_SUBMIT, req_len);
1089    req->flags    = flags;
1090    req->queue_id = queue->msm_queue_id;
1091    req->nr_bos   = nr_bos;
1092    req->nr_cmds  = nr_cmds;
1093 
1094    /* Use same kernel fence and userspace fence seqno to avoid having
1095     * to track both:
1096     */
1097    req->fence    = queue->fence;
1098 
1099    memcpy(req->payload, queue->device->bo_list, bos_len);
1100    memcpy(req->payload + bos_len, submit->cmds, cmd_len);
1101 
1102    int ring_idx = queue->priority + 1;
1103    int ret;
1104 
1105    struct vdrm_execbuf_params p = {
1106       .ring_idx = ring_idx,
1107       .req = &req->hdr,
1108       .in_syncobjs = submit->in_syncobjs,
1109       .out_syncobjs = submit->out_syncobjs,
1110       .num_in_syncobjs = submit->nr_in_syncobjs,
1111       .num_out_syncobjs = submit->nr_out_syncobjs,
1112    };
1113 
1114    ret = vdrm_execbuf(vdev->vdrm, &p);
1115 
1116    mtx_unlock(&queue->device->bo_mutex);
1117 
1118    tu_debug_bos_print_stats(queue->device);
1119 
1120    if (ret)
1121       return vk_device_set_lost(&queue->device->vk, "submit failed: %m");
1122 
1123    uint64_t gpu_offset = 0;
1124 #if HAVE_PERFETTO
1125    struct tu_perfetto_clocks clocks =
1126       tu_perfetto_submit(queue->device, queue->device->submit_count, NULL);
1127    gpu_offset = clocks.gpu_ts_offset;
1128 #endif
1129 
1130    if (submit->u_trace_submission_data) {
1131       struct tu_u_trace_submission_data *submission_data =
1132          submit->u_trace_submission_data;
1133       submission_data->submission_id = queue->device->submit_count;
1134       submission_data->gpu_ts_offset = gpu_offset;
1135       /* We have to allocate it here since it is different between drm/kgsl */
1136       submission_data->syncobj = (struct tu_u_trace_syncobj *)
1137          vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
1138                8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1139       submission_data->syncobj->fence = req->fence;
1140       submission_data->syncobj->msm_queue_id = queue->msm_queue_id;
1141 
1142       submit->u_trace_submission_data = NULL;
1143 
1144       for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) {
1145          bool free_data = i == submission_data->last_buffer_with_tracepoints;
1146          if (submission_data->cmd_trace_data[i].trace)
1147             u_trace_flush(submission_data->cmd_trace_data[i].trace,
1148                           submission_data, queue->device->vk.current_frame,
1149                           free_data);
1150 
1151          if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) {
1152             /* u_trace is owned by cmd_buffer */
1153             submission_data->cmd_trace_data[i].trace = NULL;
1154          }
1155       }
1156    }
1157 
1158    for (uint32_t i = 0; i < submit->vk_submit->wait_count; i++) {
1159       if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->waits[i].sync))
1160          continue;
1161 
1162       struct tu_timeline_sync *sync =
1163          container_of(submit->vk_submit->waits[i].sync, struct tu_timeline_sync, base);
1164 
1165       assert(sync->state != TU_TIMELINE_SYNC_STATE_RESET);
1166 
1167       /* Set the wait timeline sync's state to SIGNALED: the syncobj is done and
1168        * ready again, so it can be garbage collected later.
1169        */
1170       sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED;
1171    }
1172 
1173    for (uint32_t i = 0; i < submit->vk_submit->signal_count; i++) {
1174       if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->signals[i].sync))
1175          continue;
1176 
1177       struct tu_timeline_sync *sync =
1178          container_of(submit->vk_submit->signals[i].sync, struct tu_timeline_sync, base);
1179 
1180       assert(sync->state == TU_TIMELINE_SYNC_STATE_RESET);
1181       /* Set the signal timeline sync's state to SUBMITTED so we can wait on this
1182        * timeline sync until it completes, if necessary.
1183        */
1184       sync->state = TU_TIMELINE_SYNC_STATE_SUBMITTED;
1185    }
1186 
1187    pthread_cond_broadcast(&queue->device->timeline_cond);
1188 
1189    return VK_SUCCESS;
1190 }
1191 
1192 static VkResult
1193 virtio_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj)
1194 {
1195    return tu_wait_fence(dev, syncobj->msm_queue_id, syncobj->fence, 1000000000);
1196 }
1197 
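/* vk_queue submit entry point: translate vk_sync waits/signals into virtgpu
 * syncobj descriptors and hand off to tu_queue_submit_locked() under the
 * device submit mutex.
 */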
1198 static VkResult
1199 virtio_queue_submit(struct tu_queue *queue, struct vk_queue_submit *submit)
1200 {
1201    MESA_TRACE_FUNC();
1202    uint32_t perf_pass_index = queue->device->perfcntrs_pass_cs ?
1203                               submit->perf_pass_index : ~0;
1204    struct tu_virtio_queue_submit submit_req;
1205 
1206    if (TU_DEBUG(LOG_SKIP_GMEM_OPS)) {
1207       tu_dbg_log_gmem_load_store_skips(queue->device);
1208    }
1209 
1210    pthread_mutex_lock(&queue->device->submit_mutex);
1211 
1212    VkResult ret = tu_queue_submit_create_locked(queue, submit,
1213          submit->wait_count, submit->signal_count,
1214          perf_pass_index, &submit_req);
1215 
1216    if (ret != VK_SUCCESS) {
1217       pthread_mutex_unlock(&queue->device->submit_mutex);
1218       return ret;
1219    }
1220 
1221    /* note: assuming there won't be any very large semaphore counts */
1222    struct drm_virtgpu_execbuffer_syncobj *in_syncobjs = submit_req.in_syncobjs;
1223    struct drm_virtgpu_execbuffer_syncobj *out_syncobjs = submit_req.out_syncobjs;
1224 
1225    uint32_t nr_in_syncobjs = 0, nr_out_syncobjs = 0;
1226 
1227    for (uint32_t i = 0; i < submit->wait_count; i++) {
1228       struct vk_sync *sync = submit->waits[i].sync;
1229 
1230       in_syncobjs[nr_in_syncobjs++] = (struct drm_virtgpu_execbuffer_syncobj) {
1231          .handle = tu_syncobj_from_vk_sync(sync),
1232          .flags = 0,
1233          .point = submit->waits[i].wait_value,
1234       };
1235    }
1236 
1237    for (uint32_t i = 0; i < submit->signal_count; i++) {
1238       struct vk_sync *sync = submit->signals[i].sync;
1239 
1240       out_syncobjs[nr_out_syncobjs++] = (struct drm_virtgpu_execbuffer_syncobj) {
1241          .handle = tu_syncobj_from_vk_sync(sync),
1242          .flags = 0,
1243          .point = submit->signals[i].signal_value,
1244       };
1245    }
1246 
1247    ret = tu_queue_submit_locked(queue, &submit_req);
1248 
1249    pthread_mutex_unlock(&queue->device->submit_mutex);
1250    tu_queue_submit_finish(queue, &submit_req);
1251 
1252    if (ret != VK_SUCCESS)
1253        return ret;
1254 
1255    u_trace_context_process(&queue->device->trace_context, false);
1256 
1257    return VK_SUCCESS;
1258 }
1259 
1260 static const struct tu_knl virtio_knl_funcs = {
1261       .name = "virtgpu",
1262 
1263       .device_init = virtio_device_init,
1264       .device_finish = virtio_device_finish,
1265       .device_get_gpu_timestamp = virtio_device_get_gpu_timestamp,
1266       .device_get_suspend_count = virtio_device_get_suspend_count,
1267       .device_check_status = virtio_device_check_status,
1268       .submitqueue_new = virtio_submitqueue_new,
1269       .submitqueue_close = virtio_submitqueue_close,
1270       .bo_init = virtio_bo_init,
1271       .bo_init_dmabuf = virtio_bo_init_dmabuf,
1272       .bo_export_dmabuf = tu_drm_export_dmabuf,
1273       .bo_map = virtio_bo_map,
1274       .bo_allow_dump = virtio_bo_allow_dump,
1275       .bo_finish = tu_drm_bo_finish,
1276       .device_wait_u_trace = virtio_device_wait_u_trace,
1277       .queue_submit = virtio_queue_submit,
1278 };
1279 
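/* Probe entry point: connect to the virtgpu MSM context, validate the wire
 * format and host driver version, and populate the physical device from the
 * capset (GPU id, GMEM size/base, VA range, priorities).
 */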
1280 VkResult
1281 tu_knl_drm_virtio_load(struct tu_instance *instance,
1282                        int fd, struct _drmVersion *version,
1283                        struct tu_physical_device **out)
1284 {
1285    struct virgl_renderer_capset_drm caps;
1286    struct vdrm_device *vdrm;
1287    VkResult result = VK_SUCCESS;
1288    uint64_t val;
1289 
1290    /* Debug option to force fallback to venus: */
1291    if (debug_get_bool_option("TU_NO_VIRTIO", false))
1292       return VK_ERROR_INCOMPATIBLE_DRIVER;
1293 
1294    if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &val) || !val) {
1295       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
1296                                "kernel driver for device %s does not support DRM_CAP_SYNCOBJ",
1297                                version->name);
1298    }
1299 
1300    vdrm = vdrm_device_connect(fd, VIRTGPU_DRM_CONTEXT_MSM);
1301    if (!vdrm) {
1302       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
1303                                "could not connect vdrm: %s", strerror(errno));
1304    }
1305 
1306    caps = vdrm->caps;
1307 
1308    vdrm_device_close(vdrm);
1309 
1310    mesa_logd("wire_format_version: %u", caps.wire_format_version);
1311    mesa_logd("version_major:       %u", caps.version_major);
1312    mesa_logd("version_minor:       %u", caps.version_minor);
1313    mesa_logd("version_patchlevel:  %u", caps.version_patchlevel);
1314    mesa_logd("has_cached_coherent: %u", caps.u.msm.has_cached_coherent);
1315    mesa_logd("va_start:            0x%0" PRIx64, caps.u.msm.va_start);
1316    mesa_logd("va_size:             0x%0" PRIx64, caps.u.msm.va_size);
1317    mesa_logd("gpu_id:              %u", caps.u.msm.gpu_id);
1318    mesa_logd("gmem_size:           %u", caps.u.msm.gmem_size);
1319    mesa_logd("gmem_base:           0x%0" PRIx64, caps.u.msm.gmem_base);
1320    mesa_logd("chip_id:             0x%0" PRIx64, caps.u.msm.chip_id);
1321    mesa_logd("max_freq:            %u", caps.u.msm.max_freq);
1322 
1323    if (caps.wire_format_version != 2) {
1324       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
1325                                "Unsupported protocol version: %u",
1326                                caps.wire_format_version);
1327    }
1328 
1329    if ((caps.version_major != 1) || (caps.version_minor < 9)) {
1330       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
1331                                "unsupported version: %u.%u.%u",
1332                                caps.version_major,
1333                                caps.version_minor,
1334                                caps.version_patchlevel);
1335    }
1336 
1337    if (!caps.u.msm.va_size) {
1338       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
1339                                "No address space");
1340    }
1341 
1342    struct tu_physical_device *device = (struct tu_physical_device *)
1343       vk_zalloc(&instance->vk.alloc, sizeof(*device), 8,
1344                 VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
1345    if (!device) {
1346       result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1347       goto fail;
1348    }
1349 
1350    device->msm_major_version = caps.version_major;
1351    device->msm_minor_version = caps.version_minor;
1352 
1353    device->instance = instance;
1354    device->local_fd = fd;
1355 
1356    device->dev_id.gpu_id  = caps.u.msm.gpu_id;
1357    device->dev_id.chip_id = caps.u.msm.chip_id;
1358    device->gmem_size      = caps.u.msm.gmem_size;
1359    device->gmem_base      = caps.u.msm.gmem_base;
1360    device->va_start       = caps.u.msm.va_start;
1361    device->va_size        = caps.u.msm.va_size;
1362    device->has_set_iova   = true;
1363 
1364    device->gmem_size = debug_get_num_option("TU_GMEM", device->gmem_size);
1365 
1366    device->has_cached_coherent_memory = caps.u.msm.has_cached_coherent;
1367 
1368    device->submitqueue_priority_count = caps.u.msm.priorities;
1369 
1370    device->syncobj_type = vk_drm_syncobj_get_type(fd);
1371    /* we don't support DRM_CAP_SYNCOBJ_TIMELINE, but drm-shim does */
1372    if (!(device->syncobj_type.features & VK_SYNC_FEATURE_TIMELINE))
1373       device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type);
1374 
1375    device->sync_types[0] = &device->syncobj_type;
1376    device->sync_types[1] = &device->timeline_type.sync;
1377    device->sync_types[2] = NULL;
1378 
1379    device->heap.size = tu_get_system_heap_size(device);
1380    device->heap.used = 0u;
1381    device->heap.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT;
1382 
1383    instance->knl = &virtio_knl_funcs;
1384 
1385    *out = device;
1386 
1387    return VK_SUCCESS;
1388 
1389 fail:
1390    vk_free(&instance->vk.alloc, device);
1391    return result;
1392 }
1393