/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_device.h"

#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_opcodes.h"
#include "agx_scratch.h"
#include "hk_cmd_buffer.h"
#include "hk_descriptor_table.h"
#include "hk_entrypoints.h"
#include "hk_instance.h"
#include "hk_physical_device.h"
#include "hk_shader.h"

#include "asahi/genxml/agx_pack.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/lib/shaders/geometry.h"
#include "util/hash_table.h"
#include "util/os_file.h"
#include "util/ralloc.h"
#include "util/simple_mtx.h"
#include "vulkan/vulkan_core.h"
#include "vulkan/wsi/wsi_common.h"

#include "vk_cmd_enqueue_entrypoints.h"
#include "vk_common_entrypoints.h"
#include "vk_pipeline_cache.h"

#include <fcntl.h>
#include <xf86drm.h>

/*
 * We preupload some constants so we can cheaply reference them later without
 * extra allocation and copying.
 *
 * TODO: This is small, don't waste a whole BO.
 */
static VkResult
hk_upload_rodata(struct hk_device *dev)
{
   dev->rodata.bo =
      agx_bo_create(&dev->dev, AGX_SAMPLER_LENGTH, 0, 0, "Read only data");

   if (!dev->rodata.bo)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint8_t *map = dev->rodata.bo->map;
   uint32_t offs = 0;

   offs = align(offs, 8);
   agx_pack(&dev->rodata.txf_sampler, USC_SAMPLER, cfg) {
      cfg.start = 0;
      cfg.count = 1;
      cfg.buffer = dev->rodata.bo->va->addr + offs;
   }

   agx_pack_txf_sampler((struct agx_sampler_packed *)(map + offs));
   offs += AGX_SAMPLER_LENGTH;

   /* The image heap is allocated on the device prior to the rodata. The heap
    * lives as long as the device does and has a stable address (requiring
    * sparse binding to grow dynamically). That means its address is
    * effectively rodata and can be uploaded now. agx_usc_uniform requires an
    * indirection to push the heap address, so this takes care of that
    * indirection up front to cut an alloc/upload at draw time.
    */
   offs = align(offs, sizeof(uint64_t));
   agx_pack(&dev->rodata.image_heap, USC_UNIFORM, cfg) {
      cfg.start_halfs = HK_IMAGE_HEAP_UNIFORM;
      cfg.size_halfs = 4;
      cfg.buffer = dev->rodata.bo->va->addr + offs;
   }

   uint64_t *image_heap_ptr = dev->rodata.bo->map + offs;
   *image_heap_ptr = dev->images.bo->va->addr;
   offs += sizeof(uint64_t);

   /* The geometry state buffer isn't strictly read-only data, but we only
    * have a single instance of it device-wide and -- after initializing at
    * heap allocate time -- it is read-only from the CPU perspective. The GPU
    * uses it for scratch, but is required to reset it after use to ensure
    * resubmitting the same command buffer works.
    *
    * So, we allocate it here for convenience.
    */
   offs = align(offs, sizeof(uint64_t));
   dev->rodata.geometry_state = dev->rodata.bo->va->addr + offs;
   offs += sizeof(struct agx_geometry_state);

   /* For null readonly buffers, we need to allocate 16 bytes of zeroes for
    * robustness2 semantics on read.
    */
   offs = align(offs, 16);
   dev->rodata.zero_sink = dev->rodata.bo->va->addr + offs;
   memset(dev->rodata.bo->map + offs, 0, 16);
   offs += 16;

   /* For null storage descriptors, we need to reserve 16 bytes to catch
    * writes. No particular content is required; we cannot get robustness2
    * semantics without more work.
    */
   offs = align(offs, 16);
   dev->rodata.null_sink = dev->rodata.bo->va->addr + offs;
   offs += 16;

   return VK_SUCCESS;
}
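/*
 * Internal (meta) shaders are cached in a hash table keyed on the builder
 * function pointer plus a variable-length key blob: the hash below covers the
 * hk_internal_key header together with its trailing key bytes, and equality
 * compares the builder, the key size, and the key contents.
 */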
static uint32_t
internal_key_hash(const void *key_)
{
   const struct hk_internal_key *key = key_;

   return _mesa_hash_data(key, sizeof(struct hk_internal_key) + key->key_size);
}

static bool
internal_key_equal(const void *a_, const void *b_)
{
   const struct hk_internal_key *a = a_;
   const struct hk_internal_key *b = b_;

   return a->builder == b->builder && a->key_size == b->key_size &&
          memcmp(a->key, b->key, a->key_size) == 0;
}

static VkResult
hk_init_internal_shaders(struct hk_internal_shaders *s)
{
   s->ht = _mesa_hash_table_create(NULL, internal_key_hash, internal_key_equal);
   if (!s->ht)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   simple_mtx_init(&s->lock, mtx_plain);
   return VK_SUCCESS;
}

static void
hk_destroy_internal_shaders(struct hk_device *dev,
                            struct hk_internal_shaders *s, bool part)
{
   hash_table_foreach(s->ht, ent) {
      if (part) {
         struct agx_shader_part *part = ent->data;
         free(part->binary);

         /* The agx_shader_part itself is ralloc'd against the hash table so
          * will be freed.
          */
      } else {
         struct hk_api_shader *obj = ent->data;
         hk_api_shader_destroy(&dev->vk, &obj->vk, NULL);
      }
   }

   _mesa_hash_table_destroy(s->ht, NULL);
   simple_mtx_destroy(&s->lock);
}

DERIVE_HASH_TABLE(agx_sampler_packed);

static VkResult
hk_init_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
   h->ht = agx_sampler_packed_table_create(NULL);
   if (!h->ht)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   VkResult result =
      hk_descriptor_table_init(dev, &h->table, AGX_SAMPLER_LENGTH, 1024, 1024);
   if (result != VK_SUCCESS) {
      ralloc_free(h->ht);
      return result;
   }

   simple_mtx_init(&h->lock, mtx_plain);
   return VK_SUCCESS;
}

static void
hk_destroy_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
   hk_descriptor_table_finish(dev, &h->table);
   ralloc_free(h->ht);
   simple_mtx_destroy(&h->lock);
}

static VkResult
hk_sampler_heap_add_locked(struct hk_device *dev, struct hk_sampler_heap *h,
                           struct agx_sampler_packed desc,
                           struct hk_rc_sampler **out)
{
   struct hash_entry *ent = _mesa_hash_table_search(h->ht, &desc);
   if (ent != NULL) {
      *out = ent->data;

      assert((*out)->refcount != 0);
      (*out)->refcount++;

      return VK_SUCCESS;
   }

   struct hk_rc_sampler *rc = ralloc(h->ht, struct hk_rc_sampler);
   if (!rc)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t index;
   VkResult result =
      hk_descriptor_table_add(dev, &h->table, &desc, sizeof(desc), &index);
   if (result != VK_SUCCESS) {
      ralloc_free(rc);
      return result;
   }

   *rc = (struct hk_rc_sampler){
      .key = desc,
      .refcount = 1,
      .index = index,
   };

   _mesa_hash_table_insert(h->ht, &rc->key, rc);
   *out = rc;
   return VK_SUCCESS;
}

VkResult
hk_sampler_heap_add(struct hk_device *dev, struct agx_sampler_packed desc,
                    struct hk_rc_sampler **out)
{
   struct hk_sampler_heap *h = &dev->samplers;

   simple_mtx_lock(&h->lock);
   VkResult result = hk_sampler_heap_add_locked(dev, h, desc, out);
   simple_mtx_unlock(&h->lock);

   return result;
}

static void
hk_sampler_heap_remove_locked(struct hk_device *dev, struct hk_sampler_heap *h,
                              struct hk_rc_sampler *rc)
{
   assert(rc->refcount != 0);
   rc->refcount--;

   if (rc->refcount == 0) {
      hk_descriptor_table_remove(dev, &h->table, rc->index);
      _mesa_hash_table_remove_key(h->ht, &rc->key);
      ralloc_free(rc);
   }
}

void
hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc)
{
   struct hk_sampler_heap *h = &dev->samplers;

   simple_mtx_lock(&h->lock);
   hk_sampler_heap_remove_locked(dev, h, rc);
   simple_mtx_unlock(&h->lock);
}
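/*
 * Usage sketch (hypothetical callers) for the refcounted sampler heap above:
 * identical packed descriptors are deduplicated to a single heap slot, so
 * adding the same descriptor twice bumps the refcount rather than consuming a
 * second table index, and the slot is only recycled once every reference is
 * dropped.
 *
 *    struct hk_rc_sampler *rc;
 *    hk_sampler_heap_add(dev, desc, &rc);    // new entry, refcount = 1
 *    hk_sampler_heap_add(dev, desc, &rc);    // same entry, refcount = 2
 *    hk_sampler_heap_remove(dev, rc);        // refcount = 1
 *    hk_sampler_heap_remove(dev, rc);        // freed, table index released
 */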
/*
 * To implement nullDescriptor, the descriptor set code will reference
 * preuploaded null descriptors at fixed offsets in the image heap. Here we
 * upload those descriptors, initializing the image heap.
 */
static void
hk_upload_null_descriptors(struct hk_device *dev)
{
   struct agx_texture_packed null_tex;
   struct agx_pbe_packed null_pbe;
   uint32_t offset_tex, offset_pbe;

   agx_set_null_texture(&null_tex, dev->rodata.null_sink);
   agx_set_null_pbe(&null_pbe, dev->rodata.null_sink);

   hk_descriptor_table_add(dev, &dev->images, &null_tex, sizeof(null_tex),
                           &offset_tex);
   hk_descriptor_table_add(dev, &dev->images, &null_pbe, sizeof(null_pbe),
                           &offset_pbe);

   assert((offset_tex * HK_IMAGE_STRIDE) == HK_NULL_TEX_OFFSET && "static");
   assert((offset_pbe * HK_IMAGE_STRIDE) == HK_NULL_PBE_OFFSET && "static");
}
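/*
 * Device creation: set up the dispatch tables (routing secondary command
 * buffer recording through vk_cmd_enqueue), open the DRM render node, then
 * bring up the device-wide descriptor heaps, rodata, internal shader caches,
 * queue, and meta state in order, unwinding through the fail_* labels on any
 * failure.
 */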
VKAPI_ATTR VkResult VKAPI_CALL
hk_CreateDevice(VkPhysicalDevice physicalDevice,
                const VkDeviceCreateInfo *pCreateInfo,
                const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
{
   VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct hk_device *dev;

   dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator, sizeof(*dev), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;

   /* For secondary command buffer support, overwrite any command entrypoints
    * in the main device-level dispatch table with
    * vk_cmd_enqueue_unless_primary_Cmd*.
    */
   vk_device_dispatch_table_from_entrypoints(
      &dispatch_table, &vk_cmd_enqueue_unless_primary_device_entrypoints, true);

   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &hk_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   /* Populate primary cmd_dispatch table */
   vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
                                             &hk_device_entrypoints, true);
   vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
                                             &wsi_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(
      &dev->cmd_dispatch, &vk_common_device_entrypoints, false);

   result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table, pCreateInfo,
                           pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   dev->vk.shader_ops = &hk_device_shader_ops;
   dev->vk.command_dispatch_table = &dev->cmd_dispatch;

   drmDevicePtr drm_device = NULL;
   int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device);
   if (ret != 0) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to get DRM device: %m");
      goto fail_init;
   }

   const char *path = drm_device->nodes[DRM_NODE_RENDER];
   dev->dev.fd = open(path, O_RDWR | O_CLOEXEC);
   if (dev->dev.fd < 0) {
      drmFreeDevice(&drm_device);
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "failed to open device %s", path);
      goto fail_init;
   }

   bool succ = agx_open_device(NULL, &dev->dev);
   drmFreeDevice(&drm_device);
   if (!succ) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "failed to initialize AGX device: %m");
      goto fail_fd;
   }

   vk_device_set_drm_fd(&dev->vk, dev->dev.fd);
   dev->vk.command_buffer_ops = &hk_cmd_buffer_ops;

   result = hk_descriptor_table_init(dev, &dev->images, AGX_TEXTURE_LENGTH,
                                     1024, 1024 * 1024);
   if (result != VK_SUCCESS)
      goto fail_dev;

   result = hk_init_sampler_heap(dev, &dev->samplers);
   if (result != VK_SUCCESS)
      goto fail_images;

   result = hk_descriptor_table_init(
      dev, &dev->occlusion_queries, sizeof(uint64_t),
      AGX_MAX_OCCLUSION_QUERIES, AGX_MAX_OCCLUSION_QUERIES);
   if (result != VK_SUCCESS)
      goto fail_samplers;

   result = hk_upload_rodata(dev);
   if (result != VK_SUCCESS)
      goto fail_queries;

   /* Depends on rodata */
   hk_upload_null_descriptors(dev);

   /* XXX: error handling, and should this even go on the device? */
   agx_bg_eot_init(&dev->bg_eot, &dev->dev);
   if (!dev->bg_eot.ht) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_rodata;
   }

   result = hk_init_internal_shaders(&dev->prolog_epilog);
   if (result != VK_SUCCESS)
      goto fail_bg_eot;

   result = hk_init_internal_shaders(&dev->kernels);
   if (result != VK_SUCCESS)
      goto fail_internal_shaders;

   result =
      hk_queue_init(dev, &dev->queue, &pCreateInfo->pQueueCreateInfos[0], 0);
   if (result != VK_SUCCESS)
      goto fail_internal_shaders_2;

   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };

   dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queue;
   }

   result = hk_device_init_meta(dev);
   if (result != VK_SUCCESS)
      goto fail_mem_cache;

   *pDevice = hk_device_to_handle(dev);

   simple_mtx_init(&dev->scratch.lock, mtx_plain);
   agx_scratch_init(&dev->dev, &dev->scratch.vs);
   agx_scratch_init(&dev->dev, &dev->scratch.fs);
   agx_scratch_init(&dev->dev, &dev->scratch.cs);

   return VK_SUCCESS;

   /* Tear down in strict reverse order of initialization, with each label
    * falling through to the next.
    */
fail_mem_cache:
   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
fail_queue:
   hk_queue_finish(dev, &dev->queue);
fail_internal_shaders_2:
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
fail_internal_shaders:
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);
fail_bg_eot:
   agx_bg_eot_cleanup(&dev->bg_eot);
fail_rodata:
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
fail_queries:
   hk_descriptor_table_finish(dev, &dev->occlusion_queries);
fail_samplers:
   hk_destroy_sampler_heap(dev, &dev->samplers);
fail_images:
   hk_descriptor_table_finish(dev, &dev->images);
fail_dev:
   agx_close_device(&dev->dev);
fail_fd:
   close(dev->dev.fd);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
hk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(hk_device, dev, _device);

   if (!dev)
      return;

   hk_device_finish_meta(dev);
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);

   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
   hk_queue_finish(dev, &dev->queue);
   vk_device_finish(&dev->vk);

   agx_scratch_fini(&dev->scratch.vs);
   agx_scratch_fini(&dev->scratch.fs);
   agx_scratch_fini(&dev->scratch.cs);
   simple_mtx_destroy(&dev->scratch.lock);

   hk_destroy_sampler_heap(dev, &dev->samplers);
   hk_descriptor_table_finish(dev, &dev->images);
   hk_descriptor_table_finish(dev, &dev->occlusion_queries);
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
   agx_bo_unreference(&dev->dev, dev->heap);
   agx_bg_eot_cleanup(&dev->bg_eot);
   agx_close_device(&dev->dev);
   vk_free(&dev->vk.alloc, dev);
}
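/*
 * Calibrated timestamps: only the CPU clock domains are sampled for now; the
 * device time domain is not wired up yet (see the unreachable("todo") below).
 * CLOCK_MONOTONIC_RAW, when available, brackets the whole sampling pass so
 * vk_time_max_deviation() can bound the error from the sampling window plus
 * the coarsest clock period seen.
 */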
VKAPI_ATTR VkResult VKAPI_CALL
hk_GetCalibratedTimestampsKHR(
   VkDevice _device, uint32_t timestampCount,
   const VkCalibratedTimestampInfoKHR *pTimestampInfos, uint64_t *pTimestamps,
   uint64_t *pMaxDeviation)
{
   // VK_FROM_HANDLE(hk_device, dev, _device);
   // struct hk_physical_device *pdev = hk_device_physical(dev);
   uint64_t max_clock_period = 0;
   uint64_t begin, end;
   int d;

#ifdef CLOCK_MONOTONIC_RAW
   begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   begin = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   for (d = 0; d < timestampCount; d++) {
      switch (pTimestampInfos[d].timeDomain) {
      case VK_TIME_DOMAIN_DEVICE_KHR:
         unreachable("todo");
         // pTimestamps[d] = agx_get_gpu_timestamp(&pdev->dev);
         max_clock_period = MAX2(max_clock_period,
                                 1); /* FIXME: Is timestamp period actually 1? */
         break;
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
         pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
         max_clock_period = MAX2(max_clock_period, 1);
         break;

#ifdef CLOCK_MONOTONIC_RAW
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
         pTimestamps[d] = begin;
         break;
#endif
      default:
         pTimestamps[d] = 0;
         break;
      }
   }

#ifdef CLOCK_MONOTONIC_RAW
   end = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   end = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);

   return VK_SUCCESS;
}