/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_heap.h"

#include "nvk_device.h"
#include "nvk_physical_device.h"
#include "nvk_queue.h"

#include "util/macros.h"

#include "nv_push.h"
#include "nv_push_cl90b5.h"

VkResult
nvk_heap_init(struct nvk_device *dev, struct nvk_heap *heap,
              enum nvkmd_mem_flags mem_flags,
              enum nvkmd_mem_map_flags map_flags,
              uint32_t overalloc, bool contiguous)
{
   VkResult result;

   memset(heap, 0, sizeof(*heap));

   heap->mem_flags = mem_flags;
   if (map_flags)
      heap->mem_flags |= NVKMD_MEM_CAN_MAP;
   heap->map_flags = map_flags;
   heap->overalloc = overalloc;

   if (contiguous) {
      result = nvkmd_dev_alloc_va(dev->nvkmd, &dev->vk.base,
                                  0 /* va_flags */, 0 /* pte_kind */,
                                  NVK_HEAP_MAX_SIZE, 0 /* align_B */,
                                  0 /* fixed_addr */, &heap->contig_va);
      if (result != VK_SUCCESS)
         return result;
   }

   simple_mtx_init(&heap->mutex, mtx_plain);
   util_vma_heap_init(&heap->heap, 0, 0);

   heap->total_size = 0;
   heap->mem_count = 0;

   return VK_SUCCESS;
}

void
nvk_heap_finish(struct nvk_device *dev, struct nvk_heap *heap)
{
   /* Freeing the VA will unbind all the memory */
   if (heap->contig_va)
      nvkmd_va_free(heap->contig_va);

   for (uint32_t mem_idx = 0; mem_idx < heap->mem_count; mem_idx++)
      nvkmd_mem_unref(heap->mem[mem_idx].mem);

   util_vma_heap_finish(&heap->heap);
   simple_mtx_destroy(&heap->mutex);
}

static uint64_t
encode_vma(uint32_t mem_idx, uint64_t mem_offset)
{
   assert(mem_idx < UINT16_MAX - 1);
   assert(mem_offset < (1ull << 48));
   return ((uint64_t)(mem_idx + 1) << 48) | mem_offset;
}

static uint32_t
vma_mem_idx(uint64_t offset)
{
   offset = offset >> 48;
   assert(offset > 0);
   return offset - 1;
}

static uint64_t
vma_mem_offset(uint64_t offset)
{
   return offset & BITFIELD64_MASK(48);
}
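
/* A worked example of the encoding above, assuming an allocation that lands
 * in the second BO (mem_idx == 1) at a 4 KiB offset:
 *
 *    encode_vma(1, 0x1000)   == (2ull << 48) | 0x1000
 *    vma_mem_idx(that)       == 1
 *    vma_mem_offset(that)    == 0x1000
 *
 * The index is biased by 1 so a valid allocation never encodes to 0, which
 * is the value util_vma_heap_alloc() returns on failure (see
 * nvk_heap_alloc_locked() below).
 */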
static VkResult
nvk_heap_grow_locked(struct nvk_device *dev, struct nvk_heap *heap)
{
   VkResult result;

   if (heap->mem_count >= NVK_HEAP_MAX_BO_COUNT) {
      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Heap has already hit its maximum size");
   }

   /* First two BOs are MIN_SIZE, double after that */
   const uint64_t new_mem_size =
      NVK_HEAP_MIN_SIZE << (MAX2(heap->mem_count, 1) - 1);

   struct nvkmd_mem *mem;
   if (heap->map_flags) {
      result = nvkmd_dev_alloc_mapped_mem(dev->nvkmd, &dev->vk.base,
                                          new_mem_size, 0,
                                          heap->mem_flags, heap->map_flags,
                                          &mem);
   } else {
      result = nvkmd_dev_alloc_mem(dev->nvkmd, &dev->vk.base,
                                   new_mem_size, 0, heap->mem_flags, &mem);
   }
   if (result != VK_SUCCESS)
      return result;

   assert(mem->size_B == new_mem_size);

   uint64_t addr;
   if (heap->contig_va != NULL) {
      result = nvkmd_va_bind_mem(heap->contig_va, &dev->vk.base,
                                 heap->total_size, mem, 0, new_mem_size);
      if (result != VK_SUCCESS) {
         nvkmd_mem_unref(mem);
         return result;
      }
      addr = heap->contig_va->addr + heap->total_size;

      /* For contiguous heaps, we can now free the padding from the previous
       * BO because the BO we just added will provide the needed padding.
       * For non-contiguous heaps, we have to leave each BO padded
       * individually.
       */
      if (heap->mem_count > 0) {
         struct nvkmd_mem *prev_mem = heap->mem[heap->mem_count - 1].mem;
         assert(heap->overalloc < prev_mem->size_B);
         const uint64_t pad_vma =
            encode_vma(heap->mem_count - 1,
                       prev_mem->size_B - heap->overalloc);
         util_vma_heap_free(&heap->heap, pad_vma, heap->overalloc);
      }
   } else {
      addr = mem->va->addr;
   }

   uint64_t vma = encode_vma(heap->mem_count, 0);
   assert(heap->overalloc < new_mem_size);
   util_vma_heap_free(&heap->heap, vma, new_mem_size - heap->overalloc);

   heap->mem[heap->mem_count++] = (struct nvk_heap_mem) {
      .mem = mem,
      .addr = addr,
   };
   heap->total_size += new_mem_size;

   return VK_SUCCESS;
}
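
/* A worked example of the sizing schedule in nvk_heap_grow_locked() above,
 * writing MIN for NVK_HEAP_MIN_SIZE:
 *
 *    mem_count before growing:  0    1    2      3      4      ...
 *    new_mem_size:              MIN  MIN  2*MIN  4*MIN  8*MIN  ...
 *
 * so after N successful grows the total heap size is MIN << (N - 1), up to
 * the NVK_HEAP_MAX_BO_COUNT limit checked at the top of that function.
 */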
static VkResult
nvk_heap_alloc_locked(struct nvk_device *dev, struct nvk_heap *heap,
                      uint64_t size, uint32_t alignment,
                      uint64_t *addr_out, void **map_out)
{
   while (1) {
      uint64_t vma = util_vma_heap_alloc(&heap->heap, size, alignment);
      if (vma != 0) {
         uint32_t mem_idx = vma_mem_idx(vma);
         uint64_t mem_offset = vma_mem_offset(vma);

         assert(mem_idx < heap->mem_count);
         assert(heap->mem[mem_idx].mem != NULL);
         assert(mem_offset + size <= heap->mem[mem_idx].mem->size_B);

         *addr_out = heap->mem[mem_idx].addr + mem_offset;
         if (map_out != NULL) {
            if (heap->mem[mem_idx].mem->map != NULL)
               *map_out = (char *)heap->mem[mem_idx].mem->map + mem_offset;
            else
               *map_out = NULL;
         }

         return VK_SUCCESS;
      }

      VkResult result = nvk_heap_grow_locked(dev, heap);
      if (result != VK_SUCCESS)
         return result;
   }
}

static void
nvk_heap_free_locked(struct nvk_device *dev, struct nvk_heap *heap,
                     uint64_t addr, uint64_t size)
{
   assert(addr + size > addr);

   for (uint32_t mem_idx = 0; mem_idx < heap->mem_count; mem_idx++) {
      if (addr < heap->mem[mem_idx].addr)
         continue;

      uint64_t mem_offset = addr - heap->mem[mem_idx].addr;
      if (mem_offset >= heap->mem[mem_idx].mem->size_B)
         continue;

      assert(mem_offset + size <= heap->mem[mem_idx].mem->size_B);
      uint64_t vma = encode_vma(mem_idx, mem_offset);
      util_vma_heap_free(&heap->heap, vma, size);
      return;
   }
   assert(!"Failed to find heap BO");
}

VkResult
nvk_heap_alloc(struct nvk_device *dev, struct nvk_heap *heap,
               uint64_t size, uint32_t alignment,
               uint64_t *addr_out, void **map_out)
{
   simple_mtx_lock(&heap->mutex);
   VkResult result = nvk_heap_alloc_locked(dev, heap, size, alignment,
                                           addr_out, map_out);
   simple_mtx_unlock(&heap->mutex);

   return result;
}

VkResult
nvk_heap_upload(struct nvk_device *dev, struct nvk_heap *heap,
                const void *data, size_t size, uint32_t alignment,
                uint64_t *addr_out)
{
   simple_mtx_lock(&heap->mutex);
   void *map = NULL;
   VkResult result = nvk_heap_alloc_locked(dev, heap, size, alignment,
                                           addr_out, &map);
   simple_mtx_unlock(&heap->mutex);

   if (result != VK_SUCCESS)
      return result;

   if (map != NULL && (heap->map_flags & NVKMD_MEM_MAP_WR)) {
      /* If we have a writable map, copy directly with memcpy */
      memcpy(map, data, size);
   } else {
      /* Otherwise, kick off an upload with the upload queue.
       *
       * This is a queued operation that the driver guarantees, via
       * semaphores, happens before any further client work.  Because it is
       * asynchronous while heap allocations are synchronous, we have to be
       * a bit careful here.  The heap only ever tracks the current known
       * CPU state of everything while the upload queue makes that state
       * valid at some point in the future.
       *
       * This can be especially tricky for very fast upload/free cycles,
       * such as when the client compiles a shader, throws it away without
       * using it, and then compiles another shader that ends up at the
       * same address.  What makes this all correct is the fact that
       * everything on the upload queue happens in a well-defined,
       * device-wide order.  In this case the first shader will get
       * uploaded and then the second will get uploaded over top of it.
       * As long as we don't free the memory out from under the upload
       * queue, everything will end up in the correct state by the time the
       * client's shaders actually execute.
       */
      result = nvk_upload_queue_upload(dev, &dev->upload, *addr_out,
                                       data, size);
      if (result != VK_SUCCESS) {
         nvk_heap_free(dev, heap, *addr_out, size);
         return result;
      }
   }

   return VK_SUCCESS;
}

void
nvk_heap_free(struct nvk_device *dev, struct nvk_heap *heap,
              uint64_t addr, uint64_t size)
{
   simple_mtx_lock(&heap->mutex);
   nvk_heap_free_locked(dev, heap, addr, size);
   simple_mtx_unlock(&heap->mutex);
}
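
/* Illustrative usage sketch (hypothetical caller, not part of this file;
 * the heap pointer, data, size, and alignment below are made up):
 *
 *    uint64_t addr;
 *    VkResult result = nvk_heap_upload(dev, heap, data, size, 64, &addr);
 *    if (result == VK_SUCCESS) {
 *       ... point GPU work at addr ...
 *       nvk_heap_free(dev, heap, addr, size);
 *    }
 *
 * The address stays valid until the matching nvk_heap_free(): BOs are only
 * released in nvk_heap_finish(), and for a contiguous heap every BO is
 * bound into the single VA range reserved in nvk_heap_init().
 */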