/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_device.h"

#include "nvk_cmd_buffer.h"
#include "nvk_entrypoints.h"
#include "nvk_instance.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "nvkmd/nvkmd.h"
#include "vk_pipeline_cache.h"
#include "vulkan/wsi/wsi_common.h"

#include "cl9097.h"
#include "clb097.h"
#include "clc397.h"

static void
nvk_slm_area_init(struct nvk_slm_area *area)
{
   memset(area, 0, sizeof(*area));
   simple_mtx_init(&area->mutex, mtx_plain);
}

static void
nvk_slm_area_finish(struct nvk_slm_area *area)
{
   simple_mtx_destroy(&area->mutex);
   if (area->mem)
      nvkmd_mem_unref(area->mem);
}

/* Return a reference to the current SLM memory (if any) along with the
 * per-warp and per-TPC sizes it was allocated for, all read under the lock
 * so the three values are consistent with each other.
 */
struct nvkmd_mem *
nvk_slm_area_get_mem_ref(struct nvk_slm_area *area,
                         uint32_t *bytes_per_warp_out,
                         uint32_t *bytes_per_tpc_out)
{
   simple_mtx_lock(&area->mutex);
   struct nvkmd_mem *mem = area->mem;
   if (mem)
      nvkmd_mem_ref(mem);
   *bytes_per_warp_out = area->bytes_per_warp;
   *bytes_per_tpc_out = area->bytes_per_tpc;
   simple_mtx_unlock(&area->mutex);

   return mem;
}

static VkResult
nvk_slm_area_ensure(struct nvk_device *dev,
                    struct nvk_slm_area *area,
                    uint32_t slm_bytes_per_lane,
                    uint32_t crs_bytes_per_warp)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   assert(slm_bytes_per_lane < (1 << 24));
   assert(crs_bytes_per_warp <= (1 << 20));
   uint64_t bytes_per_warp = slm_bytes_per_lane * 32 + crs_bytes_per_warp;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP
    */
   bytes_per_warp = align64(bytes_per_warp, 0x200);

   uint64_t bytes_per_mp = bytes_per_warp * pdev->info.max_warps_per_mp;
   uint64_t bytes_per_tpc = bytes_per_mp * pdev->info.mp_per_tpc;

   /* The hardware seems to require this alignment for
    * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
    */
   bytes_per_tpc = align64(bytes_per_tpc, 0x8000);

   /* nvk_slm_area::bytes_per_tpc only ever increases so we can check this
    * outside the lock and exit early in the common case.  We only need to
    * take the lock if we're actually going to resize.
    *
    * Also, we only care about bytes_per_tpc and not bytes_per_warp because
    * they are integer multiples of each other.
    */
   if (likely(bytes_per_tpc <= area->bytes_per_tpc))
      return VK_SUCCESS;
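   /* Size the backing store for every TPC on the device so that a single
    * allocation covers the shader no matter which TPCs it ends up running
    * on.
    */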
   uint64_t size = bytes_per_tpc * pdev->info.tpc_count;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_D_SIZE_LOWER.
    */
   size = align64(size, 0x20000);

   struct nvkmd_mem *mem;
   result = nvkmd_dev_alloc_mem(dev->nvkmd, &dev->vk.base, size, 0,
                                NVKMD_MEM_LOCAL, &mem);
   if (result != VK_SUCCESS)
      return result;

   struct nvkmd_mem *unref_mem;
   simple_mtx_lock(&area->mutex);
   if (bytes_per_tpc <= area->bytes_per_tpc) {
      /* We lost the race, throw away our BO */
      assert(area->bytes_per_warp == bytes_per_warp);
      unref_mem = mem;
   } else {
      unref_mem = area->mem;
      area->mem = mem;
      area->bytes_per_warp = bytes_per_warp;
      area->bytes_per_tpc = bytes_per_tpc;
   }
   simple_mtx_unlock(&area->mutex);

   if (unref_mem)
      nvkmd_mem_unref(unref_mem);

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateDevice(VkPhysicalDevice physicalDevice,
                 const VkDeviceCreateInfo *pCreateInfo,
                 const VkAllocationCallbacks *pAllocator,
                 VkDevice *pDevice)
{
   VK_FROM_HANDLE(nvk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct nvk_device *dev;

   dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator,
                    sizeof(*dev), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &nvk_device_entrypoints, true);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table,
                           pCreateInfo, pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   dev->vk.shader_ops = &nvk_device_shader_ops;

   result = nvkmd_pdev_create_dev(pdev->nvkmd, &pdev->vk.base, &dev->nvkmd);
   if (result != VK_SUCCESS)
      goto fail_init;

   vk_device_set_drm_fd(&dev->vk, nvkmd_dev_get_drm_fd(dev->nvkmd));
   dev->vk.command_buffer_ops = &nvk_cmd_buffer_ops;

   result = nvk_upload_queue_init(dev, &dev->upload);
   if (result != VK_SUCCESS)
      goto fail_nvkmd;

   /* A page of zeros, used below as the backing address for the null image
    * descriptor.
    */
   result = nvkmd_dev_alloc_mapped_mem(dev->nvkmd, &pdev->vk.base,
                                       0x1000, 0, NVKMD_MEM_LOCAL,
                                       NVKMD_MEM_MAP_WR, &dev->zero_page);
   if (result != VK_SUCCESS)
      goto fail_upload;

   memset(dev->zero_page->map, 0, 0x1000);
   nvkmd_mem_unmap(dev->zero_page, 0);

   result = nvk_descriptor_table_init(dev, &dev->images,
                                      8 * 4 /* tic entry size */,
                                      1024, 1024 * 1024);
   if (result != VK_SUCCESS)
      goto fail_zero_page;

   /* Reserve the descriptor at offset 0 to be the null descriptor */
   uint32_t null_tic[8] = { 0, };
   nil_fill_null_tic(&pdev->info, dev->zero_page->va->addr, &null_tic);

   ASSERTED uint32_t null_image_index;
   result = nvk_descriptor_table_add(dev, &dev->images,
                                     null_tic, sizeof(null_tic),
                                     &null_image_index);
   assert(result == VK_SUCCESS);
   assert(null_image_index == 0);

   result = nvk_descriptor_table_init(dev, &dev->samplers,
                                      8 * 4 /* tsc entry size */,
                                      4096, 4096);
   if (result != VK_SUCCESS)
      goto fail_images;

   if (dev->vk.enabled_features.descriptorBuffer ||
       nvk_use_edb_buffer_views(pdev)) {
      result = nvk_edb_bview_cache_init(dev, &dev->edb_bview_cache);
      if (result != VK_SUCCESS)
         goto fail_samplers;
   }
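   /* Shader code is sub-allocated from a single device-wide heap.  On
    * pre-Volta hardware, shaders are referenced by offsets from a common
    * program region rather than by full 64-bit addresses, which seems to be
    * why the heap is asked to stay contiguous there.
    */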
   /* If we have a full BAR, go ahead and do shader uploads on the CPU.
    * Otherwise, we fall back to doing shader uploads via the upload queue.
    *
    * Also, the I-cache pre-fetches and NVIDIA has informed us
    * overallocating shaders BOs by 2K is sufficient.
    */
   enum nvkmd_mem_map_flags shader_map_flags = 0;
   if (pdev->info.bar_size_B >= pdev->info.vram_size_B)
      shader_map_flags = NVKMD_MEM_MAP_WR;

   result = nvk_heap_init(dev, &dev->shader_heap,
                          NVKMD_MEM_LOCAL, shader_map_flags,
                          2048 /* overalloc */,
                          pdev->info.cls_eng3d < VOLTA_A);
   if (result != VK_SUCCESS)
      goto fail_edb_bview_cache;

   result = nvk_heap_init(dev, &dev->event_heap,
                          NVKMD_MEM_LOCAL, NVKMD_MEM_MAP_WR,
                          0 /* overalloc */, false /* contiguous */);
   if (result != VK_SUCCESS)
      goto fail_shader_heap;

   nvk_slm_area_init(&dev->slm);

   if (pdev->info.cls_eng3d >= FERMI_A &&
       pdev->info.cls_eng3d < MAXWELL_A) {
      /* max size is 256k */
      result = nvkmd_dev_alloc_mem(dev->nvkmd, &pdev->vk.base,
                                   1 << 17, 1 << 20, NVKMD_MEM_LOCAL,
                                   &dev->vab_memory);
      if (result != VK_SUCCESS)
         goto fail_slm;
   }

   result = nvk_queue_init(dev, &dev->queue,
                           &pCreateInfo->pQueueCreateInfos[0], 0);
   if (result != VK_SUCCESS)
      goto fail_vab_memory;

   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };
   dev->vk.mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->vk.mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queue;
   }

   result = nvk_device_init_meta(dev);
   if (result != VK_SUCCESS)
      goto fail_mem_cache;

   *pDevice = nvk_device_to_handle(dev);

   return VK_SUCCESS;

fail_mem_cache:
   vk_pipeline_cache_destroy(dev->vk.mem_cache, NULL);
fail_queue:
   nvk_queue_finish(dev, &dev->queue);
fail_vab_memory:
   if (dev->vab_memory)
      nvkmd_mem_unref(dev->vab_memory);
fail_slm:
   nvk_slm_area_finish(&dev->slm);
   nvk_heap_finish(dev, &dev->event_heap);
fail_shader_heap:
   nvk_heap_finish(dev, &dev->shader_heap);
fail_edb_bview_cache:
   nvk_edb_bview_cache_finish(dev, &dev->edb_bview_cache);
fail_samplers:
   nvk_descriptor_table_finish(dev, &dev->samplers);
fail_images:
   nvk_descriptor_table_finish(dev, &dev->images);
fail_zero_page:
   nvkmd_mem_unref(dev->zero_page);
fail_upload:
   nvk_upload_queue_finish(dev, &dev->upload);
fail_nvkmd:
   nvkmd_dev_destroy(dev->nvkmd);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);

   if (!dev)
      return;

   if (dev->copy_queries)
      vk_shader_destroy(&dev->vk, &dev->copy_queries->vk, &dev->vk.alloc);

   nvk_device_finish_meta(dev);

   vk_pipeline_cache_destroy(dev->vk.mem_cache, NULL);
   nvk_queue_finish(dev, &dev->queue);
   if (dev->vab_memory)
      nvkmd_mem_unref(dev->vab_memory);
   vk_device_finish(&dev->vk);

   /* Idle the upload queue before we tear down heaps */
   nvk_upload_queue_sync(dev, &dev->upload);

   nvk_slm_area_finish(&dev->slm);
   nvk_heap_finish(dev, &dev->event_heap);
   nvk_heap_finish(dev, &dev->shader_heap);
   nvk_edb_bview_cache_finish(dev, &dev->edb_bview_cache);
   nvk_descriptor_table_finish(dev, &dev->samplers);
   nvk_descriptor_table_finish(dev, &dev->images);
   nvkmd_mem_unref(dev->zero_page);
   nvk_upload_queue_finish(dev, &dev->upload);
   nvkmd_dev_destroy(dev->nvkmd);
   vk_free(&dev->vk.alloc, dev);
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_GetCalibratedTimestampsKHR(VkDevice _device,
                               uint32_t timestampCount,
                               const VkCalibratedTimestampInfoKHR *pTimestampInfos,
                               uint64_t *pTimestamps,
                               uint64_t *pMaxDeviation)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   uint64_t max_clock_period = 0;
   uint64_t begin, end;
   int d;

#ifdef CLOCK_MONOTONIC_RAW
   begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   begin = vk_clock_gettime(CLOCK_MONOTONIC);
#endif
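   /* begin and end bracket the per-domain sampling below;
    * vk_time_max_deviation() uses that window to bound how far apart the
    * returned timestamps can be.
    */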
   for (d = 0; d < timestampCount; d++) {
      switch (pTimestampInfos[d].timeDomain) {
      case VK_TIME_DOMAIN_DEVICE_KHR:
         pTimestamps[d] = nvkmd_dev_get_gpu_timestamp(dev->nvkmd);
         max_clock_period = MAX2(max_clock_period, 1);
         /* FIXME: Is timestamp period actually 1? */
         break;
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
         pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
         max_clock_period = MAX2(max_clock_period, 1);
         break;
#ifdef CLOCK_MONOTONIC_RAW
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
         pTimestamps[d] = begin;
         break;
#endif
      default:
         pTimestamps[d] = 0;
         break;
      }
   }

#ifdef CLOCK_MONOTONIC_RAW
   end = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   end = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);

   return VK_SUCCESS;
}

VkResult
nvk_device_ensure_slm(struct nvk_device *dev,
                      uint32_t slm_bytes_per_lane,
                      uint32_t crs_bytes_per_warp)
{
   return nvk_slm_area_ensure(dev, &dev->slm,
                              slm_bytes_per_lane, crs_bytes_per_warp);
}