/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_device.h"

#include "nvk_cmd_buffer.h"
#include "nvk_entrypoints.h"
#include "nvk_instance.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"
#include "nvkmd/nvkmd.h"

#include "vk_pipeline_cache.h"
#include "vulkan/wsi/wsi_common.h"

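/* Hardware 3D class headers; these presumably provide the FERMI_A,
 * MAXWELL_A, and VOLTA_A class numbers used in the generation checks below.
 */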
#include "cl9097.h"
#include "clb097.h"
#include "clc397.h"

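/* The SLM (shader local memory) area backs per-lane scratch space for all
 * shaders on the device.  It is allocated lazily and only ever grows; see
 * nvk_slm_area_ensure() below.
 */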
static void
nvk_slm_area_init(struct nvk_slm_area *area)
{
   memset(area, 0, sizeof(*area));
   simple_mtx_init(&area->mutex, mtx_plain);
}

static void
nvk_slm_area_finish(struct nvk_slm_area *area)
{
   simple_mtx_destroy(&area->mutex);
   if (area->mem)
      nvkmd_mem_unref(area->mem);
}

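/* Returns the current SLM allocation, if any, with an extra reference taken
 * under the area lock.  The caller is expected to drop that reference with
 * nvkmd_mem_unref() when done with it.
 */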
struct nvkmd_mem *
nvk_slm_area_get_mem_ref(struct nvk_slm_area *area,
                         uint32_t *bytes_per_warp_out,
                         uint32_t *bytes_per_tpc_out)
{
   simple_mtx_lock(&area->mutex);
   struct nvkmd_mem *mem = area->mem;
   if (mem)
      nvkmd_mem_ref(mem);
   *bytes_per_warp_out = area->bytes_per_warp;
   *bytes_per_tpc_out = area->bytes_per_tpc;
   simple_mtx_unlock(&area->mutex);

   return mem;
}

static VkResult
nvk_slm_area_ensure(struct nvk_device *dev,
                    struct nvk_slm_area *area,
                    uint32_t slm_bytes_per_lane,
                    uint32_t crs_bytes_per_warp)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   assert(slm_bytes_per_lane < (1 << 24));
   assert(crs_bytes_per_warp <= (1 << 20));
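   /* A warp is 32 lanes on NVIDIA hardware, hence the factor of 32 below;
    * CRS is (presumably) the call/return stack, which is sized per warp.
    */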
   uint64_t bytes_per_warp = slm_bytes_per_lane * 32 + crs_bytes_per_warp;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP
    */
   bytes_per_warp = align64(bytes_per_warp, 0x200);

   uint64_t bytes_per_mp = bytes_per_warp * pdev->info.max_warps_per_mp;
   uint64_t bytes_per_tpc = bytes_per_mp * pdev->info.mp_per_tpc;

   /* The hardware seems to require this alignment for
    * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
    */
   bytes_per_tpc = align64(bytes_per_tpc, 0x8000);

   /* nvk_slm_area::bytes_per_tpc only ever increases so we can check this
    * outside the lock and exit early in the common case.  We only need to
    * take the lock if we're actually going to resize.
    *
    * Also, we only care about bytes_per_tpc and not bytes_per_warp because
    * they are integer multiples of each other.
    */
   if (likely(bytes_per_tpc <= area->bytes_per_tpc))
      return VK_SUCCESS;

   uint64_t size = bytes_per_tpc * pdev->info.tpc_count;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_D_SIZE_LOWER.
    */
   size = align64(size, 0x20000);

   struct nvkmd_mem *mem;
   result = nvkmd_dev_alloc_mem(dev->nvkmd, &dev->vk.base, size, 0,
                                NVKMD_MEM_LOCAL, &mem);
   if (result != VK_SUCCESS)
      return result;

   struct nvkmd_mem *unref_mem;
   simple_mtx_lock(&area->mutex);
   if (bytes_per_tpc <= area->bytes_per_tpc) {
      /* We lost the race, throw away our BO */
      assert(area->bytes_per_warp == bytes_per_warp);
      unref_mem = mem;
   } else {
      unref_mem = area->mem;
      area->mem = mem;
      area->bytes_per_warp = bytes_per_warp;
      area->bytes_per_tpc = bytes_per_tpc;
   }
   simple_mtx_unlock(&area->mutex);

   if (unref_mem)
      nvkmd_mem_unref(unref_mem);

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateDevice(VkPhysicalDevice physicalDevice,
                 const VkDeviceCreateInfo *pCreateInfo,
                 const VkAllocationCallbacks *pAllocator,
                 VkDevice *pDevice)
{
   VK_FROM_HANDLE(nvk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct nvk_device *dev;

   dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator,
                    sizeof(*dev), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &nvk_device_entrypoints, true);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table,
                           pCreateInfo, pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   dev->vk.shader_ops = &nvk_device_shader_ops;

   result = nvkmd_pdev_create_dev(pdev->nvkmd, &pdev->vk.base, &dev->nvkmd);
   if (result != VK_SUCCESS)
      goto fail_init;

   vk_device_set_drm_fd(&dev->vk, nvkmd_dev_get_drm_fd(dev->nvkmd));
   dev->vk.command_buffer_ops = &nvk_cmd_buffer_ops;

   result = nvk_upload_queue_init(dev, &dev->upload);
   if (result != VK_SUCCESS)
      goto fail_nvkmd;

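   /* Allocate a single zeroed 4 KiB page of VRAM.  Its GPU address is used
    * below as the backing address for the null texture descriptor.
    */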
   result = nvkmd_dev_alloc_mapped_mem(dev->nvkmd, &pdev->vk.base,
                                       0x1000, 0, NVKMD_MEM_LOCAL,
                                       NVKMD_MEM_MAP_WR, &dev->zero_page);
   if (result != VK_SUCCESS)
      goto fail_upload;

   memset(dev->zero_page->map, 0, 0x1000);
   nvkmd_mem_unmap(dev->zero_page, 0);

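   /* Device-wide table of TIC (texture image control) descriptors.  Each
    * entry is 32 bytes; the remaining arguments presumably give the initial
    * and maximum entry counts (1024 and 1024 * 1024).
    */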
   result = nvk_descriptor_table_init(dev, &dev->images,
                                      8 * 4 /* tic entry size */,
                                      1024, 1024 * 1024);
   if (result != VK_SUCCESS)
      goto fail_zero_page;

   /* Reserve the descriptor at offset 0 to be the null descriptor */
   uint32_t null_tic[8] = { 0, };
   nil_fill_null_tic(&pdev->info, dev->zero_page->va->addr, &null_tic);

   ASSERTED uint32_t null_image_index;
   result = nvk_descriptor_table_add(dev, &dev->images,
                                     null_tic, sizeof(null_tic),
                                     &null_image_index);
   assert(result == VK_SUCCESS);
   assert(null_image_index == 0);

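   /* Device-wide table of TSC (texture sampler control) descriptors, fixed
    * at 4096 32-byte entries.
    */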
   result = nvk_descriptor_table_init(dev, &dev->samplers,
                                      8 * 4 /* tsc entry size */,
                                      4096, 4096);
   if (result != VK_SUCCESS)
      goto fail_images;

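   /* Cache of EDB buffer views; only needed when descriptor buffers or EDB
    * buffer views are in use.
    */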
   if (dev->vk.enabled_features.descriptorBuffer ||
       nvk_use_edb_buffer_views(pdev)) {
      result = nvk_edb_bview_cache_init(dev, &dev->edb_bview_cache);
      if (result != VK_SUCCESS)
         goto fail_samplers;
   }

   /* If we have a full BAR, go ahead and do shader uploads on the CPU.
    * Otherwise, we fall back to doing shader uploads via the upload queue.
    *
    * Also, the I-cache pre-fetches past the end of a shader, and NVIDIA has
    * informed us that overallocating shader BOs by 2K is sufficient.
    */
   enum nvkmd_mem_map_flags shader_map_flags = 0;
   if (pdev->info.bar_size_B >= pdev->info.vram_size_B)
      shader_map_flags = NVKMD_MEM_MAP_WR;
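   /* The final argument to nvk_heap_init() requests a contiguous heap on
    * pre-Volta 3D classes, presumably because those generations address
    * shaders as offsets from a single program base.
    */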
   result = nvk_heap_init(dev, &dev->shader_heap,
                          NVKMD_MEM_LOCAL, shader_map_flags,
                          2048 /* overalloc */,
                          pdev->info.cls_eng3d < VOLTA_A);
   if (result != VK_SUCCESS)
      goto fail_edb_bview_cache;

   result = nvk_heap_init(dev, &dev->event_heap,
                          NVKMD_MEM_LOCAL, NVKMD_MEM_MAP_WR,
                          0 /* overalloc */, false /* contiguous */);
   if (result != VK_SUCCESS)
      goto fail_shader_heap;

   nvk_slm_area_init(&dev->slm);

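   /* Fermi through Kepler (pre-Maxwell) appear to need a dedicated VAB
    * (vertex attribute buffer) allocation: 128 KiB here, with what looks
    * like 1 MiB alignment.
    */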
   if (pdev->info.cls_eng3d >= FERMI_A &&
       pdev->info.cls_eng3d < MAXWELL_A) {
      /* max size is 256k */
      result = nvkmd_dev_alloc_mem(dev->nvkmd, &pdev->vk.base,
                                   1 << 17, 1 << 20, NVKMD_MEM_LOCAL,
                                   &dev->vab_memory);
      if (result != VK_SUCCESS)
         goto fail_slm;
   }

   result = nvk_queue_init(dev, &dev->queue,
                           &pCreateInfo->pQueueCreateInfos[0], 0);
   if (result != VK_SUCCESS)
      goto fail_vab_memory;

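   /* Device-wide in-memory pipeline cache.  weak_ref presumably means the
    * cache holds only weak references, so it does not keep cached objects
    * alive on its own.
    */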
   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };
   dev->vk.mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->vk.mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queue;
   }

   result = nvk_device_init_meta(dev);
   if (result != VK_SUCCESS)
      goto fail_mem_cache;

   *pDevice = nvk_device_to_handle(dev);

   return VK_SUCCESS;

fail_mem_cache:
   vk_pipeline_cache_destroy(dev->vk.mem_cache, NULL);
fail_queue:
   nvk_queue_finish(dev, &dev->queue);
fail_vab_memory:
   if (dev->vab_memory)
      nvkmd_mem_unref(dev->vab_memory);
fail_slm:
   nvk_slm_area_finish(&dev->slm);
   nvk_heap_finish(dev, &dev->event_heap);
fail_shader_heap:
   nvk_heap_finish(dev, &dev->shader_heap);
fail_edb_bview_cache:
   nvk_edb_bview_cache_finish(dev, &dev->edb_bview_cache);
fail_samplers:
   nvk_descriptor_table_finish(dev, &dev->samplers);
fail_images:
   nvk_descriptor_table_finish(dev, &dev->images);
fail_zero_page:
   nvkmd_mem_unref(dev->zero_page);
fail_upload:
   nvk_upload_queue_finish(dev, &dev->upload);
fail_nvkmd:
   nvkmd_dev_destroy(dev->nvkmd);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);

   if (!dev)
      return;

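   /* copy_queries is presumably an internal shader used to implement query
    * copies (e.g. vkCmdCopyQueryPoolResults); destroy it before the rest of
    * the device teardown.
    */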
   if (dev->copy_queries)
      vk_shader_destroy(&dev->vk, &dev->copy_queries->vk, &dev->vk.alloc);

   nvk_device_finish_meta(dev);

   vk_pipeline_cache_destroy(dev->vk.mem_cache, NULL);
   nvk_queue_finish(dev, &dev->queue);
   if (dev->vab_memory)
      nvkmd_mem_unref(dev->vab_memory);
   vk_device_finish(&dev->vk);

   /* Idle the upload queue before we tear down heaps */
   nvk_upload_queue_sync(dev, &dev->upload);

   nvk_slm_area_finish(&dev->slm);
   nvk_heap_finish(dev, &dev->event_heap);
   nvk_heap_finish(dev, &dev->shader_heap);
   nvk_edb_bview_cache_finish(dev, &dev->edb_bview_cache);
   nvk_descriptor_table_finish(dev, &dev->samplers);
   nvk_descriptor_table_finish(dev, &dev->images);
   nvkmd_mem_unref(dev->zero_page);
   nvk_upload_queue_finish(dev, &dev->upload);
   nvkmd_dev_destroy(dev->nvkmd);
   vk_free(&dev->vk.alloc, dev);
}

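/* Samples the host clock immediately before and after reading the requested
 * time domains, then reports the worst-case deviation over that window via
 * vk_time_max_deviation().
 */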
VKAPI_ATTR VkResult VKAPI_CALL
nvk_GetCalibratedTimestampsKHR(VkDevice _device,
                               uint32_t timestampCount,
                               const VkCalibratedTimestampInfoKHR *pTimestampInfos,
                               uint64_t *pTimestamps,
                               uint64_t *pMaxDeviation)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   uint64_t max_clock_period = 0;
   uint64_t begin, end;
   int d;

#ifdef CLOCK_MONOTONIC_RAW
   begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   begin = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   for (d = 0; d < timestampCount; d++) {
      switch (pTimestampInfos[d].timeDomain) {
      case VK_TIME_DOMAIN_DEVICE_KHR:
         pTimestamps[d] = nvkmd_dev_get_gpu_timestamp(dev->nvkmd);
         max_clock_period = MAX2(max_clock_period, 1); /* FIXME: Is timestamp period actually 1? */
         break;
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
         pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
         max_clock_period = MAX2(max_clock_period, 1);
         break;

#ifdef CLOCK_MONOTONIC_RAW
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
         pTimestamps[d] = begin;
         break;
#endif
      default:
         pTimestamps[d] = 0;
         break;
      }
   }

#ifdef CLOCK_MONOTONIC_RAW
   end = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   end = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);

   return VK_SUCCESS;
}

VkResult
nvk_device_ensure_slm(struct nvk_device *dev,
                      uint32_t slm_bytes_per_lane,
                      uint32_t crs_bytes_per_warp)
{
   return nvk_slm_area_ensure(dev, &dev->slm,
                              slm_bytes_per_lane,
                              crs_bytes_per_warp);
}