/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include <fcntl.h>
#include <stdbool.h>
#include <string.h>

#ifdef __FreeBSD__
#include <sys/types.h>
#endif
#ifdef MAJOR_IN_MKDEV
#include <sys/mkdev.h>
#endif
#ifdef MAJOR_IN_SYSMACROS
#include <sys/sysmacros.h>
#endif

#ifdef __linux__
#include <sys/inotify.h>
#endif

#include "meta/radv_meta.h"
#include "util/disk_cache.h"
#include "util/u_debug.h"
#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_entrypoints.h"
#include "radv_formats.h"
#include "radv_physical_device.h"
#include "radv_printf.h"
#include "radv_rmv.h"
#include "radv_shader.h"
#include "radv_spm.h"
#include "radv_sqtt.h"
#include "vk_common_entrypoints.h"
#include "vk_pipeline_cache.h"
#include "vk_semaphore.h"
#include "vk_util.h"
#ifdef _WIN32
typedef void *drmDevicePtr;
#include <io.h>
#else
#include <amdgpu.h>
#include <xf86drm.h>
#include "drm-uapi/amdgpu_drm.h"
#include "winsys/amdgpu/radv_amdgpu_winsys_public.h"
#endif
#include "util/build_id.h"
#include "util/driconf.h"
#include "util/mesa-sha1.h"
#include "util/os_time.h"
#include "util/timespec.h"
#include "util/u_atomic.h"
#include "util/u_process.h"
#include "vulkan/vk_icd.h"
#include "winsys/null/radv_null_winsys_public.h"
#include "git_sha1.h"
#include "sid.h"
#include "vk_common_entrypoints.h"
#include "vk_format.h"
#include "vk_sync.h"
#include "vk_sync_dummy.h"

#if AMD_LLVM_AVAILABLE
#include "ac_llvm_util.h"
#endif

#include "ac_descriptors.h"
#include "ac_formats.h"

static bool
radv_spm_trace_enabled(const struct radv_instance *instance)
{
   return (instance->vk.trace_mode & RADV_TRACE_MODE_RGP) &&
          debug_get_bool_option("RADV_THREAD_TRACE_CACHE_COUNTERS", true);
}
84
85 static bool
radv_trap_handler_enabled()
87 {
88 return !!getenv("RADV_TRAP_HANDLER");
89 }
90
91 VKAPI_ATTR VkResult VKAPI_CALL
radv_GetMemoryHostPointerPropertiesEXT(VkDevice _device, VkExternalMemoryHandleTypeFlagBits handleType,
                                       const void *pHostPointer,
                                       VkMemoryHostPointerPropertiesEXT *pMemoryHostPointerProperties)
95 {
96 VK_FROM_HANDLE(radv_device, device, _device);
97 const struct radv_physical_device *pdev = radv_device_physical(device);
98
99 switch (handleType) {
100 case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: {
101 uint32_t memoryTypeBits = 0;
102 for (int i = 0; i < pdev->memory_properties.memoryTypeCount; i++) {
103 if (pdev->memory_domains[i] == RADEON_DOMAIN_GTT && !(pdev->memory_flags[i] & RADEON_FLAG_GTT_WC)) {
104 memoryTypeBits = (1 << i);
105 break;
106 }
107 }
108 pMemoryHostPointerProperties->memoryTypeBits = memoryTypeBits;
109 return VK_SUCCESS;
110 }
111 default:
112 return VK_ERROR_INVALID_EXTERNAL_HANDLE;
113 }
114 }
115
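/* Allocate a CPU-accessible, read-only VRAM BO for the custom border color palette,
 * make it resident for the device's lifetime and keep it persistently mapped.
 */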
116 static VkResult
radv_device_init_border_color(struct radv_device *device)
118 {
119 VkResult result;
120
121 result = radv_bo_create(device, NULL, RADV_BORDER_COLOR_BUFFER_SIZE, 4096, RADEON_DOMAIN_VRAM,
122 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_READ_ONLY | RADEON_FLAG_NO_INTERPROCESS_SHARING,
123 RADV_BO_PRIORITY_SHADER, 0, true, &device->border_color_data.bo);
124
125 if (result != VK_SUCCESS)
126 return vk_error(device, result);
127
128 radv_rmv_log_border_color_palette_create(device, device->border_color_data.bo);
129
130 result = device->ws->buffer_make_resident(device->ws, device->border_color_data.bo, true);
131 if (result != VK_SUCCESS)
132 return vk_error(device, result);
133
134 device->border_color_data.colors_gpu_ptr = radv_buffer_map(device->ws, device->border_color_data.bo);
135 if (!device->border_color_data.colors_gpu_ptr)
136 return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
137 mtx_init(&device->border_color_data.mutex, mtx_plain);
138
139 return VK_SUCCESS;
140 }
141
142 static void
radv_device_finish_border_color(struct radv_device *device)
144 {
145 if (device->border_color_data.bo) {
146 radv_rmv_log_border_color_palette_destroy(device, device->border_color_data.bo);
147 device->ws->buffer_make_resident(device->ws, device->border_color_data.bo, false);
148 radv_bo_destroy(device, NULL, device->border_color_data.bo);
149
150 mtx_destroy(&device->border_color_data.mutex);
151 }
152 }
153
154 static struct radv_shader_part *
_radv_create_vs_prolog(struct radv_device *device, const void *_key)
156 {
157 struct radv_vs_prolog_key *key = (struct radv_vs_prolog_key *)_key;
158 return radv_create_vs_prolog(device, key);
159 }
160
161 static uint32_t
radv_hash_vs_prolog(const void *key_)
163 {
164 const struct radv_vs_prolog_key *key = key_;
165 return _mesa_hash_data(key, sizeof(*key));
166 }
167
168 static bool
radv_cmp_vs_prolog(const void *a_, const void *b_)
170 {
171 const struct radv_vs_prolog_key *a = a_;
172 const struct radv_vs_prolog_key *b = b_;
173
174 return memcmp(a, b, sizeof(*a)) == 0;
175 }
176
177 static struct radv_shader_part_cache_ops vs_prolog_ops = {
178 .create = _radv_create_vs_prolog,
179 .hash = radv_hash_vs_prolog,
180 .equals = radv_cmp_vs_prolog,
181 };
182
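/* Pre-compile the common VS prologs: one "simple" prolog per attribute count plus every
 * (start, count) permutation of instance-rate inputs, so draw time only has to look them
 * up. Pre-compilation is skipped when prolog dumping is requested.
 */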
183 static VkResult
radv_device_init_vs_prologs(struct radv_device *device)
185 {
186 const struct radv_physical_device *pdev = radv_device_physical(device);
187 const struct radv_instance *instance = radv_physical_device_instance(pdev);
188
189 if (!radv_shader_part_cache_init(&device->vs_prologs, &vs_prolog_ops))
190 return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
191
192 /* don't pre-compile prologs if we want to print them */
193 if (instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS)
194 return VK_SUCCESS;
195
196 struct radv_vs_prolog_key key;
197 memset(&key, 0, sizeof(key));
198 key.as_ls = false;
199 key.is_ngg = pdev->use_ngg;
200 key.next_stage = MESA_SHADER_VERTEX;
201 key.wave32 = pdev->ge_wave_size == 32;
202
203 for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) {
204 key.instance_rate_inputs = 0;
205 key.num_attributes = i;
206
207 device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key);
208 if (!device->simple_vs_prologs[i - 1])
209 return vk_error(instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
210 }
211
212 unsigned idx = 0;
213 for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) {
214 for (unsigned count = 1; count <= num_attributes; count++) {
215 for (unsigned start = 0; start <= (num_attributes - count); start++) {
216 key.instance_rate_inputs = u_bit_consecutive(start, count);
217 key.num_attributes = num_attributes;
218
219 struct radv_shader_part *prolog = radv_create_vs_prolog(device, &key);
220 if (!prolog)
221 return vk_error(instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
222
223 assert(idx == radv_instance_rate_prolog_index(num_attributes, key.instance_rate_inputs));
224 device->instance_rate_vs_prologs[idx++] = prolog;
225 }
226 }
227 }
228 assert(idx == ARRAY_SIZE(device->instance_rate_vs_prologs));
229
230 return VK_SUCCESS;
231 }
232
233 static void
radv_device_finish_vs_prologs(struct radv_device *device)
235 {
236 if (device->vs_prologs.ops)
237 radv_shader_part_cache_finish(device, &device->vs_prologs);
238
239 for (unsigned i = 0; i < ARRAY_SIZE(device->simple_vs_prologs); i++) {
240 if (!device->simple_vs_prologs[i])
241 continue;
242
243 radv_shader_part_unref(device, device->simple_vs_prologs[i]);
244 }
245
246 for (unsigned i = 0; i < ARRAY_SIZE(device->instance_rate_vs_prologs); i++) {
247 if (!device->instance_rate_vs_prologs[i])
248 continue;
249
250 radv_shader_part_unref(device, device->instance_rate_vs_prologs[i]);
251 }
252 }
253
254 static struct radv_shader_part *
_radv_create_ps_epilog(struct radv_device *device, const void *_key)
256 {
257 struct radv_ps_epilog_key *key = (struct radv_ps_epilog_key *)_key;
258 return radv_create_ps_epilog(device, key, NULL);
259 }
260
261 static uint32_t
radv_hash_ps_epilog(const void *key_)
263 {
264 const struct radv_ps_epilog_key *key = key_;
265 return _mesa_hash_data(key, sizeof(*key));
266 }
267
268 static bool
radv_cmp_ps_epilog(const void *a_, const void *b_)
270 {
271 const struct radv_ps_epilog_key *a = a_;
272 const struct radv_ps_epilog_key *b = b_;
273
274 return memcmp(a, b, sizeof(*a)) == 0;
275 }
276
277 static struct radv_shader_part_cache_ops ps_epilog_ops = {
278 .create = _radv_create_ps_epilog,
279 .hash = radv_hash_ps_epilog,
280 .equals = radv_cmp_ps_epilog,
281 };
282
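/* Device-wide VRS state: a D16 image covering the maximum framebuffer size plus a storage
 * buffer, sized for that image's metadata, backed by its own memory allocation.
 */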
283 VkResult
radv_device_init_vrs_state(struct radv_device *device)
285 {
286 VkDeviceMemory mem;
287 VkBuffer buffer;
288 VkResult result;
289 VkImage image;
290
291 VkImageCreateInfo image_create_info = {
292 .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
293 .imageType = VK_IMAGE_TYPE_2D,
294 .format = VK_FORMAT_D16_UNORM,
295 .extent = {MAX_FRAMEBUFFER_WIDTH, MAX_FRAMEBUFFER_HEIGHT, 1},
296 .mipLevels = 1,
297 .arrayLayers = 1,
298 .samples = VK_SAMPLE_COUNT_1_BIT,
299 .tiling = VK_IMAGE_TILING_OPTIMAL,
300 .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
301 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
302 .queueFamilyIndexCount = 0,
303 .pQueueFamilyIndices = NULL,
304 .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
305 };
306
307 result =
308 radv_image_create(radv_device_to_handle(device), &(struct radv_image_create_info){.vk_info = &image_create_info},
309 &device->meta_state.alloc, &image, true);
310 if (result != VK_SUCCESS)
311 return result;
312
313 VkBufferCreateInfo buffer_create_info = {
314 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
315 .pNext =
316 &(VkBufferUsageFlags2CreateInfoKHR){
317 .sType = VK_STRUCTURE_TYPE_BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR,
318 .usage = VK_BUFFER_USAGE_2_STORAGE_BUFFER_BIT_KHR,
319 },
320 .size = radv_image_from_handle(image)->planes[0].surface.meta_size,
321 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
322 };
323
324 result = radv_create_buffer(device, &buffer_create_info, &device->meta_state.alloc, &buffer, true);
325 if (result != VK_SUCCESS)
326 goto fail_create;
327
328 VkBufferMemoryRequirementsInfo2 info = {
329 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,
330 .buffer = buffer,
331 };
332 VkMemoryRequirements2 mem_req = {
333 .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
334 };
335 vk_common_GetBufferMemoryRequirements2(radv_device_to_handle(device), &info, &mem_req);
336
337 VkMemoryAllocateInfo alloc_info = {
338 .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
339 .allocationSize = mem_req.memoryRequirements.size,
340 };
341
342 result = radv_alloc_memory(device, &alloc_info, &device->meta_state.alloc, &mem, true);
343 if (result != VK_SUCCESS)
344 goto fail_alloc;
345
346 VkBindBufferMemoryInfo bind_info = {.sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
347 .buffer = buffer,
348 .memory = mem,
349 .memoryOffset = 0};
350
351 result = radv_BindBufferMemory2(radv_device_to_handle(device), 1, &bind_info);
352 if (result != VK_SUCCESS)
353 goto fail_bind;
354
355 device->vrs.image = radv_image_from_handle(image);
356 device->vrs.buffer = radv_buffer_from_handle(buffer);
357 device->vrs.mem = radv_device_memory_from_handle(mem);
358
359 return VK_SUCCESS;
360
361 fail_bind:
362 radv_FreeMemory(radv_device_to_handle(device), mem, &device->meta_state.alloc);
363 fail_alloc:
364 radv_DestroyBuffer(radv_device_to_handle(device), buffer, &device->meta_state.alloc);
365 fail_create:
366 radv_DestroyImage(radv_device_to_handle(device), image, &device->meta_state.alloc);
367
368 return result;
369 }
370
371 static void
radv_device_finish_vrs_image(struct radv_device *device)
373 {
374 if (!device->vrs.image)
375 return;
376
377 radv_FreeMemory(radv_device_to_handle(device), radv_device_memory_to_handle(device->vrs.mem),
378 &device->meta_state.alloc);
379 radv_DestroyBuffer(radv_device_to_handle(device), radv_buffer_to_handle(device->vrs.buffer),
380 &device->meta_state.alloc);
381 radv_DestroyImage(radv_device_to_handle(device), radv_image_to_handle(device->vrs.image), &device->meta_state.alloc);
382 }
383
384 static enum radv_force_vrs
radv_parse_vrs_rates(const char *str)
386 {
387 if (!strcmp(str, "2x2")) {
388 return RADV_FORCE_VRS_2x2;
389 } else if (!strcmp(str, "2x1")) {
390 return RADV_FORCE_VRS_2x1;
391 } else if (!strcmp(str, "1x2")) {
392 return RADV_FORCE_VRS_1x2;
393 } else if (!strcmp(str, "1x1")) {
394 return RADV_FORCE_VRS_1x1;
395 }
396
397 fprintf(stderr, "radv: Invalid VRS rates specified (valid values are 2x2, 2x1, 1x2 and 1x1)\n");
398 return RADV_FORCE_VRS_1x1;
399 }
400
401 static const char *
radv_get_force_vrs_config_file(void)
403 {
404 return getenv("RADV_FORCE_VRS_CONFIG_FILE");
405 }
406
407 static enum radv_force_vrs
radv_parse_force_vrs_config_file(const char *config_file)
409 {
410 enum radv_force_vrs force_vrs = RADV_FORCE_VRS_1x1;
411 char buf[4];
412 FILE *f;
413
414 f = fopen(config_file, "r");
415 if (!f) {
416 fprintf(stderr, "radv: Can't open file: '%s'.\n", config_file);
417 return force_vrs;
418 }
419
420 if (fread(buf, sizeof(buf), 1, f) == 1) {
421 buf[3] = '\0';
422 force_vrs = radv_parse_vrs_rates(buf);
423 }
424
425 fclose(f);
426 return force_vrs;
427 }
428
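/* inotify-based watcher: re-parse RADV_FORCE_VRS_CONFIG_FILE whenever it is modified or
 * replaced, so the forced VRS rate can be changed at runtime.
 */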
429 #ifdef __linux__
430
431 #define BUF_LEN ((10 * (sizeof(struct inotify_event) + NAME_MAX + 1)))
432
433 static int
radv_notifier_thread_run(void *data)
435 {
436 struct radv_device *device = data;
437 struct radv_notifier *notifier = &device->notifier;
438 char buf[BUF_LEN];
439
440 while (!notifier->quit) {
441 const char *file = radv_get_force_vrs_config_file();
      struct timespec tm = {.tv_nsec = 100000000}; /* 100ms */
443 int length, i = 0;
444
445 length = read(notifier->fd, buf, BUF_LEN);
446 while (i < length) {
447 struct inotify_event *event = (struct inotify_event *)&buf[i];
448
449 i += sizeof(struct inotify_event) + event->len;
450 if (event->mask & IN_MODIFY || event->mask & IN_DELETE_SELF) {
451 /* Sleep 100ms for editors that use a temporary file and delete the original. */
452 thrd_sleep(&tm, NULL);
453 device->force_vrs = radv_parse_force_vrs_config_file(file);
454
455 fprintf(stderr, "radv: Updated the per-vertex VRS rate to '%d'.\n", device->force_vrs);
456
457 if (event->mask & IN_DELETE_SELF) {
458 inotify_rm_watch(notifier->fd, notifier->watch);
459 notifier->watch = inotify_add_watch(notifier->fd, file, IN_MODIFY | IN_DELETE_SELF);
460 }
461 }
462 }
463
464 thrd_sleep(&tm, NULL);
465 }
466
467 return 0;
468 }
469
470 #endif
471
472 static int
radv_device_init_notifier(struct radv_device *device)
474 {
475 #ifndef __linux__
476 return true;
477 #else
478 struct radv_notifier *notifier = &device->notifier;
479 const char *file = radv_get_force_vrs_config_file();
480 int ret;
481
482 notifier->fd = inotify_init1(IN_NONBLOCK);
483 if (notifier->fd < 0)
484 return false;
485
486 notifier->watch = inotify_add_watch(notifier->fd, file, IN_MODIFY | IN_DELETE_SELF);
487 if (notifier->watch < 0)
488 goto fail_watch;
489
490 ret = thrd_create(¬ifier->thread, radv_notifier_thread_run, device);
491 if (ret)
492 goto fail_thread;
493
494 return true;
495
496 fail_thread:
497 inotify_rm_watch(notifier->fd, notifier->watch);
498 fail_watch:
499 close(notifier->fd);
500
501 return false;
502 #endif
503 }
504
505 static void
radv_device_finish_notifier(struct radv_device *device)
507 {
508 #ifdef __linux__
509 struct radv_notifier *notifier = &device->notifier;
510
511 if (!notifier->thread)
512 return;
513
514 notifier->quit = true;
515 thrd_join(notifier->thread, NULL);
516 inotify_rm_watch(notifier->fd, notifier->watch);
517 close(notifier->fd);
518 #endif
519 }
520
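/* Allocate the CPU-visible GTT BO that backs performance counter results (one 64-bit slot
 * per pass after PERF_CTR_BO_PASS_OFFSET) and the array of per-pass lock CS pointers
 * (two per pass, created on demand).
 */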
521 static VkResult
radv_device_init_perf_counter(struct radv_device *device)
523 {
524 const struct radv_physical_device *pdev = radv_device_physical(device);
525 const size_t bo_size = PERF_CTR_BO_PASS_OFFSET + sizeof(uint64_t) * PERF_CTR_MAX_PASSES;
526 VkResult result;
527
528 result = radv_bo_create(device, NULL, bo_size, 4096, RADEON_DOMAIN_GTT,
529 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_UPLOAD_BUFFER,
530 0, true, &device->perf_counter_bo);
531 if (result != VK_SUCCESS)
532 return result;
533
534 device->perf_counter_lock_cs = calloc(sizeof(struct radeon_winsys_cs *), 2 * PERF_CTR_MAX_PASSES);
535 if (!device->perf_counter_lock_cs)
536 return VK_ERROR_OUT_OF_HOST_MEMORY;
537
538 if (!pdev->ac_perfcounters.blocks)
539 return VK_ERROR_INITIALIZATION_FAILED;
540
541 return VK_SUCCESS;
542 }
543
544 static void
radv_device_finish_perf_counter(struct radv_device *device)
546 {
547 if (device->perf_counter_bo)
548 radv_bo_destroy(device, NULL, device->perf_counter_bo);
549
550 if (!device->perf_counter_lock_cs)
551 return;
552
553 for (unsigned i = 0; i < 2 * PERF_CTR_MAX_PASSES; ++i) {
554 if (device->perf_counter_lock_cs[i])
555 device->ws->cs_destroy(device->perf_counter_lock_cs[i]);
556 }
557
558 free(device->perf_counter_lock_cs);
559 }
560
561 static VkResult
radv_device_init_memory_cache(struct radv_device *device)
563 {
564 struct vk_pipeline_cache_create_info info = {.weak_ref = true};
565
566 device->mem_cache = vk_pipeline_cache_create(&device->vk, &info, NULL);
567 if (!device->mem_cache)
568 return VK_ERROR_OUT_OF_HOST_MEMORY;
569
570 return VK_SUCCESS;
571 }
572
573 static void
radv_device_finish_memory_cache(struct radv_device *device)
575 {
576 if (device->mem_cache)
577 vk_pipeline_cache_destroy(device->mem_cache, NULL);
578 }
579
580 static VkResult
radv_device_init_rgp(struct radv_device *device)
582 {
583 const struct radv_physical_device *pdev = radv_device_physical(device);
584 const struct radv_instance *instance = radv_physical_device_instance(pdev);
585
586 if (!(instance->vk.trace_mode & RADV_TRACE_MODE_RGP))
587 return VK_SUCCESS;
588
589 if (pdev->info.gfx_level < GFX8 || pdev->info.gfx_level > GFX11_5) {
590 fprintf(stderr, "GPU hardware not supported: refer to "
591 "the RGP documentation for the list of "
592 "supported GPUs!\n");
593 abort();
594 }
595
596 if (!radv_sqtt_init(device))
597 return VK_ERROR_INITIALIZATION_FAILED;
598
599 fprintf(stderr,
600 "radv: Thread trace support is enabled (initial buffer size: %u MiB, "
601 "instruction timing: %s, cache counters: %s, queue events: %s).\n",
602 device->sqtt.buffer_size / (1024 * 1024), radv_is_instruction_timing_enabled() ? "enabled" : "disabled",
603 radv_spm_trace_enabled(instance) ? "enabled" : "disabled",
604 radv_sqtt_queue_events_enabled() ? "enabled" : "disabled");
605
606 if (radv_spm_trace_enabled(instance)) {
607 if (pdev->info.gfx_level >= GFX10 && pdev->info.gfx_level < GFX11_5) {
608 if (!radv_spm_init(device))
609 return VK_ERROR_INITIALIZATION_FAILED;
610 } else {
611 fprintf(stderr, "radv: SPM isn't supported for this GPU (%s)!\n", pdev->name);
612 }
613 }
614
615 return VK_SUCCESS;
616 }
617
618 static void
radv_device_finish_rgp(struct radv_device *device)
620 {
621 radv_sqtt_finish(device);
622 radv_spm_finish(device);
623 }
624
625 static void
radv_device_init_rmv(struct radv_device *device)
627 {
628 const struct radv_physical_device *pdev = radv_device_physical(device);
629 const struct radv_instance *instance = radv_physical_device_instance(pdev);
630
631 if (!(instance->vk.trace_mode & VK_TRACE_MODE_RMV))
632 return;
633
634 struct vk_rmv_device_info info;
635 memset(&info, 0, sizeof(struct vk_rmv_device_info));
636 radv_rmv_fill_device_info(pdev, &info);
637 vk_memory_trace_init(&device->vk, &info);
638 radv_memory_trace_init(device);
639 }
640
641 static VkResult
radv_device_init_trap_handler(struct radv_device *device)
643 {
644 const struct radv_physical_device *pdev = radv_device_physical(device);
645
646 if (!radv_trap_handler_enabled())
647 return VK_SUCCESS;
648
649 /* TODO: Add support for more hardware. */
650 assert(pdev->info.gfx_level == GFX8);
651
652 fprintf(stderr, "**********************************************************************\n");
653 fprintf(stderr, "* WARNING: RADV_TRAP_HANDLER is experimental and only for debugging! *\n");
654 fprintf(stderr, "**********************************************************************\n");
655
656 if (!radv_trap_handler_init(device))
657 return VK_ERROR_INITIALIZATION_FAILED;
658
659 return VK_SUCCESS;
660 }
661
662 static VkResult
radv_device_init_device_fault_detection(struct radv_device *device)
664 {
665 const struct radv_physical_device *pdev = radv_device_physical(device);
666 struct radv_instance *instance = radv_physical_device_instance(pdev);
667
668 if (!radv_device_fault_detection_enabled(device))
669 return VK_SUCCESS;
670
671 if (!radv_init_trace(device))
672 return VK_ERROR_INITIALIZATION_FAILED;
673
674 fprintf(stderr, "*****************************************************************************\n");
675 fprintf(stderr, "* WARNING: RADV_DEBUG=hang is costly and should only be used for debugging! *\n");
676 fprintf(stderr, "*****************************************************************************\n");
677
678 /* Wait for idle after every draw/dispatch to identify the
679 * first bad call.
680 */
681 instance->debug_flags |= RADV_DEBUG_SYNC_SHADERS;
682
683 radv_dump_enabled_options(device, stderr);
684
685 return VK_SUCCESS;
686 }
687
688 static void
radv_device_finish_device_fault_detection(struct radv_device *device)
690 {
691 radv_finish_trace(device);
692 ralloc_free(device->gpu_hang_report);
693 }
694
695 static VkResult
radv_device_init_tools(struct radv_device *device)
697 {
698 const struct radv_physical_device *pdev = radv_device_physical(device);
699 struct radv_instance *instance = radv_physical_device_instance(pdev);
700 VkResult result;
701
702 result = radv_device_init_device_fault_detection(device);
703 if (result != VK_SUCCESS)
704 return result;
705
706 result = radv_device_init_rgp(device);
707 if (result != VK_SUCCESS)
708 return result;
709
710 radv_device_init_rmv(device);
711
712 result = radv_device_init_trap_handler(device);
713 if (result != VK_SUCCESS)
714 return result;
715
716 if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev, false)) {
717 result = radv_rra_trace_init(device);
718 if (result != VK_SUCCESS)
719 return result;
720 }
721
722 result = radv_printf_data_init(device);
723 if (result != VK_SUCCESS)
724 return result;
725
726 return VK_SUCCESS;
727 }
728
729 static void
radv_device_finish_tools(struct radv_device *device)
731 {
732 radv_printf_data_finish(device);
733 radv_rra_trace_finish(radv_device_to_handle(device), &device->rra_trace);
734 radv_trap_handler_finish(device);
735 radv_memory_trace_finish(device);
736 radv_device_finish_rgp(device);
737 radv_device_finish_device_fault_detection(device);
738 }
739
740 struct dispatch_table_builder {
741 struct vk_device_dispatch_table *tables[RADV_DISPATCH_TABLE_COUNT];
742 bool used[RADV_DISPATCH_TABLE_COUNT];
743 bool initialized[RADV_DISPATCH_TABLE_COUNT];
744 };
745
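/* Write a layer's entrypoints into the device table and into every lower-level table
 * already in use. After a table's first write only unset slots are filled, so each layer
 * resolves its own entrypoints first and falls through to the core radv/wsi/common ones
 * added last.
 */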
746 static void
add_entrypoints(struct dispatch_table_builder *b, const struct vk_device_entrypoint_table *entrypoints,
                enum radv_dispatch_table table)
749 {
750 for (int32_t i = table - 1; i >= RADV_DEVICE_DISPATCH_TABLE; i--) {
751 if (i == RADV_DEVICE_DISPATCH_TABLE || b->used[i]) {
752 vk_device_dispatch_table_from_entrypoints(b->tables[i], entrypoints, !b->initialized[i]);
753 b->initialized[i] = true;
754 }
755 }
756
757 if (table < RADV_DISPATCH_TABLE_COUNT)
758 b->used[table] = true;
759 }
760
761 static void
init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pdev)
763 {
764 const struct radv_instance *instance = radv_physical_device_instance(pdev);
765 struct dispatch_table_builder b = {0};
766 b.tables[RADV_DEVICE_DISPATCH_TABLE] = &device->vk.dispatch_table;
767 b.tables[RADV_ANNOTATE_DISPATCH_TABLE] = &device->layer_dispatch.annotate;
768 b.tables[RADV_APP_DISPATCH_TABLE] = &device->layer_dispatch.app;
769 b.tables[RADV_RGP_DISPATCH_TABLE] = &device->layer_dispatch.rgp;
770 b.tables[RADV_RRA_DISPATCH_TABLE] = &device->layer_dispatch.rra;
771 b.tables[RADV_RMV_DISPATCH_TABLE] = &device->layer_dispatch.rmv;
772 b.tables[RADV_CTX_ROLL_DISPATCH_TABLE] = &device->layer_dispatch.ctx_roll;
773
774 bool gather_ctx_rolls = instance->vk.trace_mode & RADV_TRACE_MODE_CTX_ROLLS;
775 if (radv_device_fault_detection_enabled(device) || gather_ctx_rolls)
776 add_entrypoints(&b, &annotate_device_entrypoints, RADV_ANNOTATE_DISPATCH_TABLE);
777
778 if (!strcmp(instance->drirc.app_layer, "metroexodus")) {
779 add_entrypoints(&b, &metro_exodus_device_entrypoints, RADV_APP_DISPATCH_TABLE);
780 } else if (!strcmp(instance->drirc.app_layer, "rage2")) {
781 add_entrypoints(&b, &rage2_device_entrypoints, RADV_APP_DISPATCH_TABLE);
782 } else if (!strcmp(instance->drirc.app_layer, "quanticdream")) {
783 add_entrypoints(&b, &quantic_dream_device_entrypoints, RADV_APP_DISPATCH_TABLE);
784 }
785
786 if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
787 add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE);
788
789 if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev, false))
790 add_entrypoints(&b, &rra_device_entrypoints, RADV_RRA_DISPATCH_TABLE);
791
792 #ifndef _WIN32
793 if (instance->vk.trace_mode & VK_TRACE_MODE_RMV)
794 add_entrypoints(&b, &rmv_device_entrypoints, RADV_RMV_DISPATCH_TABLE);
795 #endif
796
797 if (gather_ctx_rolls)
798 add_entrypoints(&b, &ctx_roll_device_entrypoints, RADV_CTX_ROLL_DISPATCH_TABLE);
799
800 add_entrypoints(&b, &radv_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
801 add_entrypoints(&b, &wsi_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
802 add_entrypoints(&b, &vk_common_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
803 }
804
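/* Implements vk.capture_trace: depending on the enabled trace modes this arms an RRA
 * capture, dumps the RMV memory trace, arms an SQTT/RGP capture and opens the context
 * roll log file.
 */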
805 static VkResult
capture_trace(VkQueue _queue)
807 {
808 VK_FROM_HANDLE(radv_queue, queue, _queue);
809 struct radv_device *device = radv_queue_device(queue);
810 const struct radv_physical_device *pdev = radv_device_physical(device);
811 const struct radv_instance *instance = radv_physical_device_instance(pdev);
812
813 VkResult result = VK_SUCCESS;
814
815 if (instance->vk.trace_mode & RADV_TRACE_MODE_RRA)
816 device->rra_trace.triggered = true;
817
818 if (device->vk.memory_trace_data.is_enabled) {
819 simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
820 radv_rmv_collect_trace_events(device);
821 vk_dump_rmv_capture(&device->vk.memory_trace_data);
822 simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
823 }
824
825 if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
826 device->sqtt_triggered = true;
827
828 if (instance->vk.trace_mode & RADV_TRACE_MODE_CTX_ROLLS) {
829 char filename[2048];
830 time_t t = time(NULL);
831 struct tm now = *localtime(&t);
832 snprintf(filename, sizeof(filename), "/tmp/%s_%04d.%02d.%02d_%02d.%02d.%02d.ctxroll", util_get_process_name(),
833 1900 + now.tm_year, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec);
834
835 simple_mtx_lock(&device->ctx_roll_mtx);
836
837 device->ctx_roll_file = fopen(filename, "w");
838 if (device->ctx_roll_file)
839 fprintf(stderr, "radv: Writing context rolls to '%s'...\n", filename);
840
841 simple_mtx_unlock(&device->ctx_roll_mtx);
842 }
843
844 return result;
845 }
846
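/* Derive the device-level shader cache key (and its BLAKE3 hash) from the enabled
 * features that change the generated shader code.
 */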
847 static void
radv_device_init_cache_key(struct radv_device *device)
849 {
850 const struct radv_physical_device *pdev = radv_device_physical(device);
851 struct radv_device_cache_key *key = &device->cache_key;
852
853 key->disable_trunc_coord = device->disable_trunc_coord;
854 key->image_2d_view_of_3d = device->vk.enabled_features.image2DViewOf3D && pdev->info.gfx_level == GFX9;
855 key->mesh_shader_queries = device->vk.enabled_features.meshShaderQueries;
856 key->primitives_generated_query = radv_uses_primitives_generated_query(device);
857
858 /* The Vulkan spec says:
859 * "Binary shaders retrieved from a physical device with a certain shaderBinaryUUID are
860 * guaranteed to be compatible with all other physical devices reporting the same
861 * shaderBinaryUUID and the same or higher shaderBinaryVersion."
862 *
863 * That means the driver should compile shaders for the "worst" case of all features being
864 * enabled, regardless of what features are actually enabled on the logical device.
865 */
866 if (device->vk.enabled_features.shaderObject) {
867 key->image_2d_view_of_3d = pdev->info.gfx_level == GFX9;
868 key->primitives_generated_query = true;
869 }
870
871 _mesa_blake3_compute(key, sizeof(*key), device->cache_hash);
872 }
873
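/* Record the static graphics state once into a read-only BO (device->gfx_init) so it can
 * be replayed as a preamble instead of being re-emitted on every submission.
 */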
874 static void
radv_create_gfx_preamble(struct radv_device *device)
876 {
877 struct radeon_cmdbuf *cs = device->ws->cs_create(device->ws, AMD_IP_GFX, false);
878 if (!cs)
879 return;
880
881 radeon_check_space(device->ws, cs, 512);
882
883 radv_emit_graphics(device, cs);
884
885 device->ws->cs_pad(cs, 0);
886
887 VkResult result = radv_bo_create(
888 device, NULL, cs->cdw * 4, 4096, device->ws->cs_domain(device->ws),
889 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | RADEON_FLAG_GTT_WC,
890 RADV_BO_PRIORITY_CS, 0, true, &device->gfx_init);
891 if (result != VK_SUCCESS)
892 goto fail;
893
894 void *map = radv_buffer_map(device->ws, device->gfx_init);
895 if (!map) {
896 radv_bo_destroy(device, NULL, device->gfx_init);
897 device->gfx_init = NULL;
898 goto fail;
899 }
900 memcpy(map, cs->buf, cs->cdw * 4);
901
902 device->ws->buffer_unmap(device->ws, device->gfx_init, false);
903 device->gfx_init_size_dw = cs->cdw;
904 fail:
905 device->ws->cs_destroy(cs);
906 }
907
908 /* For MSAA sample positions. */
909 #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \
910 ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) | \
911 (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) | (((unsigned)(s2y)&0xf) << 20) | \
912 (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28))
913
914 /* For obtaining location coordinates from registers */
915 #define SEXT4(x) ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
916 #define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
917 #define GET_SX(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
918 #define GET_SY(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)
919
920 /* 1x MSAA */
921 static const uint32_t sample_locs_1x = FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0);
922 static const unsigned max_dist_1x = 0;
923 static const uint64_t centroid_priority_1x = 0x0000000000000000ull;
924
925 /* 2xMSAA */
926 static const uint32_t sample_locs_2x = FILL_SREG(4, 4, -4, -4, 0, 0, 0, 0);
927 static const unsigned max_dist_2x = 4;
928 static const uint64_t centroid_priority_2x = 0x1010101010101010ull;
929
930 /* 4xMSAA */
931 static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6);
932 static const unsigned max_dist_4x = 6;
933 static const uint64_t centroid_priority_4x = 0x3210321032103210ull;
934
935 /* 8xMSAA */
936 static const uint32_t sample_locs_8x[] = {
937 FILL_SREG(1, -3, -1, 3, 5, 1, -3, -5),
938 FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7),
939 /* The following are unused by hardware, but we emit them to IBs
940 * instead of multiple SET_CONTEXT_REG packets. */
941 0,
942 0,
943 };
944 static const unsigned max_dist_8x = 7;
945 static const uint64_t centroid_priority_8x = 0x7654321076543210ull;
946
947 unsigned
radv_get_default_max_sample_dist(int log_samples)
949 {
950 unsigned max_dist[] = {
951 max_dist_1x,
952 max_dist_2x,
953 max_dist_4x,
954 max_dist_8x,
955 };
956 return max_dist[log_samples];
957 }
958
959 void
radv_emit_default_sample_locations(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs, int nr_samples)
961 {
962 uint64_t centroid_priority;
963
964 switch (nr_samples) {
965 default:
966 case 1:
967 centroid_priority = centroid_priority_1x;
968
969 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_1x);
970 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_1x);
971 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_1x);
972 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_1x);
973 break;
974 case 2:
975 centroid_priority = centroid_priority_2x;
976
977 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_2x);
978 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_2x);
979 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_2x);
980 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_2x);
981 break;
982 case 4:
983 centroid_priority = centroid_priority_4x;
984
985 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_4x);
986 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_4x);
987 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_4x);
988 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_4x);
989 break;
990 case 8:
991 centroid_priority = centroid_priority_8x;
992
993 radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14);
994 radeon_emit_array(cs, sample_locs_8x, 4);
995 radeon_emit_array(cs, sample_locs_8x, 4);
996 radeon_emit_array(cs, sample_locs_8x, 4);
997 radeon_emit_array(cs, sample_locs_8x, 2);
998 break;
999 }
1000
1001 if (pdev->info.gfx_level >= GFX12) {
1002 radeon_set_context_reg_seq(cs, R_028BF0_PA_SC_CENTROID_PRIORITY_0, 2);
1003 } else {
1004 radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
1005 }
1006 radeon_emit(cs, centroid_priority);
1007 radeon_emit(cs, centroid_priority >> 32);
1008 }
1009
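/* Convert the packed signed 4-bit sub-pixel offsets above into floating-point sample
 * positions in the [0, 1) range.
 */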
1010 static void
radv_get_sample_position(struct radv_device *device, unsigned sample_count, unsigned sample_index, float *out_value)
1012 {
1013 const uint32_t *sample_locs;
1014
1015 switch (sample_count) {
1016 case 1:
1017 default:
1018 sample_locs = &sample_locs_1x;
1019 break;
1020 case 2:
1021 sample_locs = &sample_locs_2x;
1022 break;
1023 case 4:
1024 sample_locs = &sample_locs_4x;
1025 break;
1026 case 8:
1027 sample_locs = sample_locs_8x;
1028 break;
1029 }
1030
1031 out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
1032 out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
1033 }
1034
1035 static void
radv_device_init_msaa(struct radv_device *device)
1037 {
1038 int i;
1039
1040 radv_get_sample_position(device, 1, 0, device->sample_locations_1x[0]);
1041
1042 for (i = 0; i < 2; i++)
1043 radv_get_sample_position(device, 2, i, device->sample_locations_2x[i]);
1044 for (i = 0; i < 4; i++)
1045 radv_get_sample_position(device, 4, i, device->sample_locations_4x[i]);
1046 for (i = 0; i < 8; i++)
1047 radv_get_sample_position(device, 8, i, device->sample_locations_8x[i]);
1048 }
1049
1050 VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo,
                  const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
1053 {
1054 VK_FROM_HANDLE(radv_physical_device, pdev, physicalDevice);
1055 struct radv_instance *instance = radv_physical_device_instance(pdev);
1056 VkResult result;
1057 struct radv_device *device;
1058
1059 bool overallocation_disallowed = false;
1060
1061 vk_foreach_struct_const (ext, pCreateInfo->pNext) {
1062 switch (ext->sType) {
1063 case VK_STRUCTURE_TYPE_DEVICE_MEMORY_OVERALLOCATION_CREATE_INFO_AMD: {
1064 const VkDeviceMemoryOverallocationCreateInfoAMD *overallocation = (const void *)ext;
1065 if (overallocation->overallocationBehavior == VK_MEMORY_OVERALLOCATION_BEHAVIOR_DISALLOWED_AMD)
1066 overallocation_disallowed = true;
1067 break;
1068 }
1069 default:
1070 break;
1071 }
1072 }
1073
1074 device = vk_zalloc2(&instance->vk.alloc, pAllocator, sizeof(*device), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1075 if (!device)
1076 return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1077
1078 result = vk_device_init(&device->vk, &pdev->vk, NULL, pCreateInfo, pAllocator);
1079 if (result != VK_SUCCESS) {
1080 vk_free(&device->vk.alloc, device);
1081 return result;
1082 }
1083
1084 device->vk.capture_trace = capture_trace;
1085
1086 device->vk.command_buffer_ops = &radv_cmd_buffer_ops;
1087
1088 init_dispatch_tables(device, pdev);
1089
1090 simple_mtx_init(&device->ctx_roll_mtx, mtx_plain);
1091 simple_mtx_init(&device->trace_mtx, mtx_plain);
1092 simple_mtx_init(&device->pstate_mtx, mtx_plain);
1093 simple_mtx_init(&device->rt_handles_mtx, mtx_plain);
1094 simple_mtx_init(&device->compute_scratch_mtx, mtx_plain);
1095 simple_mtx_init(&device->pso_cache_stats_mtx, mtx_plain);
1096
1097 device->rt_handles = _mesa_hash_table_create(NULL, _mesa_hash_u32, _mesa_key_u32_equal);
1098
1099 device->ws = pdev->ws;
1100 vk_device_set_drm_fd(&device->vk, device->ws->get_fd(device->ws));
1101
   /* With update after bind we can't attach BOs to the command buffer
1103 * from the descriptor set anymore, so we have to use a global BO list.
1104 */
1105 device->use_global_bo_list =
1106 (instance->perftest_flags & RADV_PERFTEST_BO_LIST) || device->vk.enabled_features.bufferDeviceAddress ||
1107 device->vk.enabled_features.descriptorIndexing || device->vk.enabled_extensions.EXT_descriptor_indexing ||
1108 device->vk.enabled_extensions.EXT_buffer_device_address ||
1109 device->vk.enabled_extensions.KHR_buffer_device_address ||
1110 device->vk.enabled_extensions.KHR_ray_tracing_pipeline ||
1111 device->vk.enabled_extensions.KHR_acceleration_structure ||
1112 device->vk.enabled_extensions.VALVE_descriptor_set_host_mapping;
1113
1114 radv_init_shader_arenas(device);
1115
1116 device->overallocation_disallowed = overallocation_disallowed;
1117 mtx_init(&device->overallocation_mutex, mtx_plain);
1118
1119 if (pdev->info.register_shadowing_required || instance->debug_flags & RADV_DEBUG_SHADOW_REGS)
1120 device->uses_shadow_regs = true;
1121
1122 /* Create one context per queue priority. */
1123 for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
1124 const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
1125 const VkDeviceQueueGlobalPriorityCreateInfoKHR *global_priority =
1126 vk_find_struct_const(queue_create->pNext, DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
1127 enum radeon_ctx_priority priority = radv_get_queue_global_priority(global_priority);
1128
1129 if (device->hw_ctx[priority])
1130 continue;
1131
1132 result = device->ws->ctx_create(device->ws, priority, &device->hw_ctx[priority]);
1133 if (result != VK_SUCCESS)
1134 goto fail_queue;
1135 }
1136
1137 for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
1138 const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
1139 uint32_t qfi = queue_create->queueFamilyIndex;
1140 const VkDeviceQueueGlobalPriorityCreateInfoKHR *global_priority =
1141 vk_find_struct_const(queue_create->pNext, DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
1142
1143 device->queues[qfi] = vk_zalloc(&device->vk.alloc, queue_create->queueCount * sizeof(struct radv_queue), 8,
1144 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1145 if (!device->queues[qfi]) {
1146 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1147 goto fail_queue;
1148 }
1149
1150 device->queue_count[qfi] = queue_create->queueCount;
1151
1152 for (unsigned q = 0; q < queue_create->queueCount; q++) {
1153 result = radv_queue_init(device, &device->queues[qfi][q], q, queue_create, global_priority);
1154 if (result != VK_SUCCESS)
1155 goto fail_queue;
1156 }
1157 }
1158 device->private_sdma_queue = VK_NULL_HANDLE;
1159
1160 device->shader_use_invisible_vram = (instance->perftest_flags & RADV_PERFTEST_DMA_SHADERS) &&
1161 /* SDMA buffer copy is only implemented for GFX7+. */
1162 pdev->info.gfx_level >= GFX7;
1163 result = radv_init_shader_upload_queue(device);
1164 if (result != VK_SUCCESS)
1165 goto fail;
1166
1167 device->pbb_allowed = pdev->info.gfx_level >= GFX9 && !(instance->debug_flags & RADV_DEBUG_NOBINNING);
1168
1169 device->disable_trunc_coord = instance->drirc.disable_trunc_coord;
1170
1171 if (instance->vk.app_info.engine_name && !strcmp(instance->vk.app_info.engine_name, "DXVK")) {
1172 /* For DXVK 2.3.0 and older, use dualSrcBlend to determine if this is D3D9. */
1173 bool is_d3d9 = !device->vk.enabled_features.dualSrcBlend;
1174 if (instance->vk.app_info.engine_version > VK_MAKE_VERSION(2, 3, 0))
1175 is_d3d9 = instance->vk.app_info.app_version & 0x1;
1176
1177 device->disable_trunc_coord &= !is_d3d9;
1178 }
1179
1180 /* The maximum number of scratch waves. Scratch space isn't divided
1181 * evenly between CUs. The number is only a function of the number of CUs.
1182 * We can decrease the constant to decrease the scratch buffer size.
1183 *
1184 * sctx->scratch_waves must be >= the maximum possible size of
1185 * 1 threadgroup, so that the hw doesn't hang from being unable
1186 * to start any.
1187 *
1188 * The recommended value is 4 per CU at most. Higher numbers don't
1189 * bring much benefit, but they still occupy chip resources (think
1190 * async compute). I've seen ~2% performance difference between 4 and 32.
1191 */
1192 uint32_t max_threads_per_block = 2048;
1193 device->scratch_waves = MAX2(32 * pdev->info.num_cu, max_threads_per_block / 64);
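   /* Example: a hypothetical 40-CU GPU gets MAX2(32 * 40, 2048 / 64) = 1280 scratch waves. */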
1194
1195 device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1);
1196
1197 if (pdev->info.gfx_level >= GFX7) {
1198 /* If the KMD allows it (there is a KMD hw register for it),
1199 * allow launching waves out-of-order.
1200 */
1201 device->dispatch_initiator |= S_00B800_ORDER_MODE(1);
1202 }
1203 if (pdev->info.gfx_level >= GFX10) {
1204 /* Enable asynchronous compute tunneling. The KMD restricts this feature
1205 * to high-priority compute queues, so setting the bit on any other queue
1206 * is a no-op. PAL always sets this bit as well.
1207 */
1208 device->dispatch_initiator |= S_00B800_TUNNEL_ENABLE(1);
1209 }
1210
1211 /* Disable partial preemption for task shaders.
1212 * The kernel may not support preemption, but PAL always sets this bit,
1213 * so let's also set it here for consistency.
1214 */
1215 device->dispatch_initiator_task = device->dispatch_initiator | S_00B800_DISABLE_DISP_PREMPT_EN(1);
1216
1217 if (pdev->info.gfx_level == GFX10_3) {
1218 if (getenv("RADV_FORCE_VRS_CONFIG_FILE")) {
1219 const char *file = radv_get_force_vrs_config_file();
1220
1221 device->force_vrs = radv_parse_force_vrs_config_file(file);
1222
1223 if (radv_device_init_notifier(device)) {
1224 device->force_vrs_enabled = true;
1225 } else {
1226 fprintf(stderr, "radv: Failed to initialize the notifier for RADV_FORCE_VRS_CONFIG_FILE!\n");
1227 }
1228 } else if (getenv("RADV_FORCE_VRS")) {
1229 const char *vrs_rates = getenv("RADV_FORCE_VRS");
1230
1231 device->force_vrs = radv_parse_vrs_rates(vrs_rates);
1232 device->force_vrs_enabled = device->force_vrs != RADV_FORCE_VRS_1x1;
1233 }
1234 }
1235
1236 /* PKT3_LOAD_SH_REG_INDEX is supported on GFX8+, but it hangs with compute queues until GFX10.3. */
1237 device->load_grid_size_from_user_sgpr = pdev->info.gfx_level >= GFX10_3;
1238
1239 /* Keep shader info for GPU hangs debugging. */
1240 device->keep_shader_info = radv_device_fault_detection_enabled(device) || radv_trap_handler_enabled();
1241
1242 /* Initialize the per-device cache key before compiling meta shaders. */
1243 radv_device_init_cache_key(device);
1244
1245 result = radv_device_init_tools(device);
1246 if (result != VK_SUCCESS)
1247 goto fail;
1248
1249 result = radv_device_init_meta(device);
1250 if (result != VK_SUCCESS)
1251 goto fail;
1252
1253 radv_device_init_msaa(device);
1254
1255 /* If the border color extension is enabled, let's create the buffer we need. */
1256 if (device->vk.enabled_features.customBorderColors) {
1257 result = radv_device_init_border_color(device);
1258 if (result != VK_SUCCESS)
1259 goto fail;
1260 }
1261
1262 if (device->vk.enabled_features.vertexInputDynamicState || device->vk.enabled_features.graphicsPipelineLibrary ||
1263 device->vk.enabled_features.shaderObject) {
1264 result = radv_device_init_vs_prologs(device);
1265 if (result != VK_SUCCESS)
1266 goto fail;
1267 }
1268
1269 if (device->vk.enabled_features.graphicsPipelineLibrary || device->vk.enabled_features.shaderObject ||
1270 device->vk.enabled_features.extendedDynamicState3ColorBlendEnable ||
1271 device->vk.enabled_features.extendedDynamicState3ColorWriteMask ||
1272 device->vk.enabled_features.extendedDynamicState3AlphaToCoverageEnable ||
1273 device->vk.enabled_features.extendedDynamicState3ColorBlendEquation) {
1274 if (!radv_shader_part_cache_init(&device->ps_epilogs, &ps_epilog_ops)) {
1275 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1276 goto fail;
1277 }
1278 }
1279
1280 if (!(instance->debug_flags & RADV_DEBUG_NO_IBS))
1281 radv_create_gfx_preamble(device);
1282
1283 if (!device->vk.disable_internal_cache) {
1284 result = radv_device_init_memory_cache(device);
1285 if (result != VK_SUCCESS)
1286 goto fail_meta;
1287 }
1288
1289 device->force_aniso = MIN2(16, (int)debug_get_num_option("RADV_TEX_ANISO", -1));
1290 if (device->force_aniso >= 0) {
1291 fprintf(stderr, "radv: Forcing anisotropy filter to %ix\n", 1 << util_logbase2(device->force_aniso));
1292 }
1293
1294 if (device->vk.enabled_features.performanceCounterQueryPools) {
1295 result = radv_device_init_perf_counter(device);
1296 if (result != VK_SUCCESS)
1297 goto fail_cache;
1298 }
1299
1300 if (device->vk.enabled_features.rayTracingPipelineShaderGroupHandleCaptureReplay) {
1301 device->capture_replay_arena_vas = _mesa_hash_table_u64_create(NULL);
1302 }
1303
1304 if (pdev->info.gfx_level == GFX11 && pdev->info.has_dedicated_vram && instance->drirc.force_pstate_peak_gfx11_dgpu) {
1305 if (!radv_device_acquire_performance_counters(device))
1306 fprintf(stderr, "radv: failed to set pstate to profile_peak.\n");
1307 }
1308
1309 *pDevice = radv_device_to_handle(device);
1310 return VK_SUCCESS;
1311
1312 fail_cache:
1313 radv_device_finish_memory_cache(device);
1314 fail_meta:
1315 radv_device_finish_meta(device);
1316 fail:
1317 radv_device_finish_perf_counter(device);
1318
1319 radv_device_finish_tools(device);
1320
1321 if (device->gfx_init)
1322 radv_bo_destroy(device, NULL, device->gfx_init);
1323
1324 radv_device_finish_notifier(device);
1325 radv_device_finish_vs_prologs(device);
1326 if (device->ps_epilogs.ops)
1327 radv_shader_part_cache_finish(device, &device->ps_epilogs);
1328 radv_device_finish_border_color(device);
1329
1330 radv_destroy_shader_upload_queue(device);
1331
1332 fail_queue:
1333 for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
1334 for (unsigned q = 0; q < device->queue_count[i]; q++)
1335 radv_queue_finish(&device->queues[i][q]);
1336 if (device->queue_count[i])
1337 vk_free(&device->vk.alloc, device->queues[i]);
1338 }
1339
1340 for (unsigned i = 0; i < RADV_NUM_HW_CTX; i++) {
1341 if (device->hw_ctx[i])
1342 device->ws->ctx_destroy(device->hw_ctx[i]);
1343 }
1344
1345 radv_destroy_shader_arenas(device);
1346
1347 _mesa_hash_table_destroy(device->rt_handles, NULL);
1348
1349 simple_mtx_destroy(&device->ctx_roll_mtx);
1350 simple_mtx_destroy(&device->pstate_mtx);
1351 simple_mtx_destroy(&device->trace_mtx);
1352 simple_mtx_destroy(&device->rt_handles_mtx);
1353 simple_mtx_destroy(&device->compute_scratch_mtx);
1354 simple_mtx_destroy(&device->pso_cache_stats_mtx);
1355 mtx_destroy(&device->overallocation_mutex);
1356
1357 vk_device_finish(&device->vk);
1358 vk_free(&device->vk.alloc, device);
1359 return result;
1360 }
1361
1362 VKAPI_ATTR void VKAPI_CALL
radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
1364 {
1365 VK_FROM_HANDLE(radv_device, device, _device);
1366
1367 if (!device)
1368 return;
1369
1370 radv_device_finish_perf_counter(device);
1371
1372 if (device->gfx_init)
1373 radv_bo_destroy(device, NULL, device->gfx_init);
1374
1375 radv_device_finish_notifier(device);
1376 radv_device_finish_vs_prologs(device);
1377 if (device->ps_epilogs.ops)
1378 radv_shader_part_cache_finish(device, &device->ps_epilogs);
1379 radv_device_finish_border_color(device);
1380 radv_device_finish_vrs_image(device);
1381
1382 for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
1383 for (unsigned q = 0; q < device->queue_count[i]; q++)
1384 radv_queue_finish(&device->queues[i][q]);
1385 if (device->queue_count[i])
1386 vk_free(&device->vk.alloc, device->queues[i]);
1387 }
1388 if (device->private_sdma_queue != VK_NULL_HANDLE) {
1389 radv_queue_finish(device->private_sdma_queue);
1390 vk_free(&device->vk.alloc, device->private_sdma_queue);
1391 }
1392
1393 _mesa_hash_table_destroy(device->rt_handles, NULL);
1394
1395 radv_device_finish_meta(device);
1396
1397 radv_device_finish_memory_cache(device);
1398
1399 radv_destroy_shader_upload_queue(device);
1400
1401 for (unsigned i = 0; i < RADV_NUM_HW_CTX; i++) {
1402 if (device->hw_ctx[i])
1403 device->ws->ctx_destroy(device->hw_ctx[i]);
1404 }
1405
1406 mtx_destroy(&device->overallocation_mutex);
1407 simple_mtx_destroy(&device->ctx_roll_mtx);
1408 simple_mtx_destroy(&device->pstate_mtx);
1409 simple_mtx_destroy(&device->trace_mtx);
1410 simple_mtx_destroy(&device->rt_handles_mtx);
1411 simple_mtx_destroy(&device->compute_scratch_mtx);
1412 simple_mtx_destroy(&device->pso_cache_stats_mtx);
1413
1414 radv_destroy_shader_arenas(device);
1415 if (device->capture_replay_arena_vas)
1416 _mesa_hash_table_u64_destroy(device->capture_replay_arena_vas);
1417
1418 vk_device_finish(&device->vk);
1419 vk_free(&device->vk.alloc, device);
1420 }
1421
1422 bool
radv_get_memory_fd(struct radv_device *device, struct radv_device_memory *memory, int *pFD)
1424 {
1425 /* Set BO metadata for dedicated image allocations. We don't need it for import when the image
1426 * tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, but we set it anyway for foreign consumers.
1427 */
1428 if (memory->image) {
1429 struct radeon_bo_metadata metadata;
1430
1431 assert(memory->image->bindings[0].offset == 0);
1432 radv_init_metadata(device, memory->image, &metadata);
1433 device->ws->buffer_set_metadata(device->ws, memory->bo, &metadata);
1434 }
1435
1436 return device->ws->buffer_get_fd(device->ws, memory->bo, pFD);
1437 }
1438
1439 VKAPI_ATTR void VKAPI_CALL
radv_GetImageMemoryRequirements2(VkDevice _device, const VkImageMemoryRequirementsInfo2 *pInfo,
                                 VkMemoryRequirements2 *pMemoryRequirements)
1442 {
1443 VK_FROM_HANDLE(radv_device, device, _device);
1444 VK_FROM_HANDLE(radv_image, image, pInfo->image);
1445 const struct radv_physical_device *pdev = radv_device_physical(device);
1446 uint32_t alignment;
1447 uint64_t size;
1448
1449 const VkImagePlaneMemoryRequirementsInfo *plane_info =
1450 vk_find_struct_const(pInfo->pNext, IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO);
1451
1452 if (plane_info) {
1453 const uint32_t plane = radv_plane_from_aspect(plane_info->planeAspect);
1454
1455 size = image->planes[plane].surface.total_size;
1456 alignment = 1 << image->planes[plane].surface.alignment_log2;
1457 } else {
1458 size = image->size;
1459 alignment = image->alignment;
1460 }
1461
1462 pMemoryRequirements->memoryRequirements.memoryTypeBits =
1463 ((1u << pdev->memory_properties.memoryTypeCount) - 1u) & ~pdev->memory_types_32bit;
1464
1465 pMemoryRequirements->memoryRequirements.size = size;
1466 pMemoryRequirements->memoryRequirements.alignment = alignment;
1467
1468 vk_foreach_struct (ext, pMemoryRequirements->pNext) {
1469 switch (ext->sType) {
1470 case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
1471 VkMemoryDedicatedRequirements *req = (VkMemoryDedicatedRequirements *)ext;
1472 req->requiresDedicatedAllocation = image->shareable && image->vk.tiling != VK_IMAGE_TILING_LINEAR;
1473 req->prefersDedicatedAllocation = req->requiresDedicatedAllocation;
1474 break;
1475 }
1476 default:
1477 break;
1478 }
1479 }
1480 }
1481
1482 VKAPI_ATTR void VKAPI_CALL
radv_GetDeviceImageMemoryRequirements(VkDevice device, const VkDeviceImageMemoryRequirements *pInfo,
                                      VkMemoryRequirements2 *pMemoryRequirements)
1485 {
1486 UNUSED VkResult result;
1487 VkImage image;
1488
   /* Determining the image size/alignment requires creating a surface, which is complicated without
    * creating an image.
1491 * TODO: Avoid creating an image.
1492 */
1493 result =
1494 radv_image_create(device, &(struct radv_image_create_info){.vk_info = pInfo->pCreateInfo}, NULL, &image, true);
1495 assert(result == VK_SUCCESS);
1496
1497 VkImageMemoryRequirementsInfo2 info2 = {
1498 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
1499 .image = image,
1500 };
1501
1502 radv_GetImageMemoryRequirements2(device, &info2, pMemoryRequirements);
1503
1504 radv_DestroyImage(device, image, NULL);
1505 }
1506
1507 static uint32_t
radv_surface_max_layer_count(struct radv_image_view *iview)
1509 {
1510 return iview->vk.view_type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth
1511 : (iview->vk.base_array_layer + iview->vk.layer_count);
1512 }
1513
1514 unsigned
radv_get_dcc_max_uncompressed_block_size(const struct radv_device *device, const struct radv_image *image)
1516 {
1517 const struct radv_physical_device *pdev = radv_device_physical(device);
1518
1519 if (pdev->info.gfx_level < GFX10 && image->vk.samples > 1) {
1520 if (image->planes[0].surface.bpe == 1)
1521 return V_028C78_MAX_BLOCK_SIZE_64B;
1522 else if (image->planes[0].surface.bpe == 2)
1523 return V_028C78_MAX_BLOCK_SIZE_128B;
1524 }
1525
1526 return V_028C78_MAX_BLOCK_SIZE_256B;
1527 }
1528
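/* Fill the color buffer (CB) surface state for an image view: the immutable fields are derived
 * through ac_init_cb_surface() and the mutable ones (FMASK/CMASK/DCC, fast clears, base VA)
 * through ac_set_mutable_cb_surface_fields().
 */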
void
radv_initialise_color_surface(struct radv_device *device, struct radv_color_buffer_info *cb,
                              struct radv_image_view *iview)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   uint64_t va;
   const struct radv_image_plane *plane = &iview->image->planes[iview->plane_id];
   const struct radeon_surf *surf = &plane->surface;

   memset(cb, 0, sizeof(*cb));

   const unsigned num_layers =
      iview->image->vk.image_type == VK_IMAGE_TYPE_3D ? (iview->extent.depth - 1) : (iview->image->vk.array_layers - 1);

   const struct ac_cb_state cb_state = {
      .surf = surf,
      .format = vk_format_to_pipe_format(iview->vk.format),
      .width = vk_format_get_plane_width(iview->image->vk.format, iview->plane_id, iview->extent.width),
      .height = vk_format_get_plane_height(iview->image->vk.format, iview->plane_id, iview->extent.height),
      .first_layer = iview->vk.base_array_layer,
      .last_layer = radv_surface_max_layer_count(iview) - 1,
      .num_layers = num_layers,
      .num_samples = iview->image->vk.samples,
      .num_storage_samples = iview->image->vk.samples,
      .base_level = iview->vk.base_mip_level,
      .num_levels = iview->image->vk.mip_levels,
      .gfx10 =
         {
            .nbc_view = iview->nbc_view.valid ? &iview->nbc_view : NULL,
         },
   };

   ac_init_cb_surface(&pdev->info, &cb_state, &cb->ac);

   uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
   va = radv_image_get_va(iview->image, plane_id);

   const struct ac_mutable_cb_state mutable_cb_state = {
      .surf = surf,
      .cb = &cb->ac,
      .va = va,
      .base_level = iview->vk.base_mip_level,
      .num_samples = iview->image->vk.samples,
      .fmask_enabled = radv_image_has_fmask(iview->image),
      .cmask_enabled = radv_image_has_cmask(iview->image),
      .fast_clear_enabled = !(instance->debug_flags & RADV_DEBUG_NO_FAST_CLEARS),
      .tc_compat_cmask_enabled = radv_image_is_tc_compat_cmask(iview->image),
      .dcc_enabled = radv_dcc_enabled(iview->image, iview->vk.base_mip_level) &&
                     (pdev->info.gfx_level >= GFX11 || !iview->disable_dcc_mrt),
      .gfx10 =
         {
            .nbc_view = iview->nbc_view.valid ? &iview->nbc_view : NULL,
         },
   };

   ac_set_mutable_cb_surface_fields(&pdev->info, &mutable_cb_state, &cb->ac);
}

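/* Program a D16 depth surface whose HTILE buffer is used to store VRS rates (4-bit encoding);
 * the stencil plane is marked invalid.
 */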
void
radv_initialise_vrs_surface(struct radv_image *image, struct radv_buffer *htile_buffer, struct radv_ds_buffer_info *ds)
{
   const struct radeon_surf *surf = &image->planes[0].surface;

   assert(image->vk.format == VK_FORMAT_D16_UNORM);
   memset(ds, 0, sizeof(*ds));

   ds->ac.db_z_info = S_028038_FORMAT(V_028040_Z_16) | S_028038_SW_MODE(surf->u.gfx9.swizzle_mode) |
                      S_028038_ZRANGE_PRECISION(1) | S_028038_TILE_SURFACE_ENABLE(1);
   ds->ac.db_stencil_info = S_02803C_FORMAT(V_028044_STENCIL_INVALID);

   ds->ac.db_depth_size = S_02801C_X_MAX(image->vk.extent.width - 1) | S_02801C_Y_MAX(image->vk.extent.height - 1);

   ds->ac.u.gfx6.db_htile_data_base = radv_buffer_get_va(htile_buffer->bo) >> 8;
   ds->ac.u.gfx6.db_htile_surface =
      S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1) | S_028ABC_VRS_HTILE_ENCODING(V_028ABC_VRS_HTILE_4BIT_ENCODING);
}

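/* Fill the depth/stencil (DB) surface state for an image view, including HTILE and VRS state,
 * using the common ac_surface helpers.
 */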
void
radv_initialise_ds_surface(const struct radv_device *device, struct radv_ds_buffer_info *ds,
                           struct radv_image_view *iview, VkImageAspectFlags ds_aspects)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   unsigned level = iview->vk.base_mip_level;
   bool stencil_only = iview->image->vk.format == VK_FORMAT_S8_UINT;

   assert(vk_format_get_plane_count(iview->image->vk.format) == 1);

   memset(ds, 0, sizeof(*ds));

   uint32_t max_slice = radv_surface_max_layer_count(iview) - 1;

   /* Recommended value for better performance with 4x and 8x. */
   ds->db_render_override2 = S_028010_DECOMPRESS_Z_ON_FLUSH(iview->image->vk.samples >= 4) |
                             S_028010_CENTROID_COMPUTATION_MODE(pdev->info.gfx_level >= GFX10_3);

   const struct ac_ds_state ds_state = {
      .surf = &iview->image->planes[0].surface,
      .va = radv_image_get_va(iview->image, 0),
      .format = vk_format_to_pipe_format(iview->image->vk.format),
      .width = iview->image->vk.extent.width,
      .height = iview->image->vk.extent.height,
      .level = level,
      .num_levels = iview->image->vk.mip_levels,
      .num_samples = iview->image->vk.samples,
      .first_layer = iview->vk.base_array_layer,
      .last_layer = max_slice,
      .stencil_only = stencil_only,
      .z_read_only = !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT),
      .stencil_read_only = !(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT),
      .htile_enabled = radv_htile_enabled(iview->image, level),
      .htile_stencil_disabled = radv_image_tile_stencil_disabled(device, iview->image),
      .vrs_enabled = radv_image_has_vrs_htile(device, iview->image),
   };

   ac_init_ds_surface(&pdev->info, &ds_state, &ds->ac);

   const struct ac_mutable_ds_state mutable_ds_state = {
      .ds = &ds->ac,
      .format = vk_format_to_pipe_format(iview->image->vk.format),
      .tc_compat_htile_enabled = radv_htile_enabled(iview->image, level) && radv_image_is_tc_compat_htile(iview->image),
      .zrange_precision = true,
      .no_d16_compression = true,
   };

   ac_set_mutable_ds_surface_fields(&pdev->info, &mutable_ds_state, &ds->ac);

   if (pdev->info.gfx_level >= GFX11) {
      radv_gfx11_set_db_render_control(device, iview->image->vk.samples, &ds->db_render_control);
   }
}

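/* On GFX11, MAX_ALLOWED_TILES_IN_WAVE in DB_RENDER_CONTROL depends on the sample count and on
 * whether the GPU has dedicated VRAM.
 */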
void
radv_gfx11_set_db_render_control(const struct radv_device *device, unsigned num_samples, unsigned *db_render_control)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   unsigned max_allowed_tiles_in_wave = 0;

   if (pdev->info.has_dedicated_vram) {
      if (num_samples == 8)
         max_allowed_tiles_in_wave = 6;
      else if (num_samples == 4)
         max_allowed_tiles_in_wave = 13;
      else
         max_allowed_tiles_in_wave = 0;
   } else {
      if (num_samples == 8)
         max_allowed_tiles_in_wave = 7;
      else if (num_samples == 4)
         max_allowed_tiles_in_wave = 15;
      else
         max_allowed_tiles_in_wave = 0;
   }

   *db_render_control |= S_028000_MAX_ALLOWED_TILES_IN_WAVE(max_allowed_tiles_in_wave);
}

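/* Export a memory object as an opaque fd or dma-buf fd. */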
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetMemoryFdKHR(VkDevice _device, const VkMemoryGetFdInfoKHR *pGetFdInfo, int *pFD)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   VK_FROM_HANDLE(radv_device_memory, memory, pGetFdInfo->memory);

   assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR);

   /* At the moment, we support only the below handle types. */
   assert(pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT ||
          pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);

   bool ret = radv_get_memory_fd(device, memory, pFD);
   if (ret == false)
      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   return VK_SUCCESS;
}

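/* Match the given domains/flags against the exposed memory types, ignoring the flags in
 * ignore_flags.
 */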
static uint32_t
radv_compute_valid_memory_types_attempt(struct radv_physical_device *pdev, enum radeon_bo_domain domains,
                                        enum radeon_bo_flag flags, enum radeon_bo_flag ignore_flags)
{
   /* Don't count GTT/CPU as relevant:
    *
    * - We're not fully consistent between the two.
    * - Sometimes VRAM gets VRAM|GTT.
    */
   const enum radeon_bo_domain relevant_domains = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GDS | RADEON_DOMAIN_OA;
   uint32_t bits = 0;
   for (unsigned i = 0; i < pdev->memory_properties.memoryTypeCount; ++i) {
      if ((domains & relevant_domains) != (pdev->memory_domains[i] & relevant_domains))
         continue;

      if ((flags & ~ignore_flags) != (pdev->memory_flags[i] & ~ignore_flags))
         continue;

      bits |= 1u << i;
   }

   return bits;
}

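/* Compute the memory types an imported buffer can be bound to. Start by only requiring
 * NO_CPU_ACCESS and GTT_WC to match, then progressively ignore those flags as well until at
 * least one memory type matches.
 */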
static uint32_t
radv_compute_valid_memory_types(struct radv_physical_device *pdev, enum radeon_bo_domain domains,
                                enum radeon_bo_flag flags)
{
   enum radeon_bo_flag ignore_flags = ~(RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_GTT_WC);
   uint32_t bits = radv_compute_valid_memory_types_attempt(pdev, domains, flags, ignore_flags);

   if (!bits) {
      ignore_flags |= RADEON_FLAG_GTT_WC;
      bits = radv_compute_valid_memory_types_attempt(pdev, domains, flags, ignore_flags);
   }

   if (!bits) {
      ignore_flags |= RADEON_FLAG_NO_CPU_ACCESS;
      bits = radv_compute_valid_memory_types_attempt(pdev, domains, flags, ignore_flags);
   }

   /* Avoid 32-bit memory types for shared memory. */
   bits &= ~pdev->memory_types_32bit;

   return bits;
}
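
/* Report which memory types a dma-buf fd can be imported into, based on the domains/flags
 * queried from the winsys.
 */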
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetMemoryFdPropertiesKHR(VkDevice _device, VkExternalMemoryHandleTypeFlagBits handleType, int fd,
                              VkMemoryFdPropertiesKHR *pMemoryFdProperties)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   struct radv_physical_device *pdev = radv_device_physical(device);

   switch (handleType) {
   case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: {
      enum radeon_bo_domain domains;
      enum radeon_bo_flag flags;
      if (!device->ws->buffer_get_flags_from_fd(device->ws, fd, &domains, &flags))
         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);

      pMemoryFdProperties->memoryTypeBits = radv_compute_valid_memory_types(pdev, domains, flags);
      return VK_SUCCESS;
   }
   default:
      /* The valid usage section for this function says:
       *
       *    "handleType must not be one of the handle types defined as
       *    opaque."
       *
       * So opaque handle types fall into the default "unsupported" case.
       */
      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
   }
}

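/* Sample all requested time domains between two reads of the CPU monotonic clock and report a
 * maximum deviation derived from the sampling interval and the coarsest clock period.
 */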
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetCalibratedTimestampsKHR(VkDevice _device, uint32_t timestampCount,
                                const VkCalibratedTimestampInfoKHR *pTimestampInfos, uint64_t *pTimestamps,
                                uint64_t *pMaxDeviation)
{
#ifndef _WIN32
   VK_FROM_HANDLE(radv_device, device, _device);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   uint32_t clock_crystal_freq = pdev->info.clock_crystal_freq;
   int d;
   uint64_t begin, end;
   uint64_t max_clock_period = 0;

#ifdef CLOCK_MONOTONIC_RAW
   begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   begin = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   for (d = 0; d < timestampCount; d++) {
      switch (pTimestampInfos[d].timeDomain) {
      case VK_TIME_DOMAIN_DEVICE_KHR:
         pTimestamps[d] = device->ws->query_value(device->ws, RADEON_TIMESTAMP);
         uint64_t device_period = DIV_ROUND_UP(1000000, clock_crystal_freq);
         max_clock_period = MAX2(max_clock_period, device_period);
         break;
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
         pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
         max_clock_period = MAX2(max_clock_period, 1);
         break;

#ifdef CLOCK_MONOTONIC_RAW
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
         pTimestamps[d] = begin;
         break;
#endif
      default:
         pTimestamps[d] = 0;
         break;
      }
   }

#ifdef CLOCK_MONOTONIC_RAW
   end = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   end = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);

   return VK_SUCCESS;
#else
   return VK_ERROR_FEATURE_NOT_PRESENT;
#endif
}

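/* Switch the device to (or away from) the stable pstate that is used while profiling. */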
bool
radv_device_set_pstate(struct radv_device *device, bool enable)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   struct radeon_winsys *ws = device->ws;
   enum radeon_ctx_pstate pstate = enable ? instance->profile_pstate : RADEON_CTX_PSTATE_NONE;

   if (pdev->info.has_stable_pstate) {
      /* pstate is per-device; setting it for one ctx is sufficient.
       * We pick the first initialized one below. */
      for (unsigned i = 0; i < RADV_NUM_HW_CTX; i++)
         if (device->hw_ctx[i])
            return ws->ctx_set_pstate(device->hw_ctx[i], pstate) >= 0;
   }

   return true;
}

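/* Reference-count pstate requests so the profiling pstate stays enabled as long as at least one
 * acquirer holds it; protected by pstate_mtx.
 */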
bool
radv_device_acquire_performance_counters(struct radv_device *device)
{
   bool result = true;
   simple_mtx_lock(&device->pstate_mtx);

   if (device->pstate_cnt == 0) {
      result = radv_device_set_pstate(device, true);
      if (result)
         ++device->pstate_cnt;
   }

   simple_mtx_unlock(&device->pstate_mtx);
   return result;
}

void
radv_device_release_performance_counters(struct radv_device *device)
{
   simple_mtx_lock(&device->pstate_mtx);

   if (--device->pstate_cnt == 0)
      radv_device_set_pstate(device, false);

   simple_mtx_unlock(&device->pstate_mtx);
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_AcquireProfilingLockKHR(VkDevice _device, const VkAcquireProfilingLockInfoKHR *pInfo)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   bool result = radv_device_acquire_performance_counters(device);
   return result ? VK_SUCCESS : VK_ERROR_UNKNOWN;
}

VKAPI_ATTR void VKAPI_CALL
radv_ReleaseProfilingLockKHR(VkDevice _device)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   radv_device_release_performance_counters(device);
}

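/* Like radv_GetDeviceImageMemoryRequirements(), this creates a temporary image to compute the
 * subresource layout without a VkImage handle from the application.
 */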
VKAPI_ATTR void VKAPI_CALL
radv_GetDeviceImageSubresourceLayoutKHR(VkDevice device, const VkDeviceImageSubresourceInfoKHR *pInfo,
                                        VkSubresourceLayout2KHR *pLayout)
{
   UNUSED VkResult result;
   VkImage image;

   result =
      radv_image_create(device, &(struct radv_image_create_info){.vk_info = pInfo->pCreateInfo}, NULL, &image, true);
   assert(result == VK_SUCCESS);

   radv_GetImageSubresourceLayout2KHR(device, image, pInfo->pSubresource, pLayout);

   radv_DestroyImage(device, image, NULL);
}