xref: /aosp_15_r20/external/mesa3d/src/amd/vulkan/radv_device.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * SPDX-License-Identifier: MIT
9  */
10 
11 #ifndef RADV_DEVICE_H
12 #define RADV_DEVICE_H
13 
14 #include "ac_descriptors.h"
15 #include "ac_spm.h"
16 #include "ac_sqtt.h"
17 
18 #include "util/mesa-blake3.h"
19 
20 #include "radv_pipeline.h"
21 #include "radv_printf.h"
22 #include "radv_queue.h"
23 #include "radv_radeon_winsys.h"
24 #include "radv_rra.h"
25 #include "radv_shader.h"
26 
27 #include "vk_device.h"
28 #include "vk_texcompress_astc.h"
29 #include "vk_texcompress_etc2.h"
30 
31 #define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1)
32 
33 struct radv_image_view;
34 
/* Index of each layered device dispatch table. One entry per debug/tooling
 * layer that can intercept device entrypoints (the names mirror the members
 * of struct radv_layer_dispatch_tables), plus the base device table. */
35 enum radv_dispatch_table {
36    RADV_DEVICE_DISPATCH_TABLE,
37    RADV_ANNOTATE_DISPATCH_TABLE,
38    RADV_APP_DISPATCH_TABLE,
39    RADV_RGP_DISPATCH_TABLE,
40    RADV_RRA_DISPATCH_TABLE,
41    RADV_RMV_DISPATCH_TABLE,
42    RADV_CTX_ROLL_DISPATCH_TABLE,
   /* Number of tables above; keep last. */
43    RADV_DISPATCH_TABLE_COUNT,
44 };
45 
/* Per-layer device dispatch tables (annotate, app overrides, RGP/RRA/RMV
 * captures, context-roll logging). Stored on radv_device as layer_dispatch. */
46 struct radv_layer_dispatch_tables {
47    struct vk_device_dispatch_table annotate;
48    struct vk_device_dispatch_table app;
49    struct vk_device_dispatch_table rgp;
50    struct vk_device_dispatch_table rra;
51    struct vk_device_dispatch_table rmv;
52    struct vk_device_dispatch_table ctx_roll;
53 };
54 
/* Device-level toggles affecting generated pipelines. Stored on radv_device
 * as cache_key next to the blake3 cache_hash — presumably hashed into it so
 * cached pipelines are keyed on these settings; confirm against the cache
 * initialization code. */
55 struct radv_device_cache_key {
56    uint32_t disable_trunc_coord : 1;
57    uint32_t image_2d_view_of_3d : 1;
58    uint32_t mesh_shader_queries : 1;
59    uint32_t primitives_generated_query : 1;
60 };
61 
/* Forced variable-rate-shading fragment size, used with the RADV_FORCE_VRS
 * support on radv_device (force_vrs / notifier). 1x1 (= 0) means no
 * coarsening. */
62 enum radv_force_vrs {
63    RADV_FORCE_VRS_1x1 = 0,
64    RADV_FORCE_VRS_2x2,
65    RADV_FORCE_VRS_2x1,
66    RADV_FORCE_VRS_1x2,
67 };
68 
/* Background watcher thread state used for RADV_FORCE_VRS (see radv_device
 * notifier field). NOTE(review): `fd`/`watch` look like inotify handles —
 * confirm against the notifier implementation. */
69 struct radv_notifier {
70    int fd;
71    int watch;
   /* Set to ask the thread to exit. */
72    bool quit;
73    thrd_t thread;
74 };
75 
/* Pipelines, layouts and descriptor-set layouts for internal ("meta")
 * operations: clears, blits, copies, resolves, decompression, queries and
 * acceleration-structure builds. Pipelines are built on demand under `mtx`
 * and cached in `cache`. */
76 struct radv_meta_state {
77    VkAllocationCallbacks alloc;
78 
79    VkPipelineCache cache;
80    uint32_t initial_cache_entries;
81 
82    /*
83     * For on-demand pipeline creation, makes sure that
84     * only one thread tries to build a pipeline at the same time.
85     */
86    mtx_t mtx;
87 
88    /**
89     * Use array element `i` for images with `2^i` samples.
90     */
91    struct {
92       VkPipeline color_pipelines[NUM_META_FS_KEYS];
93    } color_clear[MAX_SAMPLES_LOG2][MAX_RTS];
94 
   /* Depth/stencil clear pipelines, indexed by log2(samples). */
95    struct {
96       VkPipeline depth_only_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
97       VkPipeline stencil_only_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
98       VkPipeline depthstencil_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
99 
100       VkPipeline depth_only_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
101       VkPipeline stencil_only_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
102       VkPipeline depthstencil_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
103    } ds_clear[MAX_SAMPLES_LOG2];
104 
105    VkPipelineLayout clear_color_p_layout;
106    VkPipelineLayout clear_depth_p_layout;
107    VkPipelineLayout clear_depth_unrestricted_p_layout;
108 
109    /* Optimized compute fast HTILE clear for stencil or depth only. */
110    VkPipeline clear_htile_mask_pipeline;
111    VkPipelineLayout clear_htile_mask_p_layout;
112    VkDescriptorSetLayout clear_htile_mask_ds_layout;
113 
114    /* Copy VRS into HTILE. */
115    VkPipeline copy_vrs_htile_pipeline;
116    VkPipelineLayout copy_vrs_htile_p_layout;
117    VkDescriptorSetLayout copy_vrs_htile_ds_layout;
118 
119    /* Clear DCC with comp-to-single. */
120    VkPipeline clear_dcc_comp_to_single_pipeline[2]; /* 0: 1x, 1: 2x/4x/8x */
121    VkPipelineLayout clear_dcc_comp_to_single_p_layout;
122    VkDescriptorSetLayout clear_dcc_comp_to_single_ds_layout;
123 
124    struct {
125       /** Pipeline that blits from a 1D image. */
126       VkPipeline pipeline_1d_src[NUM_META_FS_KEYS];
127 
128       /** Pipeline that blits from a 2D image. */
129       VkPipeline pipeline_2d_src[NUM_META_FS_KEYS];
130 
131       /** Pipeline that blits from a 3D image. */
132       VkPipeline pipeline_3d_src[NUM_META_FS_KEYS];
133 
134       VkPipeline depth_only_1d_pipeline;
135       VkPipeline depth_only_2d_pipeline;
136       VkPipeline depth_only_3d_pipeline;
137 
138       VkPipeline stencil_only_1d_pipeline;
139       VkPipeline stencil_only_2d_pipeline;
140       VkPipeline stencil_only_3d_pipeline;
141       VkPipelineLayout pipeline_layout;
142       VkDescriptorSetLayout ds_layout;
143    } blit;
144 
145    struct {
146       VkPipelineLayout p_layouts[5];
147       VkDescriptorSetLayout ds_layouts[5];
148       VkPipeline pipelines[5][NUM_META_FS_KEYS];
149 
150       VkPipeline depth_only_pipeline[5];
151 
152       VkPipeline stencil_only_pipeline[5];
153    } blit2d[MAX_SAMPLES_LOG2];
154 
   /* NOTE(review): itob/btoi presumably mean image-to-buffer and
    * buffer-to-image copy pipelines — confirm against the copy meta code. */
155    struct {
156       VkPipelineLayout img_p_layout;
157       VkDescriptorSetLayout img_ds_layout;
158       VkPipeline pipeline;
159       VkPipeline pipeline_3d;
160    } itob;
161    struct {
162       VkPipelineLayout img_p_layout;
163       VkDescriptorSetLayout img_ds_layout;
164       VkPipeline pipeline;
165       VkPipeline pipeline_3d;
166    } btoi;
   /* Separate path for R32G32B32 (96-bit) formats — presumably because they
    * need special handling; confirm. */
167    struct {
168       VkPipelineLayout img_p_layout;
169       VkDescriptorSetLayout img_ds_layout;
170       VkPipeline pipeline;
171    } btoi_r32g32b32;
   /* NOTE(review): itoi presumably = image-to-image copy; variants cover
    * 2D<->3D dimensionality combinations. */
172    struct {
173       VkPipelineLayout img_p_layout;
174       VkDescriptorSetLayout img_ds_layout;
175       VkPipeline pipeline[MAX_SAMPLES_LOG2];
176       VkPipeline pipeline_2d_3d;
177       VkPipeline pipeline_3d_2d;
178       VkPipeline pipeline_3d_3d;
179    } itoi;
180    struct {
181       VkPipelineLayout img_p_layout;
182       VkDescriptorSetLayout img_ds_layout;
183       VkPipeline pipeline;
184    } itoi_r32g32b32;
   /* NOTE(review): cleari presumably = image clear via compute — confirm. */
185    struct {
186       VkPipelineLayout img_p_layout;
187       VkDescriptorSetLayout img_ds_layout;
188       VkPipeline pipeline[MAX_SAMPLES_LOG2];
189       VkPipeline pipeline_3d;
190    } cleari;
191    struct {
192       VkPipelineLayout img_p_layout;
193       VkDescriptorSetLayout img_ds_layout;
194       VkPipeline pipeline;
195    } cleari_r32g32b32;
196    struct {
197       VkPipelineLayout p_layout;
198       VkDescriptorSetLayout ds_layout;
199       VkPipeline pipeline[MAX_SAMPLES_LOG2];
200    } fmask_copy;
201 
202    struct {
203       VkPipelineLayout p_layout;
204       VkPipeline pipeline[NUM_META_FS_KEYS];
205    } resolve;
206 
   /* Compute-based MSAA resolves: color (rc), depth and stencil, with
    * average/min/max modes, indexed by log2(samples). */
207    struct {
208       VkDescriptorSetLayout ds_layout;
209       VkPipelineLayout p_layout;
210       struct {
211          VkPipeline pipeline;
212          VkPipeline i_pipeline;
213          VkPipeline srgb_pipeline;
214       } rc[MAX_SAMPLES_LOG2];
215 
216       VkPipeline depth_zero_pipeline;
217       struct {
218          VkPipeline average_pipeline;
219          VkPipeline max_pipeline;
220          VkPipeline min_pipeline;
221       } depth[MAX_SAMPLES_LOG2];
222 
223       VkPipeline stencil_zero_pipeline;
224       struct {
225          VkPipeline max_pipeline;
226          VkPipeline min_pipeline;
227       } stencil[MAX_SAMPLES_LOG2];
228    } resolve_compute;
229 
   /* Fragment-shader-based MSAA resolves, mirroring resolve_compute. */
230    struct {
231       VkDescriptorSetLayout ds_layout;
232       VkPipelineLayout p_layout;
233 
234       struct {
235          VkPipeline pipeline[NUM_META_FS_KEYS];
236       } rc[MAX_SAMPLES_LOG2];
237 
238       VkPipeline depth_zero_pipeline;
239       struct {
240          VkPipeline average_pipeline;
241          VkPipeline max_pipeline;
242          VkPipeline min_pipeline;
243       } depth[MAX_SAMPLES_LOG2];
244 
245       VkPipeline stencil_zero_pipeline;
246       struct {
247          VkPipeline max_pipeline;
248          VkPipeline min_pipeline;
249       } stencil[MAX_SAMPLES_LOG2];
250    } resolve_fragment;
251 
252    struct {
253       VkPipelineLayout p_layout;
254       VkPipeline decompress_pipeline[MAX_SAMPLES_LOG2];
255    } depth_decomp;
256 
257    VkDescriptorSetLayout expand_depth_stencil_compute_ds_layout;
258    VkPipelineLayout expand_depth_stencil_compute_p_layout;
259    VkPipeline expand_depth_stencil_compute_pipeline;
260 
   /* Color metadata decompression (CMASK/FMASK/DCC). */
261    struct {
262       VkPipelineLayout p_layout;
263       VkPipeline cmask_eliminate_pipeline;
264       VkPipeline fmask_decompress_pipeline;
265       VkPipeline dcc_decompress_pipeline;
266 
267       VkDescriptorSetLayout dcc_decompress_compute_ds_layout;
268       VkPipelineLayout dcc_decompress_compute_p_layout;
269       VkPipeline dcc_decompress_compute_pipeline;
270    } fast_clear_flush;
271 
   /* Buffer fill/copy pipelines. */
272    struct {
273       VkPipelineLayout fill_p_layout;
274       VkPipelineLayout copy_p_layout;
275       VkPipeline fill_pipeline;
276       VkPipeline copy_pipeline;
277    } buffer;
278 
   /* Compute pipelines used to resolve the various query types. */
279    struct {
280       VkDescriptorSetLayout ds_layout;
281       VkPipelineLayout p_layout;
282       VkPipeline occlusion_query_pipeline;
283       VkPipeline pipeline_statistics_query_pipeline;
284       VkPipeline tfb_query_pipeline;
285       VkPipeline timestamp_query_pipeline;
286       VkPipeline pg_query_pipeline;
287       VkPipeline ms_prim_gen_query_pipeline;
288    } query;
289 
290    struct {
291       VkDescriptorSetLayout ds_layout;
292       VkPipelineLayout p_layout;
293       VkPipeline pipeline[MAX_SAMPLES_LOG2];
294    } fmask_expand;
295 
   /* NOTE(review): 32 DCC-retile pipeline variants — keying not visible in
    * this header; confirm against the dcc_retile meta code. */
296    struct {
297       VkDescriptorSetLayout ds_layout;
298       VkPipelineLayout p_layout;
299       VkPipeline pipeline[32];
300    } dcc_retile;
301 
   /* Ray-tracing acceleration-structure build passes (leaf encode, Morton
    * codes, LBVH, PLOC, encode/compact, header, update, copy) plus a shared
    * radix sort and a null acceleration structure. */
302    struct {
303       VkPipelineLayout leaf_p_layout;
304       VkPipeline leaf_pipeline;
305       VkPipeline leaf_updateable_pipeline;
306       VkPipelineLayout morton_p_layout;
307       VkPipeline morton_pipeline;
308       VkPipelineLayout lbvh_main_p_layout;
309       VkPipeline lbvh_main_pipeline;
310       VkPipelineLayout lbvh_generate_ir_p_layout;
311       VkPipeline lbvh_generate_ir_pipeline;
312       VkPipelineLayout ploc_p_layout;
313       VkPipeline ploc_pipeline;
314       VkPipelineLayout encode_p_layout;
315       VkPipeline encode_pipeline;
316       VkPipeline encode_compact_pipeline;
317       VkPipelineLayout header_p_layout;
318       VkPipeline header_pipeline;
319       VkPipelineLayout update_p_layout;
320       VkPipeline update_pipeline;
321       VkPipelineLayout copy_p_layout;
322       VkPipeline copy_pipeline;
323 
324       struct radix_sort_vk *radix_sort;
325 
326       struct {
327          VkBuffer buffer;
328          VkDeviceMemory memory;
329          VkAccelerationStructureKHR accel_struct;
330       } null;
331    } accel_struct_build;
332 
   /* Software ETC2/ASTC decode state (shared vk_texcompress helpers). */
333    struct vk_texcompress_etc2_state etc_decode;
334 
335    struct vk_texcompress_astc_state *astc_decode;
336 
337    struct {
338       VkDescriptorSetLayout ds_layout;
339       VkPipelineLayout p_layout;
340    } dgc_prepare;
341 };
342 
/* State for the memory trace (RMV) capture. */
343 struct radv_memory_trace_data {
344    /* ID of the PTE update event in ftrace data */
345    uint16_t ftrace_update_ptes_id;
346 
   /* NOTE(review): pipe_fds presumably holds one fd per CPU (num_cpus
    * entries) for reading ftrace data — confirm against the RMV code. */
347    uint32_t num_cpus;
348    int *pipe_fds;
349 };
350 
/* A CPU-mapped BO holding SQTT queue-event timestamps; instances are chained
 * through `list` (see radv_device.sqtt_timestamp, guarded by
 * sqtt_timestamp_mtx). */
351 struct radv_sqtt_timestamp {
   /* CPU mapping of `bo`; `offset` is the current write position within
    * `size` bytes. */
352    uint8_t *map;
353    unsigned offset;
354    uint64_t size;
355    struct radeon_winsys_bo *bo;
356    struct list_head list;
357 };
358 
359 #define RADV_BORDER_COLOR_COUNT       4096
360 #define RADV_BORDER_COLOR_BUFFER_SIZE (sizeof(VkClearColorValue) * RADV_BORDER_COLOR_COUNT)
361 
/* Palette of custom border colors: per-slot occupancy plus a GPU buffer of
 * RADV_BORDER_COLOR_COUNT VkClearColorValue entries written at sampler
 * creation (presumably for VK_EXT_custom_border_color — confirm). */
362 struct radv_device_border_color_data {
363    bool used[RADV_BORDER_COLOR_COUNT];
364 
365    struct radeon_winsys_bo *bo;
366    VkClearColorValue *colors_gpu_ptr;
367 
368    /* Mutex is required to guarantee vkCreateSampler thread safety
369     * given that we are writing to a buffer and checking color occupation */
370    mtx_t mutex;
371 };
372 
/* Pipeline cache hit/miss counters, kept per pipeline type on radv_device
 * (pso_cache_stats, guarded by pso_cache_stats_mtx). */
373 struct radv_pso_cache_stats {
374    uint32_t hits;
375    uint32_t misses;
376 };
377 
/* Logical device state. Owns the per-queue-family queues, the meta-pipeline
 * state, shader memory arenas and upload machinery, and the various
 * debug/profiling facilities (SQTT, SPM, RRA, RMV, context-roll logging). */
378 struct radv_device {
379    struct vk_device vk;
380 
   /* Winsys abstraction used to talk to the GPU. */
381    struct radeon_winsys *ws;
382 
383    struct radv_layer_dispatch_tables layer_dispatch;
384 
   /* One HW context per priority level (RADV_NUM_HW_CTX =
    * RADEON_CTX_PRIORITY_REALTIME + 1). */
385    struct radeon_winsys_ctx *hw_ctx[RADV_NUM_HW_CTX];
386    struct radv_meta_state meta_state;
387 
388    struct radv_queue *queues[RADV_MAX_QUEUE_FAMILIES];
389    int queue_count[RADV_MAX_QUEUE_FAMILIES];
390 
   /* NOTE(review): pbb presumably = primitive binning — confirm. */
391    bool pbb_allowed;
392    uint32_t scratch_waves;
393    uint32_t dispatch_initiator;
394    uint32_t dispatch_initiator_task;
395 
396    /* MSAA sample locations.
397     * The first index is the sample index.
398     * The second index is the coordinate: X, Y. */
399    float sample_locations_1x[1][2];
400    float sample_locations_2x[2][2];
401    float sample_locations_4x[4][2];
402    float sample_locations_8x[8][2];
403 
404    /* GFX7 and later */
405    uint32_t gfx_init_size_dw;
406    struct radeon_winsys_bo *gfx_init;
407 
408    struct radeon_winsys_bo *trace_bo;
409    struct radv_trace_data *trace_data;
410 
411    /* Whether to keep shader debug info, for debugging. */
412    bool keep_shader_info;
413 
414    /* Backup in-memory cache to be used if the app doesn't provide one */
415    struct vk_pipeline_cache *mem_cache;
416 
417    /*
418     * use different counters so MSAA MRTs get consecutive surface indices,
419     * even if MASK is allocated in between.
420     */
421    uint32_t image_mrt_offset_counter;
422    uint32_t fmask_mrt_offset_counter;
423 
   /* Shader memory arenas and free lists, guarded by shader_arena_mutex. */
424    struct list_head shader_arenas;
425    struct hash_table_u64 *capture_replay_arena_vas;
426    unsigned shader_arena_shift;
427    uint8_t shader_free_list_mask;
428    struct radv_shader_free_list shader_free_list;
429    struct radv_shader_free_list capture_replay_free_list;
430    struct list_head shader_block_obj_pool;
431    mtx_t shader_arena_mutex;
432 
   /* Shader DMA-upload machinery; completion is presumably tracked with
    * shader_upload_sem / shader_upload_seq — confirm against radv_shader.c. */
433    mtx_t shader_upload_hw_ctx_mutex;
434    struct radeon_winsys_ctx *shader_upload_hw_ctx;
435    VkSemaphore shader_upload_sem;
436    uint64_t shader_upload_seq;
437    struct list_head shader_dma_submissions;
438    mtx_t shader_dma_submission_list_mutex;
439    cnd_t shader_dma_submission_list_cond;
440 
441    /* Whether to DMA shaders to invisible VRAM or to upload directly through BAR. */
442    bool shader_use_invisible_vram;
443 
444    /* Whether to inline the compute dispatch size in user sgprs. */
445    bool load_grid_size_from_user_sgpr;
446 
447    /* Whether the driver uses a global BO list. */
448    bool use_global_bo_list;
449 
450    /* Whether anisotropy is forced with RADV_TEX_ANISO (-1 is disabled). */
451    int force_aniso;
452 
453    /* Always disable TRUNC_COORD. */
454    bool disable_trunc_coord;
455 
456    struct radv_device_border_color_data border_color_data;
457 
458    /* Thread trace. */
459    struct ac_sqtt sqtt;
460    bool sqtt_enabled;
461    bool sqtt_triggered;
462 
463    /* SQTT timestamps for queue events. */
464    simple_mtx_t sqtt_timestamp_mtx;
465    struct radv_sqtt_timestamp sqtt_timestamp;
466 
467    /* SQTT timed cmd buffers. */
468    simple_mtx_t sqtt_command_pool_mtx;
469    struct vk_command_pool *sqtt_command_pool[2];
470 
471    /* Memory trace. */
472    struct radv_memory_trace_data memory_trace;
473 
474    /* SPM. */
475    struct ac_spm spm;
476 
477    /* Radeon Raytracing Analyzer trace. */
478    struct radv_rra_trace_data rra_trace;
479 
   /* Context-roll logging output, guarded by ctx_roll_mtx. */
480    FILE *ctx_roll_file;
481    simple_mtx_t ctx_roll_mtx;
482 
483    /* Trap handler. */
484    struct radv_shader *trap_handler_shader;
485    struct radeon_winsys_bo *tma_bo; /* Trap Memory Address */
486    uint32_t *tma_ptr;
487 
488    /* Overallocation. */
489    bool overallocation_disallowed;
490    uint64_t allocated_memory_size[VK_MAX_MEMORY_HEAPS];
491    mtx_t overallocation_mutex;
492 
493    /* RADV_FORCE_VRS. */
494    struct radv_notifier notifier;
495    enum radv_force_vrs force_vrs;
496 
497    /* Depth image for VRS when not bound by the app. */
498    struct {
499       struct radv_image *image;
500       struct radv_buffer *buffer; /* HTILE */
501       struct radv_device_memory *mem;
502    } vrs;
503 
504    /* Prime blit sdma queue */
505    struct radv_queue *private_sdma_queue;
506 
   /* NOTE(review): 816 = number of precomputed instance-rate VS prolog
    * variants; the derivation is not visible in this header — confirm. */
507    struct radv_shader_part_cache vs_prologs;
508    struct radv_shader_part *simple_vs_prologs[MAX_VERTEX_ATTRIBS];
509    struct radv_shader_part *instance_rate_vs_prologs[816];
510 
511    struct radv_shader_part_cache ps_epilogs;
512 
513    simple_mtx_t trace_mtx;
514 
515    /* Whether per-vertex VRS is forced. */
516    bool force_vrs_enabled;
517 
   /* Power-state refcount, guarded by pstate_mtx (see radv_device_set_pstate). */
518    simple_mtx_t pstate_mtx;
519    unsigned pstate_cnt;
520 
521    /* BO to contain some performance counter helpers:
522     * - A lock for profiling cmdbuffers.
523     * - a temporary fence for the end query synchronization.
524     * - the pass to use for profiling. (as an array of bools)
525     */
526    struct radeon_winsys_bo *perf_counter_bo;
527 
528    /* Interleaved lock/unlock commandbuffers for perfcounter passes. */
529    struct radeon_cmdbuf **perf_counter_lock_cs;
530 
531    bool uses_shadow_regs;
532 
   /* Ray-tracing shader handles, guarded by rt_handles_mtx. */
533    struct hash_table *rt_handles;
534    simple_mtx_t rt_handles_mtx;
535 
536    struct radv_printf_data printf;
537 
   /* Device settings folded into a blake3 hash (see radv_device_cache_key). */
538    struct radv_device_cache_key cache_key;
539    blake3_hash cache_hash;
540 
541    /* Not NULL if a GPU hang report has been generated for VK_EXT_device_fault. */
542    char *gpu_hang_report;
543 
544    /* For indirect compute pipeline binds with DGC only. */
545    simple_mtx_t compute_scratch_mtx;
546    uint32_t compute_scratch_size_per_wave;
547    uint32_t compute_scratch_waves;
548 
549    /* PSO cache stats */
550    simple_mtx_t pso_cache_stats_mtx;
551    struct radv_pso_cache_stats pso_cache_stats[RADV_PIPELINE_TYPE_COUNT];
552 };
553 
554 VK_DEFINE_HANDLE_CASTS(radv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
555 
556 static inline struct radv_physical_device *
radv_device_physical(const struct radv_device * dev)557 radv_device_physical(const struct radv_device *dev)
558 {
559    return (struct radv_physical_device *)dev->vk.physical;
560 }
561 
562 static inline bool
radv_uses_device_generated_commands(const struct radv_device * device)563 radv_uses_device_generated_commands(const struct radv_device *device)
564 {
565    return device->vk.enabled_features.deviceGeneratedCommandsNV || device->vk.enabled_features.deviceGeneratedCompute;
566 }
567 
568 static inline bool
radv_uses_primitives_generated_query(const struct radv_device * device)569 radv_uses_primitives_generated_query(const struct radv_device *device)
570 {
571    return device->vk.enabled_features.primitivesGeneratedQuery ||
572           device->vk.enabled_features.primitivesGeneratedQueryWithRasterizerDiscard ||
573           device->vk.enabled_features.primitivesGeneratedQueryWithNonZeroStreams;
574 }
575 
576 static inline bool
radv_uses_image_float32_atomics(const struct radv_device * device)577 radv_uses_image_float32_atomics(const struct radv_device *device)
578 {
579    return device->vk.enabled_features.shaderImageFloat32Atomics ||
580           device->vk.enabled_features.sparseImageFloat32Atomics ||
581           device->vk.enabled_features.shaderImageFloat32AtomicMinMax ||
582           device->vk.enabled_features.sparseImageFloat32AtomicMinMax;
583 }
584 
585 VkResult radv_device_init_vrs_state(struct radv_device *device);
586 
587 unsigned radv_get_default_max_sample_dist(int log_samples);
588 
589 void radv_emit_default_sample_locations(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs,
590                                         int nr_samples);
591 
592 bool radv_get_memory_fd(struct radv_device *device, struct radv_device_memory *memory, int *pFD);
593 
594 unsigned radv_get_dcc_max_uncompressed_block_size(const struct radv_device *device, const struct radv_image *image);
595 
/* Hardware color-buffer surface state (filled by
 * radv_initialise_color_surface). */
596 struct radv_color_buffer_info {
597    struct ac_cb_surface ac;
598 };
599 
/* Hardware depth/stencil surface state (filled by radv_initialise_ds_surface
 * or radv_initialise_vrs_surface), plus extra DB register values. */
600 struct radv_ds_buffer_info {
601    struct ac_ds_surface ac;
602 
603    uint32_t db_render_override2;
604    uint32_t db_render_control;
605 };
606 
607 void radv_initialise_color_surface(struct radv_device *device, struct radv_color_buffer_info *cb,
608                                    struct radv_image_view *iview);
609 
610 void radv_initialise_vrs_surface(struct radv_image *image, struct radv_buffer *htile_buffer,
611                                  struct radv_ds_buffer_info *ds);
612 
613 
614 void radv_initialise_ds_surface(const struct radv_device *device, struct radv_ds_buffer_info *ds,
615                                 struct radv_image_view *iview, VkImageAspectFlags ds_aspects);
616 
617 void radv_gfx11_set_db_render_control(const struct radv_device *device, unsigned num_samples,
618                                       unsigned *db_render_control);
619 
620 bool radv_device_set_pstate(struct radv_device *device, bool enable);
621 
622 bool radv_device_acquire_performance_counters(struct radv_device *device);
623 
624 void radv_device_release_performance_counters(struct radv_device *device);
625 
626 #endif /* RADV_DEVICE_H */
627