/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "shaders/query.h"
#include "vulkan/vulkan_core.h"
#include "agx_helpers.h"
#include "agx_linker.h"
#include "agx_nir_lower_gs.h"
#include "agx_pack.h"
#include "agx_scratch.h"
#include "agx_tilebuffer.h"
#include "hk_buffer.h"
#include "hk_cmd_buffer.h"
#include "hk_descriptor_set.h"
#include "hk_device.h"
#include "hk_entrypoints.h"
#include "hk_physical_device.h"
#include "hk_shader.h"
#include "pool.h"

void
hk_cmd_buffer_begin_compute(struct hk_cmd_buffer *cmd,
                            const VkCommandBufferBeginInfo *pBeginInfo)
{
}

void
hk_cmd_invalidate_compute_state(struct hk_cmd_buffer *cmd)
{
   memset(&cmd->state.cs, 0, sizeof(cmd->state.cs));
}

void
hk_cmd_bind_compute_shader(struct hk_cmd_buffer *cmd,
                           struct hk_api_shader *shader)
{
   cmd->state.cs.shader = shader;
}

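/*
 * Flush the CDM caches after a compute launch by emitting a CDM_BARRIER
 * packet. The caller must have reserved at least AGX_CDM_BARRIER_LENGTH
 * bytes in the control stream (asserted below).
 */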
void
hk_cdm_cache_flush(struct hk_device *dev, struct hk_cs *cs)
{
   assert(cs->type == HK_CS_CDM);
   assert(cs->current + AGX_CDM_BARRIER_LENGTH < cs->end &&
          "caller must ensure space");

   uint8_t *out = cs->current;

   agx_push(out, CDM_BARRIER, cfg) {
      cfg.unk_5 = true;
      cfg.unk_6 = true;
      cfg.unk_8 = true;
      // cfg.unk_11 = true;
      // cfg.unk_20 = true;
      if (dev->dev.params.num_clusters_total > 1) {
         // cfg.unk_24 = true;
         if (dev->dev.params.gpu_generation == 13) {
            cfg.unk_4 = true;
            // cfg.unk_26 = true;
         }
      }

      /* With multiple launches in the same CDM stream, we can hit cache
       * coherency (or maybe synchronization?) issues. We hit this with blits,
       * which need the PBE cache flushed and the texture cache invalidated
       * between dispatches. Until we know exactly what each bit means, just
       * set all of these after every launch to be safe. We can revisit once
       * we figure out what the bits mean.
       */
      cfg.unk_0 = true;
      cfg.unk_1 = true;
      cfg.unk_2 = true;
      cfg.usc_cache_inval = true;
      cfg.unk_4 = true;
      cfg.unk_5 = true;
      cfg.unk_6 = true;
      cfg.unk_7 = true;
      cfg.unk_8 = true;
      cfg.unk_9 = true;
      cfg.unk_10 = true;
      cfg.unk_11 = true;
      cfg.unk_12 = true;
      cfg.unk_13 = true;
      cfg.unk_14 = true;
      cfg.unk_15 = true;
      cfg.unk_16 = true;
      cfg.unk_17 = true;
      cfg.unk_18 = true;
      cfg.unk_19 = true;
   }

   cs->current = out;
}

/*
 * Enqueue workgroups to a given CDM control stream with prepared USC words.
 * This does not interact with any global state, so it is suitable for
 * internal dispatches that do not save/restore state. That can be simpler /
 * lower overhead than vk_meta for special operations that logically operate
 * as graphics.
 */
void
hk_dispatch_with_usc(struct hk_device *dev, struct hk_cs *cs,
                     struct hk_shader *s, uint32_t usc, struct hk_grid grid,
                     struct hk_grid local_size)
{
   assert(cs->current + 0x2000 < cs->end && "should have ensured space");
   uint8_t *out = cs->current;

   agx_push(out, CDM_LAUNCH_WORD_0, cfg) {
      if (grid.indirect)
         cfg.mode = AGX_CDM_MODE_INDIRECT_GLOBAL;
      else
         cfg.mode = AGX_CDM_MODE_DIRECT;

      /* For now, always bind the txf sampler and nothing else */
      cfg.sampler_state_register_count = 1;

      cfg.uniform_register_count = s->b.info.push_count;
      cfg.preshader_register_count = s->b.info.nr_preamble_gprs;
   }

   agx_push(out, CDM_LAUNCH_WORD_1, cfg) {
      cfg.pipeline = usc;
   }

   /* Added in G14X */
   if (dev->dev.params.gpu_generation >= 14 &&
       dev->dev.params.num_clusters_total > 1) {

      agx_push(out, CDM_UNK_G14X, cfg)
         ;
   }

   assert(!local_size.indirect);

   if (grid.indirect) {
      agx_push(out, CDM_INDIRECT, cfg) {
         cfg.address_hi = grid.ptr >> 32;
         cfg.address_lo = grid.ptr & BITFIELD64_MASK(32);
      }
   } else {
      agx_push(out, CDM_GLOBAL_SIZE, cfg) {
         cfg.x = grid.count[0];
         cfg.y = grid.count[1];
         cfg.z = grid.count[2];
      }
   }

   agx_push(out, CDM_LOCAL_SIZE, cfg) {
      cfg.x = local_size.count[0];
      cfg.y = local_size.count[1];
      cfg.z = local_size.count[2];
   }

   cs->current = out;
   hk_cdm_cache_flush(dev, cs);
}
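
/*
 * Example (illustrative sketch, not driver code): dispatching a single
 * workgroup of a meta kernel internally, mirroring the pattern used by
 * dispatch() below for the invocation-statistic update. `build_nir` and
 * `my_params` are placeholders for the kernel's NIR builder and argument
 * struct.
 *
 *    struct hk_shader *s = hk_meta_kernel(dev, build_nir, NULL, 0);
 *    uint64_t params = hk_pool_upload(cmd, &my_params, sizeof(my_params), 8);
 *    uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params));
 *
 *    hk_ensure_cs_has_space(cmd, cs, 0x2000);
 *    hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1));
 */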
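/*
 * Common path for vkCmdDispatch*. If the compute-shader-invocations pipeline
 * statistic is being collected, first launch a small meta kernel that credits
 * the statistic with the dispatched invocations (group count times the local
 * workgroup size), then launch the application's dispatch.
 */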
static void
dispatch(struct hk_cmd_buffer *cmd, struct hk_grid grid)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_shader *s = hk_only_variant(cmd->state.cs.shader);
   struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true /* compute */);
   if (!cs)
      return;

   uint64_t stat = hk_pipeline_stat_addr(
      cmd, VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);

   if (stat) {
      uint32_t local_size_threads = s->info.cs.local_size[0] *
                                    s->info.cs.local_size[1] *
                                    s->info.cs.local_size[2];

      struct libagx_cs_invocation_params p = {
         .grid = cmd->state.cs.descriptors.root.cs.group_count_addr,
         .local_size_threads = local_size_threads,
         .statistic = stat,
      };

      struct hk_shader *s =
         hk_meta_kernel(dev, agx_nir_increment_cs_invocations, NULL, 0);

      uint64_t params = hk_pool_upload(cmd, &p, sizeof(p), 8);
      uint32_t usc =
         hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params));

      hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1));
   }

   hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
   hk_dispatch(cmd, cs, s, grid);
}

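/*
 * Direct dispatch entrypoint. The base workgroup is recorded in the root
 * descriptor, and the group count is also uploaded to memory so shaders can
 * always read it through group_count_addr (see the comment below).
 */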
VKAPI_ATTR void VKAPI_CALL
hk_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t baseGroupX,
                   uint32_t baseGroupY, uint32_t baseGroupZ,
                   uint32_t groupCountX, uint32_t groupCountY,
                   uint32_t groupCountZ)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_descriptor_state *desc = &cmd->state.cs.descriptors;
   if (desc->push_dirty)
      hk_cmd_buffer_flush_push_descriptors(cmd, desc);

   desc->root.cs.base_group[0] = baseGroupX;
   desc->root.cs.base_group[1] = baseGroupY;
   desc->root.cs.base_group[2] = baseGroupZ;

   /* We don't want to key the shader to whether we're indirectly dispatching,
    * so treat everything as indirect.
    */
   VkDispatchIndirectCommand group_count = {
      .x = groupCountX,
      .y = groupCountY,
      .z = groupCountZ,
   };

   desc->root.cs.group_count_addr =
      hk_pool_upload(cmd, &group_count, sizeof(group_count), 8);

   dispatch(cmd, hk_grid(groupCountX, groupCountY, groupCountZ));
}

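/*
 * Indirect dispatch entrypoint. The VkDispatchIndirectCommand is consumed in
 * place: its GPU address serves both as the root group count pointer and as
 * the CDM indirect launch address.
 */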
VKAPI_ATTR void VKAPI_CALL
hk_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                       VkDeviceSize offset)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
   struct hk_descriptor_state *desc = &cmd->state.cs.descriptors;
   if (desc->push_dirty)
      hk_cmd_buffer_flush_push_descriptors(cmd, desc);

   desc->root.cs.base_group[0] = 0;
   desc->root.cs.base_group[1] = 0;
   desc->root.cs.base_group[2] = 0;

   uint64_t dispatch_addr = hk_buffer_address(buffer, offset);
   assert(dispatch_addr != 0);

   desc->root.cs.group_count_addr = dispatch_addr;
   dispatch(cmd, hk_grid_indirect(dispatch_addr));
}
250