/*
 * Copyright © 2023 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "xe/anv_batch_chain.h"

#include "anv_private.h"
#include "anv_measure.h"
#include "common/intel_bind_timeline.h"
#include "perf/intel_perf.h"

#include "drm-uapi/xe_drm.h"

#define TYPE_SIGNAL true
#define TYPE_WAIT false
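
/* Translate a vk_sync (which must be backed by a DRM syncobj) into the
 * drm_xe_sync form expected by DRM_IOCTL_XE_EXEC. A non-zero value selects a
 * timeline syncobj point; "signal" picks between a signal and a wait entry.
 */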
struct drm_xe_sync
vk_sync_to_drm_xe_sync(struct vk_sync *vk_sync, uint64_t value, bool signal)
{
   const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(vk_sync);
   assert(syncobj);

   struct drm_xe_sync drm_sync = {
      .type = value ? DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ :
                      DRM_XE_SYNC_TYPE_SYNCOBJ,
      .flags = signal ? DRM_XE_SYNC_FLAG_SIGNAL : 0,
      .handle = syncobj->syncobj,
      .timeline_value = value,
   };

   return drm_sync;
}
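
/* Build the drm_xe_sync array for a submission: the optional utrace signal,
 * the caller's waits and signals, any extra syncs, the per-queue sync and,
 * always last, a wait on the device's VM bind timeline so the batch only
 * runs once all pending binds have completed. The array is allocated with
 * the device allocator and must be freed by the caller.
 */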
static VkResult
xe_exec_process_syncs(struct anv_queue *queue,
                      uint32_t wait_count, const struct vk_sync_wait *waits,
                      uint32_t signal_count, const struct vk_sync_signal *signals,
                      uint32_t extra_sync_count, const struct drm_xe_sync *extra_syncs,
                      struct anv_utrace_submit *utrace_submit,
                      bool is_companion_rcs_queue,
                      struct drm_xe_sync **ret, uint32_t *ret_count)
{
   struct anv_device *device = queue->device;
   /* Signal the utrace sync only if it doesn't have a batch. Otherwise it's
    * the utrace batch that should signal its own sync.
    */
   const bool has_utrace_sync =
      utrace_submit &&
      util_dynarray_num_elements(&utrace_submit->base.batch_bos, struct anv_bo *) == 0;
   const uint32_t num_syncs = wait_count + signal_count + extra_sync_count +
                              (has_utrace_sync ? 1 : 0) +
                              ((queue->sync && !is_companion_rcs_queue) ? 1 : 0) +
                              1 /* vm bind sync */;
   struct drm_xe_sync *xe_syncs = vk_zalloc(&device->vk.alloc,
                                            sizeof(*xe_syncs) * num_syncs, 8,
                                            VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!xe_syncs)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint32_t count = 0;

   if (has_utrace_sync) {
      xe_syncs[count++] = vk_sync_to_drm_xe_sync(utrace_submit->base.signal.sync,
                                                 utrace_submit->base.signal.signal_value,
                                                 TYPE_SIGNAL);
   }

   for (uint32_t i = 0; i < wait_count; i++) {
      xe_syncs[count++] = vk_sync_to_drm_xe_sync(waits[i].sync,
                                                 waits[i].wait_value,
                                                 TYPE_WAIT);
   }

   for (uint32_t i = 0; i < signal_count; i++) {
      xe_syncs[count++] = vk_sync_to_drm_xe_sync(signals[i].sync,
                                                 signals[i].signal_value,
                                                 TYPE_SIGNAL);
   }

   for (uint32_t i = 0; i < extra_sync_count; i++)
      xe_syncs[count++] = extra_syncs[i];

   if (queue->sync && !is_companion_rcs_queue)
      xe_syncs[count++] = vk_sync_to_drm_xe_sync(queue->sync, 0, TYPE_SIGNAL);

   /* vm bind sync */
   xe_syncs[count++] = (struct drm_xe_sync) {
      .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
      .flags = 0 /* TYPE_WAIT */,
      .handle = intel_bind_timeline_get_syncobj(&device->bind_timeline),
      .timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline),
   };

   assert(count == num_syncs);
   *ret = xe_syncs;
   *ret_count = num_syncs;
   return VK_SUCCESS;
}
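
/* Submission debug output: print the batch address under INTEL_DEBUG=submit
 * and let anv_cmd_buffer_exec_batch_debug() handle the per-command-buffer
 * batch debugging.
 */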
static void
xe_exec_print_debug(struct anv_queue *queue, uint32_t cmd_buffer_count,
                    struct anv_cmd_buffer **cmd_buffers, struct anv_query_pool *perf_query_pool,
                    uint32_t perf_query_pass, struct drm_xe_exec *exec)
{
   if (INTEL_DEBUG(DEBUG_SUBMIT))
      fprintf(stderr, "Batch offset=0x%016"PRIx64" on queue %u\n",
              (uint64_t)exec->address, queue->vk.index_in_family);

   anv_cmd_buffer_exec_batch_debug(queue, cmd_buffer_count, cmd_buffers,
                                   perf_query_pool, perf_query_pass);
}
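
/* Submit a driver-internal anv_async_submit on its queue: build the sync
 * array (including the VM bind timeline wait), flush CPU caches where
 * needed and issue DRM_IOCTL_XE_EXEC on the submit's first batch BO.
 */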
VkResult
xe_queue_exec_async(struct anv_async_submit *submit,
                    uint32_t wait_count,
                    const struct vk_sync_wait *waits,
                    uint32_t signal_count,
                    const struct vk_sync_signal *signals)
{
   struct anv_queue *queue = submit->queue;
   struct anv_device *device = queue->device;
   STACK_ARRAY(struct drm_xe_sync, xe_syncs,
               wait_count + signal_count +
               ((submit->signal.sync != NULL) ? 1 : 0) +
               (queue->sync != NULL ? 1 : 0) +
               1);
   uint32_t n_syncs = 0;

   for (uint32_t i = 0; i < wait_count; i++) {
      xe_syncs[n_syncs++] = vk_sync_to_drm_xe_sync(waits[i].sync,
                                                   waits[i].wait_value,
                                                   TYPE_WAIT);
   }
   for (uint32_t i = 0; i < signal_count; i++) {
      xe_syncs[n_syncs++] = vk_sync_to_drm_xe_sync(signals[i].sync,
                                                   signals[i].signal_value,
                                                   TYPE_SIGNAL);
   }
   if (submit->signal.sync) {
      xe_syncs[n_syncs++] = vk_sync_to_drm_xe_sync(submit->signal.sync,
                                                   submit->signal.signal_value,
                                                   TYPE_SIGNAL);
   }
   if (queue->sync)
      xe_syncs[n_syncs++] = vk_sync_to_drm_xe_sync(queue->sync, 0, TYPE_SIGNAL);

   xe_syncs[n_syncs++] = (struct drm_xe_sync) {
      .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
      .flags = 0 /* TYPE_WAIT */,
      .handle = intel_bind_timeline_get_syncobj(&device->bind_timeline),
      .timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline),
   };
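
   /* If this platform needs manual cache flushing, flush the CPU-written
    * batch BOs so the GPU sees their final contents.
    */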
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
   if (device->physical->memory.need_flush &&
       anv_bo_needs_host_cache_flush(device->utrace_bo_pool.bo_alloc_flags)) {
      util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
         intel_flush_range((*bo)->map, (*bo)->size);
   }
#endif

   struct anv_bo *batch_bo =
      *util_dynarray_element(&submit->batch_bos, struct anv_bo *, 0);
   struct drm_xe_exec exec = {
      .exec_queue_id = submit->use_companion_rcs ?
                       queue->companion_rcs_id : queue->exec_queue_id,
      .num_batch_buffer = 1,
      .syncs = (uintptr_t)xe_syncs,
      .num_syncs = n_syncs,
      .address = batch_bo->offset,
   };

   xe_exec_print_debug(queue, 0, NULL, NULL, 0, &exec);

   if (likely(!device->info->no_hw)) {
      if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
         return vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
   }

   return anv_queue_post_submit(queue, VK_SUCCESS);
}
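
/* Submit a command buffer's companion RCS batch on the queue's companion RCS
 * exec queue, waiting on the same syncs as the main submission and signalling
 * queue->companion_sync.
 */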
static VkResult
xe_companion_rcs_queue_exec_locked(struct anv_queue *queue,
                                   struct anv_cmd_buffer *companion_rcs_cmd_buffer,
                                   uint32_t wait_count,
                                   const struct vk_sync_wait *waits)
{
   struct anv_device *device = queue->device;
   VkResult result;

   struct vk_sync_signal companion_sync = {
      .sync = queue->companion_sync,
   };
   struct drm_xe_sync *xe_syncs = NULL;
   uint32_t xe_syncs_count = 0;
   result = xe_exec_process_syncs(queue,
                                  wait_count, waits,
                                  1, &companion_sync,
                                  0, NULL, /* extra_syncs */
                                  NULL /* utrace_submit */,
                                  true /* is_companion_rcs_queue */,
                                  &xe_syncs,
                                  &xe_syncs_count);
   if (result != VK_SUCCESS)
      return result;

   struct drm_xe_exec exec = {
      .exec_queue_id = queue->companion_rcs_id,
      .num_batch_buffer = 1,
      .syncs = (uintptr_t)xe_syncs,
      .num_syncs = xe_syncs_count,
   };

   struct anv_batch_bo *batch_bo =
      list_first_entry(&companion_rcs_cmd_buffer->batch_bos,
                       struct anv_batch_bo, link);
   exec.address = batch_bo->bo->offset;

   anv_measure_submit(companion_rcs_cmd_buffer);
   xe_exec_print_debug(queue, 1, &companion_rcs_cmd_buffer, NULL, 0, &exec);

   if (!device->info->no_hw) {
      if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
         result = vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
   }
   vk_free(&device->vk.alloc, xe_syncs);

   return result;
}
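
/* Main locked submission path for a queue: build the sync array, chain the
 * command buffers, submit an optional performance-query preamble, execute the
 * batch (or the trivial batch when there are no command buffers), then handle
 * the companion RCS batch and any pending utrace submission.
 */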
VkResult
xe_queue_exec_locked(struct anv_queue *queue,
                     uint32_t wait_count,
                     const struct vk_sync_wait *waits,
                     uint32_t cmd_buffer_count,
                     struct anv_cmd_buffer **cmd_buffers,
                     uint32_t signal_count,
                     const struct vk_sync_signal *signals,
                     struct anv_query_pool *perf_query_pool,
                     uint32_t perf_query_pass,
                     struct anv_utrace_submit *utrace_submit)
{
   struct anv_device *device = queue->device;
   VkResult result;

   struct drm_xe_sync *xe_syncs = NULL;
   uint32_t xe_syncs_count = 0;
   result = xe_exec_process_syncs(queue, wait_count, waits,
                                  signal_count, signals,
                                  0, NULL, /* extra_syncs */
                                  utrace_submit,
                                  false, /* is_companion_rcs_queue */
                                  &xe_syncs, &xe_syncs_count);
   if (result != VK_SUCCESS)
      return result;

   /* If there is a utrace submission but no batch, there are no commands to
    * run for utrace, so ignore the submission.
    */
   if (utrace_submit &&
       util_dynarray_num_elements(&utrace_submit->base.batch_bos,
                                  struct anv_bo *) == 0)
      utrace_submit = NULL;

   struct drm_xe_exec exec = {
      .exec_queue_id = queue->exec_queue_id,
      .num_batch_buffer = 1,
      .syncs = (uintptr_t)xe_syncs,
      .num_syncs = xe_syncs_count,
   };

   if (cmd_buffer_count) {
      if (unlikely(device->physical->measure_device.config)) {
         for (uint32_t i = 0; i < cmd_buffer_count; i++)
            anv_measure_submit(cmd_buffers[i]);
      }

      anv_cmd_buffer_chain_command_buffers(cmd_buffers, cmd_buffer_count);

#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
      if (device->physical->memory.need_flush &&
          anv_bo_needs_host_cache_flush(device->batch_bo_pool.bo_alloc_flags))
         anv_cmd_buffer_clflush(cmd_buffers, cmd_buffer_count);
#endif

      struct anv_cmd_buffer *first_cmd_buffer = cmd_buffers[0];
      struct anv_batch_bo *first_batch_bo = list_first_entry(&first_cmd_buffer->batch_bos,
                                                             struct anv_batch_bo, link);
      exec.address = first_batch_bo->bo->offset;
   } else {
      exec.address = device->trivial_batch_bo->offset;
   }

   xe_exec_print_debug(queue, cmd_buffer_count, cmd_buffers, perf_query_pool,
                       perf_query_pass, &exec);
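
   /* For performance queries, select the OA metrics set when needed and then
    * execute the query pool's preamble batch for this pass on the same exec
    * queue before the main batch; the preamble waits only on the VM bind
    * timeline.
    */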
   if (perf_query_pool && cmd_buffer_count) {
      struct drm_xe_sync xe_syncs[1] = {};
      struct drm_xe_exec perf_query_exec = {
            .exec_queue_id = queue->exec_queue_id,
            .num_batch_buffer = 1,
            .address = perf_query_pool->bo->offset +
                       khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass),
            .num_syncs = 1,
            .syncs = (uintptr_t)xe_syncs,
      };
      assert(perf_query_pass < perf_query_pool->n_passes);
      struct intel_perf_query_info *query_info = perf_query_pool->pass_query[perf_query_pass];

      /* Some performance queries only use the pipeline statistics HW, so OA
       * is not needed in that case and there is no need to reconfigure.
       */
      if (!INTEL_DEBUG(DEBUG_NO_OACONFIG) &&
          (query_info->kind == INTEL_PERF_QUERY_TYPE_OA ||
           query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) {
         int ret = intel_perf_stream_set_metrics_id(device->physical->perf,
                                                    device->perf_fd,
                                                    query_info->oa_metrics_set_id);
         if (ret < 0) {
            result = vk_device_set_lost(&device->vk,
                                        "intel_perf_stream_set_metrics_id failed: %s",
                                        strerror(errno));
         }
      }

      xe_syncs[0].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
      xe_syncs[0].flags = 0; /* wait */
      xe_syncs[0].handle = intel_bind_timeline_get_syncobj(&device->bind_timeline);
      xe_syncs[0].timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline);

      if (!device->info->no_hw && result == VK_SUCCESS) {
         if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &perf_query_exec))
            result = vk_device_set_lost(&device->vk, "perf_query_exec failed: %m");
      }
   }

   if (!device->info->no_hw && result == VK_SUCCESS) {
      if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
         result = vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
   }
   vk_free(&device->vk.alloc, xe_syncs);

   if (cmd_buffer_count != 0 && cmd_buffers[0]->companion_rcs_cmd_buffer) {
      /* Not allowed to chain cmd_buffers that have a companion_rcs_cmd_buffer. */
      assert(cmd_buffer_count == 1);
      result = xe_companion_rcs_queue_exec_locked(queue,
                                                  cmd_buffers[0]->companion_rcs_cmd_buffer,
                                                  wait_count, waits);
   }

   result = anv_queue_post_submit(queue, result);

   if (result == VK_SUCCESS && utrace_submit) {
      struct vk_sync_signal signal = {
         .sync = utrace_submit->base.signal.sync,
         .signal_value = utrace_submit->base.signal.signal_value,
      };
      result = xe_queue_exec_async(&utrace_submit->base,
                                   0, NULL, 1, &signal);
   }

   return result;
}
384