xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/anv_utrace.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2021 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "anv_private.h"
25 #include "anv_internal_kernels.h"
26 
27 #include "common/intel_debug_identifier.h"
28 #include "ds/intel_tracepoints.h"
29 #include "genxml/gen9_pack.h"
30 #include "perf/intel_perf.h"
31 #include "util/perf/cpu_trace.h"
32 
33 #include "vk_common_entrypoints.h"
34 
/** Timestamp structure format
 *
 * A single union is used for all the ways the HW can write a timestamp, so
 * that every slot in a timestamp buffer has the same stride
 * (sizeof(union anv_utrace_timestamp)).
 */
union anv_utrace_timestamp {
   /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
    * PIPE_CONTROL.
    */
   uint64_t timestamp;

   /* Timestamp written by COMPUTE_WALKER::PostSync
    *
    * Layout is described in PRMs.
    * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
    *
    *    "The timestamp layout :
    *        [0] = 32b Context Timestamp Start
    *        [1] = 32b Global Timestamp Start
    *        [2] = 32b Context Timestamp End
    *        [3] = 32b Global Timestamp End"
    */
   uint32_t gfx125_postsync_data[4];

   /* Timestamp written by COMPUTE_WALKER::PostSync
    *
    * BSpec 56591:
    *
    *    "The timestamp layout :
    *       [0] = 64b Context Timestamp Start
    *       [1] = 64b Global Timestamp Start
    *       [2] = 64b Context Timestamp End
    *       [3] = 64b Global Timestamp End"
    */
   uint64_t gfx20_postsync_data[4];
};
67 
68 static uint32_t
command_buffers_count_utraces(struct anv_device * device,uint32_t cmd_buffer_count,struct anv_cmd_buffer ** cmd_buffers,uint32_t * utrace_copies)69 command_buffers_count_utraces(struct anv_device *device,
70                               uint32_t cmd_buffer_count,
71                               struct anv_cmd_buffer **cmd_buffers,
72                               uint32_t *utrace_copies)
73 {
74    if (!u_trace_should_process(&device->ds.trace_context))
75       return 0;
76 
77    uint32_t utraces = 0;
78    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
79       if (u_trace_has_points(&cmd_buffers[i]->trace)) {
80          utraces++;
81          if (!(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
82             *utrace_copies += list_length(&cmd_buffers[i]->trace.trace_chunks);
83       }
84    }
85 
86    return utraces;
87 }
88 
89 static void
anv_utrace_delete_submit(struct u_trace_context * utctx,void * submit_data)90 anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data)
91 {
92    struct anv_device *device =
93       container_of(utctx, struct anv_device, ds.trace_context);
94    struct anv_utrace_submit *submit =
95       container_of(submit_data, struct anv_utrace_submit, ds);
96 
97    intel_ds_flush_data_fini(&submit->ds);
98 
99    anv_state_stream_finish(&submit->dynamic_state_stream);
100    anv_state_stream_finish(&submit->general_state_stream);
101 
102    anv_async_submit_fini(&submit->base);
103 
104    vk_free(&device->vk.alloc, submit);
105 }
106 
107 void
anv_device_utrace_emit_gfx_copy_buffer(struct u_trace_context * utctx,void * cmdstream,void * ts_from,uint64_t from_offset_B,void * ts_to,uint64_t to_offset_B,uint64_t size_B)108 anv_device_utrace_emit_gfx_copy_buffer(struct u_trace_context *utctx,
109                                        void *cmdstream,
110                                        void *ts_from, uint64_t from_offset_B,
111                                        void *ts_to, uint64_t to_offset_B,
112                                        uint64_t size_B)
113 {
114    struct anv_device *device =
115       container_of(utctx, struct anv_device, ds.trace_context);
116    struct anv_memcpy_state *memcpy_state = cmdstream;
117    struct anv_address from_addr = (struct anv_address) {
118       .bo = ts_from, .offset = from_offset_B };
119    struct anv_address to_addr = (struct anv_address) {
120       .bo = ts_to, .offset = to_offset_B };
121 
122    anv_genX(device->info, emit_so_memcpy)(memcpy_state,
123                                           to_addr, from_addr, size_B);
124 }
125 
126 static void
anv_device_utrace_emit_cs_copy_buffer(struct u_trace_context * utctx,void * cmdstream,void * ts_from,uint64_t from_offset_B,void * ts_to,uint64_t to_offset_B,uint64_t size_B)127 anv_device_utrace_emit_cs_copy_buffer(struct u_trace_context *utctx,
128                                       void *cmdstream,
129                                       void *ts_from, uint64_t from_offset_B,
130                                       void *ts_to, uint64_t to_offset_B,
131                                       uint64_t size_B)
132 {
133    struct anv_device *device =
134       container_of(utctx, struct anv_device, ds.trace_context);
135    struct anv_simple_shader *simple_state = cmdstream;
136    struct anv_address from_addr = (struct anv_address) {
137       .bo = ts_from, .offset = from_offset_B };
138    struct anv_address to_addr = (struct anv_address) {
139       .bo = ts_to, .offset = to_offset_B };
140 
141    struct anv_state push_data_state =
142       anv_genX(device->info, simple_shader_alloc_push)(
143          simple_state, sizeof(struct anv_memcpy_params));
144    struct anv_memcpy_params *params = push_data_state.map;
145 
146    *params = (struct anv_memcpy_params) {
147       .num_dwords = size_B / 4,
148       .src_addr   = anv_address_physical(from_addr),
149       .dst_addr   = anv_address_physical(to_addr),
150    };
151 
152    anv_genX(device->info, emit_simple_shader_dispatch)(
153       simple_state, DIV_ROUND_UP(params->num_dwords, 4),
154       push_data_state);
155 }
156 
157 VkResult
anv_device_utrace_flush_cmd_buffers(struct anv_queue * queue,uint32_t cmd_buffer_count,struct anv_cmd_buffer ** cmd_buffers,struct anv_utrace_submit ** out_submit)158 anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
159                                     uint32_t cmd_buffer_count,
160                                     struct anv_cmd_buffer **cmd_buffers,
161                                     struct anv_utrace_submit **out_submit)
162 {
163    struct anv_device *device = queue->device;
164    uint32_t utrace_copies = 0;
165    uint32_t utraces = command_buffers_count_utraces(device,
166                                                     cmd_buffer_count,
167                                                     cmd_buffers,
168                                                     &utrace_copies);
169    if (!utraces) {
170       *out_submit = NULL;
171       return VK_SUCCESS;
172    }
173 
174    VkResult result;
175    struct anv_utrace_submit *submit =
176       vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
177                 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
178    if (!submit)
179       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
180 
181    result = anv_async_submit_init(&submit->base, queue,
182                                   &device->batch_bo_pool,
183                                   false, true);
184    if (result != VK_SUCCESS)
185       goto error_async;
186 
187    intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);
188 
189    struct anv_batch *batch = &submit->base.batch;
190    if (utrace_copies > 0) {
191       anv_state_stream_init(&submit->dynamic_state_stream,
192                             &device->dynamic_state_pool, 16384);
193       anv_state_stream_init(&submit->general_state_stream,
194                             &device->general_state_pool, 16384);
195 
196       /* Only engine class where we support timestamp copies
197        *
198        * TODO: add INTEL_ENGINE_CLASS_COPY support (should be trivial ;)
199        */
200       assert(queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER ||
201              queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE);
202       if (queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER) {
203 
204          trace_intel_begin_trace_copy_cb(&submit->ds.trace, batch);
205 
206          anv_genX(device->info, emit_so_memcpy_init)(&submit->memcpy_state,
207                                                      device, NULL, batch);
208          uint32_t num_traces = 0;
209          for (uint32_t i = 0; i < cmd_buffer_count; i++) {
210             if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
211                intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
212                                          &submit->ds, device->vk.current_frame, false);
213             } else {
214                num_traces += cmd_buffers[i]->trace.num_traces;
215                u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
216                                     u_trace_end_iterator(&cmd_buffers[i]->trace),
217                                     &submit->ds.trace,
218                                     &submit->memcpy_state,
219                                     anv_device_utrace_emit_gfx_copy_buffer);
220             }
221          }
222          anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);
223 
224          trace_intel_end_trace_copy_cb(&submit->ds.trace, batch, num_traces);
225 
226          anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
227       } else {
228          struct anv_shader_bin *copy_kernel;
229          VkResult ret =
230             anv_device_get_internal_shader(device,
231                                            ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE,
232                                            &copy_kernel);
233          if (ret != VK_SUCCESS)
234             goto error_sync;
235 
236          trace_intel_begin_trace_copy_cb(&submit->ds.trace, batch);
237 
238          submit->simple_state = (struct anv_simple_shader) {
239             .device               = device,
240             .dynamic_state_stream = &submit->dynamic_state_stream,
241             .general_state_stream = &submit->general_state_stream,
242             .batch                = batch,
243             .kernel               = copy_kernel,
244             .l3_config            = device->internal_kernels_l3_config,
245          };
246          anv_genX(device->info, emit_simple_shader_init)(&submit->simple_state);
247 
248          uint32_t num_traces = 0;
249          for (uint32_t i = 0; i < cmd_buffer_count; i++) {
250             num_traces += cmd_buffers[i]->trace.num_traces;
251             if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
252                intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
253                                          &submit->ds, device->vk.current_frame, false);
254             } else {
255                num_traces += cmd_buffers[i]->trace.num_traces;
256                u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
257                                     u_trace_end_iterator(&cmd_buffers[i]->trace),
258                                     &submit->ds.trace,
259                                     &submit->simple_state,
260                                     anv_device_utrace_emit_cs_copy_buffer);
261             }
262          }
263 
264          trace_intel_end_trace_copy_cb(&submit->ds.trace, batch, num_traces);
265 
266          anv_genX(device->info, emit_simple_shader_end)(&submit->simple_state);
267       }
268 
269 
270       if (batch->status != VK_SUCCESS) {
271          result = batch->status;
272          goto error_sync;
273       }
274 
275       intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds,
276                                 device->vk.current_frame, true);
277    } else {
278       for (uint32_t i = 0; i < cmd_buffer_count; i++) {
279          assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
280          intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
281                                    &submit->ds, device->vk.current_frame,
282                                    i == (cmd_buffer_count - 1));
283       }
284    }
285 
286    *out_submit = submit;
287 
288    return VK_SUCCESS;
289 
290  error_sync:
291    intel_ds_flush_data_fini(&submit->ds);
292    anv_async_submit_fini(&submit->base);
293  error_async:
294    vk_free(&device->vk.alloc, submit);
295    return result;
296 }
297 
298 static void *
anv_utrace_create_buffer(struct u_trace_context * utctx,uint64_t size_B)299 anv_utrace_create_buffer(struct u_trace_context *utctx, uint64_t size_B)
300 {
301    struct anv_device *device =
302       container_of(utctx, struct anv_device, ds.trace_context);
303 
304    struct anv_bo *bo = NULL;
305    UNUSED VkResult result =
306       anv_bo_pool_alloc(&device->utrace_bo_pool,
307                         align(size_B, 4096),
308                         &bo);
309    assert(result == VK_SUCCESS);
310 
311    memset(bo->map, 0, bo->size);
312 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
313    if (device->physical->memory.need_flush &&
314        anv_bo_needs_host_cache_flush(bo->alloc_flags))
315       intel_flush_range(bo->map, bo->size);
316 #endif
317 
318    return bo;
319 }
320 
321 static void
anv_utrace_destroy_buffer(struct u_trace_context * utctx,void * timestamps)322 anv_utrace_destroy_buffer(struct u_trace_context *utctx, void *timestamps)
323 {
324    struct anv_device *device =
325       container_of(utctx, struct anv_device, ds.trace_context);
326    struct anv_bo *bo = timestamps;
327 
328    anv_bo_pool_free(&device->utrace_bo_pool, bo);
329 }
330 
/* u_trace callback: emit commands into the batch to record a timestamp at
 * offset_B into the given timestamp BO.
 *
 * cs is the target batch when non-NULL (e.g. trace copies); otherwise the
 * command buffer owning the u_trace is used. On Gfx12.5+ an end-of-compute
 * tracepoint rewrites the post-sync of the last COMPUTE_WALKER / indirect
 * dispatch instead of emitting a new capture.
 */
static void
anv_utrace_record_ts(struct u_trace *ut, void *cs,
                     void *timestamps, uint64_t offset_B,
                     uint32_t flags)
{
   struct anv_device *device =
      container_of(ut->utctx, struct anv_device, ds.trace_context);
   struct anv_cmd_buffer *cmd_buffer =
      container_of(ut, struct anv_cmd_buffer, trace);
   /* cmd_buffer is only valid if cs == NULL */
   struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
   struct anv_bo *bo = timestamps;

   /* Timestamp slots must be naturally aligned to the union's stride. */
   assert(offset_B % sizeof(union anv_utrace_timestamp) == 0);
   struct anv_address ts_address = (struct anv_address) {
      .bo = bo,
      .offset = offset_B,
   };

   /* Is this an end of compute trace point? */
   const bool is_end_compute =
      cs == NULL &&
      (flags & INTEL_DS_TRACEPOINT_FLAG_END_OF_PIPE_CS);

   /* On Gfx12.5+, an end-of-compute point requires a previously recorded
    * walker/dispatch to rewrite.
    */
   assert(device->info->verx10 < 125 ||
          !is_end_compute ||
          cmd_buffer->state.last_indirect_dispatch != NULL ||
          cmd_buffer->state.last_compute_walker != NULL);

   /* Gfx12.5+ end-of-compute: rewrite the last dispatch's post-sync
    * (indirect dispatch takes precedence). Otherwise pick end-of-pipe vs
    * top-of-pipe capture from the tracepoint flags.
    */
   enum anv_timestamp_capture_type capture_type =
      (device->info->verx10 >= 125 && is_end_compute) ?
      (cmd_buffer->state.last_indirect_dispatch != NULL ?
       ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH : ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER) :
      (flags & (INTEL_DS_TRACEPOINT_FLAG_END_OF_PIPE |
                INTEL_DS_TRACEPOINT_FLAG_END_OF_PIPE_CS)) ?
      ANV_TIMESTAMP_CAPTURE_END_OF_PIPE : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;


   /* For rewrite captures, pass the batch location of the command whose
    * post-sync will be patched; NULL for regular captures.
    */
   void *addr = capture_type ==  ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH ?
                cmd_buffer->state.last_indirect_dispatch :
                capture_type ==  ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER ?
                cmd_buffer->state.last_compute_walker : NULL;

   device->physical->cmd_emit_timestamp(batch, device, ts_address,
                                        capture_type,
                                        addr);
   /* Each walker/dispatch can only be rewritten once. */
   if (is_end_compute) {
      cmd_buffer->state.last_compute_walker = NULL;
      cmd_buffer->state.last_indirect_dispatch = NULL;
   }
}
382 
/* u_trace callback: read back a timestamp recorded at offset_B in the given
 * BO and convert it from HW ticks to nanoseconds.
 *
 * Waits for the submit's completion sync on the first entry only (offset 0),
 * assuming u_trace reads timestamps of a flush in order.
 */
static uint64_t
anv_utrace_read_ts(struct u_trace_context *utctx,
                   void *timestamps, uint64_t offset_B,
                   void *flush_data)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_bo *bo = timestamps;
   struct anv_utrace_submit *submit =
      container_of(flush_data, struct anv_utrace_submit, ds);

   /* Only need to stall on results for the first entry: */
   if (offset_B == 0) {
      MESA_TRACE_SCOPE("anv utrace wait timestamps");
      UNUSED VkResult result =
         vk_sync_wait(&device->vk,
                      submit->base.signal.sync,
                      submit->base.signal.signal_value,
                      VK_SYNC_WAIT_COMPLETE,
                      os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE));
      assert(result == VK_SUCCESS);
   }

   assert(offset_B % sizeof(union anv_utrace_timestamp) == 0);
   union anv_utrace_timestamp *ts =
      (union anv_utrace_timestamp *)(bo->map + offset_B);

   /* Don't translate the no-timestamp marker: */
   if (ts->timestamp == U_TRACE_NO_TIMESTAMP)
      return U_TRACE_NO_TIMESTAMP;

   /* Detect a 16/32 bytes timestamp write (COMPUTE_WALKER post-sync data):
    * a plain 8-byte write leaves the remaining qwords zeroed by
    * anv_utrace_create_buffer.
    */
   if (ts->gfx20_postsync_data[1] != 0 ||
       ts->gfx20_postsync_data[2] != 0 ||
       ts->gfx20_postsync_data[3] != 0) {
      if (device->info->ver >= 20) {
         /* Gfx20+ post-sync writes full 64-bit values; [3] is the global
          * timestamp at end of the walker.
          */
         return intel_device_info_timebase_scale(device->info,
                                                 ts->gfx20_postsync_data[3]);
      }

      /* The timestamp written by COMPUTE_WALKER::PostSync only has 32 bits.
       * We need to rebuild the full 64bits using the previous timestamp. We
       * assume that utrace is reading the timestamp in order. Anyway
       * timestamp rollover on 32bits in a few minutes so in most cases that
       * should be correct.
       */
      uint64_t timestamp =
         (submit->last_full_timestamp & 0xffffffff00000000) |
         (uint64_t) ts->gfx125_postsync_data[3];

      return intel_device_info_timebase_scale(device->info, timestamp);
   }

   /* Remember the last full 64-bit timestamp to extend later 32-bit ones. */
   submit->last_full_timestamp = ts->timestamp;

   return intel_device_info_timebase_scale(device->info, ts->timestamp);
}
440 
441 static void
anv_utrace_capture_data(struct u_trace * ut,void * cs,void * dst_buffer,uint64_t dst_offset_B,void * src_buffer,uint64_t src_offset_B,uint32_t size_B)442 anv_utrace_capture_data(struct u_trace *ut,
443                         void *cs,
444                         void *dst_buffer,
445                         uint64_t dst_offset_B,
446                         void *src_buffer,
447                         uint64_t src_offset_B,
448                         uint32_t size_B)
449 {
450    struct anv_device *device =
451       container_of(ut->utctx, struct anv_device, ds.trace_context);
452    struct anv_cmd_buffer *cmd_buffer =
453       container_of(ut, struct anv_cmd_buffer, trace);
454    /* cmd_buffer is only valid if cs == NULL */
455    struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
456    struct anv_address dst_addr = {
457       .bo = dst_buffer,
458       .offset = dst_offset_B,
459    };
460    struct anv_address src_addr = {
461       .bo = src_buffer,
462       .offset = src_offset_B,
463    };
464 
465    device->physical->cmd_capture_data(batch, device, dst_addr, src_addr, size_B);
466 }
467 
468 static const void *
anv_utrace_get_data(struct u_trace_context * utctx,void * buffer,uint64_t offset_B,uint32_t size_B)469 anv_utrace_get_data(struct u_trace_context *utctx,
470                      void *buffer, uint64_t offset_B, uint32_t size_B)
471 {
472    struct anv_bo *bo = buffer;
473 
474    return bo->map + offset_B;
475 }
476 
/* Initialize utrace support for the device: the BO pool backing timestamp
 * buffers, the intel_ds (perfetto) device, the u_trace context with all the
 * callbacks defined above, and one ds queue per Vulkan queue.
 */
void
anv_device_utrace_init(struct anv_device *device)
{
   device->utrace_timestamp_size = sizeof(union anv_utrace_timestamp);

   /* Timestamp BOs must be host-readable for anv_utrace_read_ts/get_data. */
   anv_bo_pool_init(&device->utrace_bo_pool, device, "utrace",
                    ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_HOST_CACHED_COHERENT);
   intel_ds_device_init(&device->ds, device->info, device->fd,
                        device->physical->local_minor,
                        INTEL_DS_API_VULKAN);
   u_trace_context_init(&device->ds.trace_context,
                        &device->ds,
                        device->utrace_timestamp_size,
                        12,
                        anv_utrace_create_buffer,
                        anv_utrace_destroy_buffer,
                        anv_utrace_record_ts,
                        anv_utrace_read_ts,
                        anv_utrace_capture_data,
                        anv_utrace_get_data,
                        anv_utrace_delete_submit);

   /* Name each ds queue after its engine class and index, e.g. "render0". */
   for (uint32_t q = 0; q < device->queue_count; q++) {
      struct anv_queue *queue = &device->queues[q];

      intel_ds_device_init_queue(&device->ds, &queue->ds, "%s%u",
                                 intel_engines_class_to_string(queue->family->engine_class),
                                 queue->vk.index_in_family);
   }
}
507 
/* Tear down utrace support: drain any pending trace data (blocking), then
 * finish the ds device and release the timestamp BO pool.
 */
void
anv_device_utrace_finish(struct anv_device *device)
{
   intel_ds_device_process(&device->ds, true);
   intel_ds_device_fini(&device->ds);
   anv_bo_pool_finish(&device->utrace_bo_pool);
}
515 
516 enum intel_ds_stall_flag
anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits)517 anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits)
518 {
519    static const struct {
520       enum anv_pipe_bits anv;
521       enum intel_ds_stall_flag ds;
522    } anv_to_ds_flags[] = {
523       { .anv = ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,            .ds = INTEL_DS_DEPTH_CACHE_FLUSH_BIT, },
524       { .anv = ANV_PIPE_DATA_CACHE_FLUSH_BIT,             .ds = INTEL_DS_DATA_CACHE_FLUSH_BIT, },
525       { .anv = ANV_PIPE_TILE_CACHE_FLUSH_BIT,             .ds = INTEL_DS_TILE_CACHE_FLUSH_BIT, },
526       { .anv = ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,    .ds = INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT, },
527       { .anv = ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,       .ds = INTEL_DS_STATE_CACHE_INVALIDATE_BIT, },
528       { .anv = ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,    .ds = INTEL_DS_CONST_CACHE_INVALIDATE_BIT, },
529       { .anv = ANV_PIPE_VF_CACHE_INVALIDATE_BIT,          .ds = INTEL_DS_VF_CACHE_INVALIDATE_BIT, },
530       { .anv = ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,     .ds = INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT, },
531       { .anv = ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_INST_CACHE_INVALIDATE_BIT, },
532       { .anv = ANV_PIPE_DEPTH_STALL_BIT,                  .ds = INTEL_DS_DEPTH_STALL_BIT, },
533       { .anv = ANV_PIPE_CS_STALL_BIT,                     .ds = INTEL_DS_CS_STALL_BIT, },
534       { .anv = ANV_PIPE_HDC_PIPELINE_FLUSH_BIT,           .ds = INTEL_DS_HDC_PIPELINE_FLUSH_BIT, },
535       { .anv = ANV_PIPE_STALL_AT_SCOREBOARD_BIT,          .ds = INTEL_DS_STALL_AT_SCOREBOARD_BIT, },
536       { .anv = ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, .ds = INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, },
537       { .anv = ANV_PIPE_PSS_STALL_SYNC_BIT,               .ds = INTEL_DS_PSS_STALL_SYNC_BIT, },
538       { .anv = ANV_PIPE_END_OF_PIPE_SYNC_BIT,             .ds = INTEL_DS_END_OF_PIPE_BIT, },
539       { .anv = ANV_PIPE_CCS_CACHE_FLUSH_BIT,              .ds = INTEL_DS_CCS_CACHE_FLUSH_BIT, },
540    };
541 
542    enum intel_ds_stall_flag ret = 0;
543    for (uint32_t i = 0; i < ARRAY_SIZE(anv_to_ds_flags); i++) {
544       if (anv_to_ds_flags[i].anv & bits)
545          ret |= anv_to_ds_flags[i].ds;
546    }
547 
548    return ret;
549 }
550 
/* VK_EXT_debug_utils: forward the label begin to the common implementation
 * (which maintains the label stack), then record a begin annotation
 * tracepoint in the command buffer's trace.
 */
void anv_CmdBeginDebugUtilsLabelEXT(
   VkCommandBuffer _commandBuffer,
   const VkDebugUtilsLabelEXT *pLabelInfo)
{
   VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);

   vk_common_CmdBeginDebugUtilsLabelEXT(_commandBuffer, pLabelInfo);
   trace_intel_begin_cmd_buffer_annotation(&cmd_buffer->trace);
}
561 
/* VK_EXT_debug_utils: record an end annotation tracepoint with the label on
 * top of the stack (if any), then forward to the common implementation which
 * pops it.
 */
void anv_CmdEndDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer)
{
   VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);

   if (cmd_buffer->vk.labels.size > 0) {
      const VkDebugUtilsLabelEXT *top =
         util_dynarray_top_ptr(&cmd_buffer->vk.labels, VkDebugUtilsLabelEXT);

      trace_intel_end_cmd_buffer_annotation(&cmd_buffer->trace,
                                            strlen(top->pLabelName),
                                            top->pLabelName);
   }

   vk_common_CmdEndDebugUtilsLabelEXT(_commandBuffer);
}
577 
/* Emit a standalone frame or queue-annotation tracepoint on the given queue.
 *
 * Builds a tiny batch containing just the tracepoint (begin or end, frame or
 * annotation), submits it asynchronously and hands the trace data to the
 * datasource. Failures are silent (tracing is best-effort).
 */
void
anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool begin)
{
   struct anv_device *device = queue->device;

   VkResult result;
   struct anv_utrace_submit *submit =
      vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
                8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!submit)
      return;

   result = anv_async_submit_init(&submit->base, queue,
                                  &device->batch_bo_pool,
                                  false, true);
   if (result != VK_SUCCESS)
      goto error_async;

   intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);

   struct anv_batch *batch = &submit->base.batch;
   if (frame) {
      if (begin)
         trace_intel_begin_frame(&submit->ds.trace, batch);
      else
         trace_intel_end_frame(&submit->ds.trace, batch,
                               device->debug_frame_desc->frame_id);
   } else {
      if (begin) {
         trace_intel_begin_queue_annotation(&submit->ds.trace, batch);
      } else {
         trace_intel_end_queue_annotation(&submit->ds.trace, batch,
                                          strlen(label), label);
      }
   }

   /* Terminate the batch. */
   anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_END, bbs);
   anv_batch_emit(batch, GFX9_MI_NOOP, noop);

   if (batch->status != VK_SUCCESS) {
      result = batch->status;
      goto error_batch;
   }

   intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds,
                             device->vk.current_frame, true);

   result =
      device->kmd_backend->queue_exec_async(&submit->base,
                                            0, NULL, 0, NULL);
   if (result != VK_SUCCESS)
      goto error_batch;

   /* End of frame: push the collected data out synchronously. */
   if (frame && !begin)
      intel_ds_device_process(&device->ds, true);

   return;

 error_batch:
   intel_ds_flush_data_fini(&submit->ds);
   anv_async_submit_fini(&submit->base);
 error_async:
   vk_free(&device->vk.alloc, submit);
}
642 
643 void
anv_QueueBeginDebugUtilsLabelEXT(VkQueue _queue,const VkDebugUtilsLabelEXT * pLabelInfo)644 anv_QueueBeginDebugUtilsLabelEXT(
645    VkQueue _queue,
646    const VkDebugUtilsLabelEXT *pLabelInfo)
647 {
648    VK_FROM_HANDLE(anv_queue, queue, _queue);
649 
650    vk_common_QueueBeginDebugUtilsLabelEXT(_queue, pLabelInfo);
651 
652    anv_queue_trace(queue, pLabelInfo->pLabelName,
653                    false /* frame */, true /* begin */);
654 }
655 
656 void
anv_QueueEndDebugUtilsLabelEXT(VkQueue _queue)657 anv_QueueEndDebugUtilsLabelEXT(VkQueue _queue)
658 {
659    VK_FROM_HANDLE(anv_queue, queue, _queue);
660 
661    if (queue->vk.labels.size > 0) {
662       const VkDebugUtilsLabelEXT *label =
663          util_dynarray_top_ptr(&queue->vk.labels, VkDebugUtilsLabelEXT);
664       anv_queue_trace(queue, label->pLabelName,
665                       false /* frame */, false /* begin */);
666 
667       intel_ds_device_process(&queue->device->ds, true);
668    }
669 
670    vk_common_QueueEndDebugUtilsLabelEXT(_queue);
671 }
672