/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"
#include "anv_internal_kernels.h"

#include "common/intel_debug_identifier.h"
#include "ds/intel_tracepoints.h"
#include "genxml/gen9_pack.h"
#include "perf/intel_perf.h"
#include "util/perf/cpu_trace.h"

#include "vk_common_entrypoints.h"

/** Timestamp structure format */
union anv_utrace_timestamp {
   /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
    * PIPE_CONTROL.
    */
   uint64_t timestamp;

   /* Timestamp written by COMPUTE_WALKER::PostSync
    *
    * Layout is described in PRMs.
    * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
    *
    *    "The timestamp layout :
    *        [0] = 32b Context Timestamp Start
    *        [1] = 32b Global Timestamp Start
    *        [2] = 32b Context Timestamp End
    *        [3] = 32b Global Timestamp End"
    */
   uint32_t gfx125_postsync_data[4];

   /* Timestamp written by COMPUTE_WALKER::PostSync
    *
    * BSpec 56591:
    *
    *    "The timestamp layout :
    *        [0] = 64b Context Timestamp Start
    *        [1] = 64b Global Timestamp Start
    *        [2] = 64b Context Timestamp End
    *        [3] = 64b Global Timestamp End"
    */
   uint64_t gfx20_postsync_data[4];
};
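
/* A rough illustration of how the union aliases, derived from the member
 * types above rather than from any PRM: the union is 32 bytes, sized by its
 * largest member, gfx20_postsync_data[4]. Reading `timestamp` aliases
 * gfx20_postsync_data[0] and the pair gfx125_postsync_data[0..1], which is
 * why anv_utrace_read_ts() below can probe gfx20_postsync_data[1..3] to
 * detect whether a wider post-sync write landed in this slot.
 */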

static uint32_t
command_buffers_count_utraces(struct anv_device *device,
                              uint32_t cmd_buffer_count,
                              struct anv_cmd_buffer **cmd_buffers,
                              uint32_t *utrace_copies)
{
   if (!u_trace_should_process(&device->ds.trace_context))
      return 0;

   uint32_t utraces = 0;
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      if (u_trace_has_points(&cmd_buffers[i]->trace)) {
         utraces++;
         if (!(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
            *utrace_copies += list_length(&cmd_buffers[i]->trace.trace_chunks);
      }
   }

   return utraces;
}
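
/* Worked example (hypothetical submit): three command buffers, all carrying
 * trace points; the first two are ONE_TIME_SUBMIT, the third is reusable
 * with 2 trace chunks. The function returns utraces = 3 and adds 2 to
 * *utrace_copies: only reusable command buffers need their timestamps
 * copied out, since their trace buffers may be overwritten on resubmit.
 */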

static void
anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_utrace_submit *submit =
      container_of(submit_data, struct anv_utrace_submit, ds);

   intel_ds_flush_data_fini(&submit->ds);

   anv_state_stream_finish(&submit->dynamic_state_stream);
   anv_state_stream_finish(&submit->general_state_stream);

   anv_async_submit_fini(&submit->base);

   vk_free(&device->vk.alloc, submit);
}

void
anv_device_utrace_emit_gfx_copy_buffer(struct u_trace_context *utctx,
                                       void *cmdstream,
                                       void *ts_from, uint64_t from_offset_B,
                                       void *ts_to, uint64_t to_offset_B,
                                       uint64_t size_B)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_memcpy_state *memcpy_state = cmdstream;
   struct anv_address from_addr = (struct anv_address) {
      .bo = ts_from, .offset = from_offset_B };
   struct anv_address to_addr = (struct anv_address) {
      .bo = ts_to, .offset = to_offset_B };

   anv_genX(device->info, emit_so_memcpy)(memcpy_state,
                                          to_addr, from_addr, size_B);
}

static void
anv_device_utrace_emit_cs_copy_buffer(struct u_trace_context *utctx,
                                      void *cmdstream,
                                      void *ts_from, uint64_t from_offset_B,
                                      void *ts_to, uint64_t to_offset_B,
                                      uint64_t size_B)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_simple_shader *simple_state = cmdstream;
   struct anv_address from_addr = (struct anv_address) {
      .bo = ts_from, .offset = from_offset_B };
   struct anv_address to_addr = (struct anv_address) {
      .bo = ts_to, .offset = to_offset_B };

   struct anv_state push_data_state =
      anv_genX(device->info, simple_shader_alloc_push)(
         simple_state, sizeof(struct anv_memcpy_params));
   struct anv_memcpy_params *params = push_data_state.map;

   *params = (struct anv_memcpy_params) {
      .num_dwords = size_B / 4,
      .src_addr = anv_address_physical(from_addr),
      .dst_addr = anv_address_physical(to_addr),
   };

   anv_genX(device->info, emit_simple_shader_dispatch)(
      simple_state, DIV_ROUND_UP(params->num_dwords, 4),
      push_data_state);
}
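
/* Sizing sketch: copying a single 32-byte union anv_utrace_timestamp gives
 * num_dwords = 32 / 4 = 8, dispatched as DIV_ROUND_UP(8, 4) = 2 groups,
 * i.e. the divisor of 4 suggests each group moves 4 dwords. The size_B / 4
 * division assumes size_B is a multiple of 4, which holds for the 32-byte
 * timestamp records copied here.
 */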

VkResult
anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
                                    uint32_t cmd_buffer_count,
                                    struct anv_cmd_buffer **cmd_buffers,
                                    struct anv_utrace_submit **out_submit)
{
   struct anv_device *device = queue->device;
   uint32_t utrace_copies = 0;
   uint32_t utraces = command_buffers_count_utraces(device,
                                                    cmd_buffer_count,
                                                    cmd_buffers,
                                                    &utrace_copies);
   if (!utraces) {
      *out_submit = NULL;
      return VK_SUCCESS;
   }

   VkResult result;
   struct anv_utrace_submit *submit =
      vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
                8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!submit)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   result = anv_async_submit_init(&submit->base, queue,
                                  &device->batch_bo_pool,
                                  false, true);
   if (result != VK_SUCCESS)
      goto error_async;

   intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);

   struct anv_batch *batch = &submit->base.batch;
   if (utrace_copies > 0) {
      anv_state_stream_init(&submit->dynamic_state_stream,
                            &device->dynamic_state_pool, 16384);
      anv_state_stream_init(&submit->general_state_stream,
                            &device->general_state_pool, 16384);

      /* The only engine classes where we support timestamp copies.
       *
       * TODO: add INTEL_ENGINE_CLASS_COPY support (should be trivial ;)
       */
      assert(queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER ||
             queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE);
      if (queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER) {

         trace_intel_begin_trace_copy_cb(&submit->ds.trace, batch);

         anv_genX(device->info, emit_so_memcpy_init)(&submit->memcpy_state,
                                                     device, NULL, batch);
         uint32_t num_traces = 0;
         for (uint32_t i = 0; i < cmd_buffer_count; i++) {
            if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
               intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
                                         &submit->ds, device->vk.current_frame, false);
            } else {
               num_traces += cmd_buffers[i]->trace.num_traces;
               u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
                                    u_trace_end_iterator(&cmd_buffers[i]->trace),
                                    &submit->ds.trace,
                                    &submit->memcpy_state,
                                    anv_device_utrace_emit_gfx_copy_buffer);
            }
         }
         anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);

         trace_intel_end_trace_copy_cb(&submit->ds.trace, batch, num_traces);

         anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
      } else {
         struct anv_shader_bin *copy_kernel;
         VkResult ret =
            anv_device_get_internal_shader(device,
                                           ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE,
                                           &copy_kernel);
         if (ret != VK_SUCCESS) {
            result = ret;
            goto error_sync;
         }

         trace_intel_begin_trace_copy_cb(&submit->ds.trace, batch);

         submit->simple_state = (struct anv_simple_shader) {
            .device = device,
            .dynamic_state_stream = &submit->dynamic_state_stream,
            .general_state_stream = &submit->general_state_stream,
            .batch = batch,
            .kernel = copy_kernel,
            .l3_config = device->internal_kernels_l3_config,
         };
         anv_genX(device->info, emit_simple_shader_init)(&submit->simple_state);

         uint32_t num_traces = 0;
         for (uint32_t i = 0; i < cmd_buffer_count; i++) {
            if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
               intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
                                         &submit->ds, device->vk.current_frame, false);
            } else {
               num_traces += cmd_buffers[i]->trace.num_traces;
               u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
                                    u_trace_end_iterator(&cmd_buffers[i]->trace),
                                    &submit->ds.trace,
                                    &submit->simple_state,
                                    anv_device_utrace_emit_cs_copy_buffer);
            }
         }

         trace_intel_end_trace_copy_cb(&submit->ds.trace, batch, num_traces);

         anv_genX(device->info, emit_simple_shader_end)(&submit->simple_state);
      }

      if (batch->status != VK_SUCCESS) {
         result = batch->status;
         goto error_sync;
      }

      intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds,
                                device->vk.current_frame, true);
   } else {
      for (uint32_t i = 0; i < cmd_buffer_count; i++) {
         assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
         intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
                                   &submit->ds, device->vk.current_frame,
                                   i == (cmd_buffer_count - 1));
      }
   }

   *out_submit = submit;

   return VK_SUCCESS;

 error_sync:
   intel_ds_flush_data_fini(&submit->ds);
   anv_async_submit_fini(&submit->base);
 error_async:
   vk_free(&device->vk.alloc, submit);
   return result;
}

static void *
anv_utrace_create_buffer(struct u_trace_context *utctx, uint64_t size_B)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);

   struct anv_bo *bo = NULL;
   UNUSED VkResult result =
      anv_bo_pool_alloc(&device->utrace_bo_pool,
                        align(size_B, 4096),
                        &bo);
   assert(result == VK_SUCCESS);

   memset(bo->map, 0, bo->size);
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
   if (device->physical->memory.need_flush &&
       anv_bo_needs_host_cache_flush(bo->alloc_flags))
      intel_flush_range(bo->map, bo->size);
#endif

   return bo;
}
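
/* Capacity sketch: allocations are rounded up to 4096 bytes, so with
 * 32-byte union anv_utrace_timestamp entries a minimal 4KiB BO holds
 * 4096 / 32 = 128 timestamp slots. The zero-fill above also matters for
 * correctness: anv_utrace_read_ts() treats nonzero gfx20_postsync_data[1..3]
 * as the signature of a post-sync write, so stale data would misfire.
 */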

static void
anv_utrace_destroy_buffer(struct u_trace_context *utctx, void *timestamps)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_bo *bo = timestamps;

   anv_bo_pool_free(&device->utrace_bo_pool, bo);
}

static void
anv_utrace_record_ts(struct u_trace *ut, void *cs,
                     void *timestamps, uint64_t offset_B,
                     uint32_t flags)
{
   struct anv_device *device =
      container_of(ut->utctx, struct anv_device, ds.trace_context);
   struct anv_cmd_buffer *cmd_buffer =
      container_of(ut, struct anv_cmd_buffer, trace);
   /* cmd_buffer is only valid if cs == NULL */
   struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
   struct anv_bo *bo = timestamps;

   assert(offset_B % sizeof(union anv_utrace_timestamp) == 0);
   struct anv_address ts_address = (struct anv_address) {
      .bo = bo,
      .offset = offset_B,
   };

   /* Is this an end of compute trace point? */
   const bool is_end_compute =
      cs == NULL &&
      (flags & INTEL_DS_TRACEPOINT_FLAG_END_OF_PIPE_CS);

   assert(device->info->verx10 < 125 ||
          !is_end_compute ||
          cmd_buffer->state.last_indirect_dispatch != NULL ||
          cmd_buffer->state.last_compute_walker != NULL);

   enum anv_timestamp_capture_type capture_type =
      (device->info->verx10 >= 125 && is_end_compute) ?
      (cmd_buffer->state.last_indirect_dispatch != NULL ?
       ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH : ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER) :
      (flags & (INTEL_DS_TRACEPOINT_FLAG_END_OF_PIPE |
                INTEL_DS_TRACEPOINT_FLAG_END_OF_PIPE_CS)) ?
      ANV_TIMESTAMP_CAPTURE_END_OF_PIPE : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
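   /* Summary of the selection above: on verx10 >= 125, an end-of-compute
    * point rewrites the post-sync timestamp of the last walker (indirect
    * dispatch taking precedence over a direct one); otherwise any
    * end-of-pipe flag requests an end-of-pipe capture, and everything else
    * a top-of-pipe capture.
    */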

   void *addr = capture_type == ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH ?
                cmd_buffer->state.last_indirect_dispatch :
                capture_type == ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER ?
                cmd_buffer->state.last_compute_walker : NULL;

   device->physical->cmd_emit_timestamp(batch, device, ts_address,
                                        capture_type,
                                        addr);
   if (is_end_compute) {
      cmd_buffer->state.last_compute_walker = NULL;
      cmd_buffer->state.last_indirect_dispatch = NULL;
   }
}

static uint64_t
anv_utrace_read_ts(struct u_trace_context *utctx,
                   void *timestamps, uint64_t offset_B,
                   void *flush_data)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_bo *bo = timestamps;
   struct anv_utrace_submit *submit =
      container_of(flush_data, struct anv_utrace_submit, ds);

   /* Only need to stall on results for the first entry: */
   if (offset_B == 0) {
      MESA_TRACE_SCOPE("anv utrace wait timestamps");
      UNUSED VkResult result =
         vk_sync_wait(&device->vk,
                      submit->base.signal.sync,
                      submit->base.signal.signal_value,
                      VK_SYNC_WAIT_COMPLETE,
                      os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE));
      assert(result == VK_SUCCESS);
   }

   assert(offset_B % sizeof(union anv_utrace_timestamp) == 0);
   union anv_utrace_timestamp *ts =
      (union anv_utrace_timestamp *)(bo->map + offset_B);

   /* Don't translate the no-timestamp marker: */
   if (ts->timestamp == U_TRACE_NO_TIMESTAMP)
      return U_TRACE_NO_TIMESTAMP;

   /* Detect a 16/32-byte timestamp write */
   if (ts->gfx20_postsync_data[1] != 0 ||
       ts->gfx20_postsync_data[2] != 0 ||
       ts->gfx20_postsync_data[3] != 0) {
      if (device->info->ver >= 20) {
         return intel_device_info_timebase_scale(device->info,
                                                 ts->gfx20_postsync_data[3]);
      }

      /* The timestamp written by COMPUTE_WALKER::PostSync only has 32 bits.
       * We need to rebuild the full 64 bits using the previous timestamp. We
       * assume that utrace reads the timestamps in order. Anyway, a 32-bit
       * timestamp only rolls over every few minutes, so in most cases this
       * should be correct.
       */
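      /* Worked example (made-up values): with submit->last_full_timestamp =
       * 0x0000000510000000 and gfx125_postsync_data[3] = 0x20000000, the
       * reconstruction below yields 0x0000000520000000. Should the 32-bit
       * counter wrap between two reads, the upper word would be stale by
       * one, which is the inaccuracy the assumption above accepts.
       */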
      uint64_t timestamp =
         (submit->last_full_timestamp & 0xffffffff00000000) |
         (uint64_t) ts->gfx125_postsync_data[3];

      return intel_device_info_timebase_scale(device->info, timestamp);
   }

   submit->last_full_timestamp = ts->timestamp;

   return intel_device_info_timebase_scale(device->info, ts->timestamp);
}

static void
anv_utrace_capture_data(struct u_trace *ut,
                        void *cs,
                        void *dst_buffer,
                        uint64_t dst_offset_B,
                        void *src_buffer,
                        uint64_t src_offset_B,
                        uint32_t size_B)
{
   struct anv_device *device =
      container_of(ut->utctx, struct anv_device, ds.trace_context);
   struct anv_cmd_buffer *cmd_buffer =
      container_of(ut, struct anv_cmd_buffer, trace);
   /* cmd_buffer is only valid if cs == NULL */
   struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
   struct anv_address dst_addr = {
      .bo = dst_buffer,
      .offset = dst_offset_B,
   };
   struct anv_address src_addr = {
      .bo = src_buffer,
      .offset = src_offset_B,
   };

   device->physical->cmd_capture_data(batch, device, dst_addr, src_addr, size_B);
}

static const void *
anv_utrace_get_data(struct u_trace_context *utctx,
                    void *buffer, uint64_t offset_B, uint32_t size_B)
{
   struct anv_bo *bo = buffer;

   return bo->map + offset_B;
}

void
anv_device_utrace_init(struct anv_device *device)
{
   device->utrace_timestamp_size = sizeof(union anv_utrace_timestamp);

   anv_bo_pool_init(&device->utrace_bo_pool, device, "utrace",
                    ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_HOST_CACHED_COHERENT);
   intel_ds_device_init(&device->ds, device->info, device->fd,
                        device->physical->local_minor,
                        INTEL_DS_API_VULKAN);
   u_trace_context_init(&device->ds.trace_context,
                        &device->ds,
                        device->utrace_timestamp_size,
                        12,
                        anv_utrace_create_buffer,
                        anv_utrace_destroy_buffer,
                        anv_utrace_record_ts,
                        anv_utrace_read_ts,
                        anv_utrace_capture_data,
                        anv_utrace_get_data,
                        anv_utrace_delete_submit);

   for (uint32_t q = 0; q < device->queue_count; q++) {
      struct anv_queue *queue = &device->queues[q];

      intel_ds_device_init_queue(&device->ds, &queue->ds, "%s%u",
                                 intel_engines_class_to_string(queue->family->engine_class),
                                 queue->vk.index_in_family);
   }
}

void
anv_device_utrace_finish(struct anv_device *device)
{
   intel_ds_device_process(&device->ds, true);
   intel_ds_device_fini(&device->ds);
   anv_bo_pool_finish(&device->utrace_bo_pool);
}

enum intel_ds_stall_flag
anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits)
{
   static const struct {
      enum anv_pipe_bits anv;
      enum intel_ds_stall_flag ds;
   } anv_to_ds_flags[] = {
      { .anv = ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, .ds = INTEL_DS_DEPTH_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_DATA_CACHE_FLUSH_BIT, .ds = INTEL_DS_DATA_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_TILE_CACHE_FLUSH_BIT, .ds = INTEL_DS_TILE_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, .ds = INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_STATE_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_CONST_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_VF_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_VF_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_INST_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_DEPTH_STALL_BIT, .ds = INTEL_DS_DEPTH_STALL_BIT, },
      { .anv = ANV_PIPE_CS_STALL_BIT, .ds = INTEL_DS_CS_STALL_BIT, },
      { .anv = ANV_PIPE_HDC_PIPELINE_FLUSH_BIT, .ds = INTEL_DS_HDC_PIPELINE_FLUSH_BIT, },
      { .anv = ANV_PIPE_STALL_AT_SCOREBOARD_BIT, .ds = INTEL_DS_STALL_AT_SCOREBOARD_BIT, },
      { .anv = ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, .ds = INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_PSS_STALL_SYNC_BIT, .ds = INTEL_DS_PSS_STALL_SYNC_BIT, },
      { .anv = ANV_PIPE_END_OF_PIPE_SYNC_BIT, .ds = INTEL_DS_END_OF_PIPE_BIT, },
      { .anv = ANV_PIPE_CCS_CACHE_FLUSH_BIT, .ds = INTEL_DS_CCS_CACHE_FLUSH_BIT, },
   };

   enum intel_ds_stall_flag ret = 0;
   for (uint32_t i = 0; i < ARRAY_SIZE(anv_to_ds_flags); i++) {
      if (anv_to_ds_flags[i].anv & bits)
         ret |= anv_to_ds_flags[i].ds;
   }

   return ret;
}
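
/* For example, bits = ANV_PIPE_CS_STALL_BIT | ANV_PIPE_TILE_CACHE_FLUSH_BIT
 * maps to INTEL_DS_CS_STALL_BIT | INTEL_DS_TILE_CACHE_FLUSH_BIT; any
 * anv_pipe_bits without an entry in the table above are simply dropped.
 */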

void anv_CmdBeginDebugUtilsLabelEXT(
    VkCommandBuffer _commandBuffer,
    const VkDebugUtilsLabelEXT *pLabelInfo)
{
   VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);

   vk_common_CmdBeginDebugUtilsLabelEXT(_commandBuffer, pLabelInfo);

   trace_intel_begin_cmd_buffer_annotation(&cmd_buffer->trace);
}

void anv_CmdEndDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer)
{
   VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);

   if (cmd_buffer->vk.labels.size > 0) {
      const VkDebugUtilsLabelEXT *label =
         util_dynarray_top_ptr(&cmd_buffer->vk.labels, VkDebugUtilsLabelEXT);

      trace_intel_end_cmd_buffer_annotation(&cmd_buffer->trace,
                                            strlen(label->pLabelName),
                                            label->pLabelName);
   }

   vk_common_CmdEndDebugUtilsLabelEXT(_commandBuffer);
}

void
anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool begin)
{
   struct anv_device *device = queue->device;

   VkResult result;
   struct anv_utrace_submit *submit =
      vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
                8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!submit)
      return;

   result = anv_async_submit_init(&submit->base, queue,
                                  &device->batch_bo_pool,
                                  false, true);
   if (result != VK_SUCCESS)
      goto error_async;

   intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);

   struct anv_batch *batch = &submit->base.batch;
   if (frame) {
      if (begin)
         trace_intel_begin_frame(&submit->ds.trace, batch);
      else
         trace_intel_end_frame(&submit->ds.trace, batch,
                               device->debug_frame_desc->frame_id);
   } else {
      if (begin) {
         trace_intel_begin_queue_annotation(&submit->ds.trace, batch);
      } else {
         trace_intel_end_queue_annotation(&submit->ds.trace, batch,
                                          strlen(label), label);
      }
   }

   anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_END, bbs);
   anv_batch_emit(batch, GFX9_MI_NOOP, noop);

   if (batch->status != VK_SUCCESS) {
      result = batch->status;
      goto error_batch;
   }

   intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds,
                             device->vk.current_frame, true);

   result =
      device->kmd_backend->queue_exec_async(&submit->base,
                                            0, NULL, 0, NULL);
   if (result != VK_SUCCESS)
      goto error_batch;

   if (frame && !begin)
      intel_ds_device_process(&device->ds, true);

   return;

 error_batch:
   intel_ds_flush_data_fini(&submit->ds);
   anv_async_submit_fini(&submit->base);
 error_async:
   vk_free(&device->vk.alloc, submit);
}

void
anv_QueueBeginDebugUtilsLabelEXT(
    VkQueue _queue,
    const VkDebugUtilsLabelEXT *pLabelInfo)
{
   VK_FROM_HANDLE(anv_queue, queue, _queue);

   vk_common_QueueBeginDebugUtilsLabelEXT(_queue, pLabelInfo);

   anv_queue_trace(queue, pLabelInfo->pLabelName,
                   false /* frame */, true /* begin */);
}

void
anv_QueueEndDebugUtilsLabelEXT(VkQueue _queue)
{
   VK_FROM_HANDLE(anv_queue, queue, _queue);

   if (queue->vk.labels.size > 0) {
      const VkDebugUtilsLabelEXT *label =
         util_dynarray_top_ptr(&queue->vk.labels, VkDebugUtilsLabelEXT);
      anv_queue_trace(queue, label->pLabelName,
                      false /* frame */, false /* begin */);

      intel_ds_device_process(&queue->device->ds, true);
   }

   vk_common_QueueEndDebugUtilsLabelEXT(_queue);
}