1 /*
2 * Copyright © 2020 Google, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #ifndef _U_TRACE_H
25 #define _U_TRACE_H
26
27 #include <stdbool.h>
28 #include <stdint.h>
29 #include <stdio.h>
30
31 #include "util/macros.h"
32 #include "util/u_atomic.h"
33 #include "util/u_queue.h"
34
35 #ifdef __cplusplus
36 extern "C" {
37 #endif
38
39 /* A trace mechanism (very) loosely inspired by the linux kernel tracepoint
40 * mechanism, in that it allows for defining driver specific (or common)
41 * tracepoints, which generate 'trace_$name()' functions that can be
42 * called at various points in commandstream emit.
43 *
44 * Currently a printf backend is implemented, but the expectation is to
45 * also implement a perfetto backend for shipping out traces to a tool like
46 * AGI.
47 *
48 * Notable differences:
49 *
50 * - GPU timestamps! A driver provided callback is used to emit timestamps
51 * to a buffer. At a later point in time (when stalling to wait for the
52 * GPU is not required), the timestamps are re-united with the trace
53 * payload. This makes the trace mechanism suitable for profiling.
54 *
55 * - Instead of a systemwide trace ringbuffer, buffering of un-retired
56 * tracepoints is split into two stages. Traces are emitted to a
57 * 'u_trace' instance, and at a later time flushed to a 'u_trace_context'
58 * instance. This avoids the requirement that commandstream containing
59 * tracepoints is emitted in the same order as it is generated.
60 *
61 * If the hw has multiple parallel "engines" (for example, 3d/blit/compute)
62 * then a `u_trace_context` per-engine should be used.
63 *
64 * - Unlike kernel tracepoints, u_trace tracepoints are defined in py
65 * from which header and src files are generated. Since we already have
66 * a build dependency on python+mako, this gives more flexibility than
67 * clunky preprocessor macro magic.
68 *
69 */
70
/* Forward declarations, so that the callback typedefs and prototypes below
 * can refer to these types through pointers.  u_trace_chunk and
 * u_trace_printer are only ever declared (never defined) in this header.
 */
struct u_trace;
struct u_trace_chunk;
struct u_trace_context;
struct u_trace_printer;
75
/**
 * Reserved timestamp value meaning "no timestamp was captured for this
 * trace; reuse the timestamp of the previous trace instead".
 */
#define U_TRACE_NO_TIMESTAMP ((uint64_t)0)
81
/**
 * Address representation: either a (buffer object, offset) pair or, when
 * no buffer object is given, a plain address.
 */
struct u_trace_address {
   /** Pointer to a buffer object (may be NULL, see offset). */
   void *bo;

   /** Offset inside the buffer object, or the address itself if bo is NULL. */
   uint64_t offset;
};
95
/**
 * Driver provided callback that creates a buffer of size_B which will
 * later be read through the u_trace_read_ts callback.
 *
 * Returns the driver's handle for the new buffer.
 */
typedef void *(*u_trace_create_buffer)(
   struct u_trace_context *utctx, uint64_t size_B);
102
/**
 * Driver provided callback that deletes the given buffer.
 */
typedef void (*u_trace_delete_buffer)(
   struct u_trace_context *utctx, void *buffer);
108
/**
 * Driver provided callback that emits commands into the specified command
 * stream (cs) to capture a 64b timestamp into the specified timestamps
 * buffer, at the specified offset (offset_B).
 *
 * The hw counter recorded by the driver should run at a fixed rate, even
 * while the GPU frequency changes.  The source used for GL_TIMESTAMP
 * queries should be appropriate.
 *
 * NOTE(review): the meaning of 'flags' is not described in this header;
 * confirm against the implementation.
 */
typedef void (*u_trace_record_ts)(
   struct u_trace *ut, void *cs,
   void *timestamps, uint64_t offset_B,
   uint32_t flags);
123
/**
 * Driver provided callback that captures indirect data.
 *
 * The callback is handed a command stream (cs), a destination buffer and
 * offset, a source buffer and offset, and a size (size_B).
 */
typedef void (*u_trace_capture_data)(
   struct u_trace *ut, void *cs,
   void *dst_buffer, uint64_t dst_offset_B,
   void *src_buffer, uint64_t src_offset_B,
   uint32_t size_B);
134
/**
 * Driver provided callback that reads back indirect data which was
 * recorded earlier.  The data is identified by buffer, offset_B and
 * size_B, and is returned as a read-only pointer.
 */
typedef const void *(*u_trace_get_data)(
   struct u_trace_context *utctx,
   void *buffer, uint64_t offset_B, uint32_t size_B);
/**
 * Driver provided callback that reads back a timestamp which was recorded
 * earlier.
 *
 * If necessary, the callback should block until the GPU has finished
 * writing back the timestamps.  Timestamps are read back in order, so it
 * is safe to only synchronize on idx==0.
 * NOTE(review): "idx" presumably corresponds to offset_B == 0 here, since
 * the callback takes an offset rather than an index; confirm.
 *
 * flush_data is the data the driver handed to u_trace_flush().
 *
 * The returned timestamp should be in nanoseconds and use the same
 * timebase as GL_TIMESTAMP queries.
 *
 * The driver may return the special value U_TRACE_NO_TIMESTAMP to signal
 * that no timestamp was captured, in which case the timestamp of the
 * previous trace is re-used.  (The first trace in the u_trace buf may not
 * do this.)  This lets the driver detect several tracepoints emitted with
 * no other intervening cmdstream and avoid pointlessly capturing the same
 * timestamp multiple times in a row.
 */
typedef uint64_t (*u_trace_read_ts)(
   struct u_trace_context *utctx,
   void *timestamps, uint64_t offset_B,
   void *flush_data);
164
/**
 * Driver provided callback to copy data between two buffers.
 *
 * The callback receives a command stream (cs), a destination buffer and
 * offset (dst, dst_offset_B), a source buffer and offset (src,
 * src_offset_B) and the number of bytes to copy (size_B).
 *
 * NOTE(review): the previous comment here was a verbatim copy of the
 * u_trace_create_buffer description and did not match this signature.
 * The meaning of the returned pointer is not documented in this header;
 * confirm against the users of this typedef.
 */
typedef void *(*u_trace_copy_data)(struct u_trace *ut,
                                   void *cs,
                                   void *dst,
                                   uint64_t dst_offset_B,
                                   void *src,
                                   uint64_t src_offset_B,
                                   uint64_t size_B);
176
/**
 * Driver provided callback that destroys flush data (the flush_data
 * pointer the driver passed to u_trace_flush()).
 */
typedef void (*u_trace_delete_flush_data)(
   struct u_trace_context *utctx, void *flush_data);
182
/**
 * Bitmask of trace types.  The first group are the individual bits, the
 * remaining enumerators are combinations of those bits.
 */
enum u_trace_type {
   /* Individual bits: */
   U_TRACE_TYPE_PRINT = 1u << 0,
   U_TRACE_TYPE_JSON = 1u << 1,
   U_TRACE_TYPE_PERFETTO_ACTIVE = 1u << 2,
   U_TRACE_TYPE_PERFETTO_ENV = 1u << 3,
   U_TRACE_TYPE_MARKERS = 1u << 4,
   U_TRACE_TYPE_INDIRECTS = 1u << 5,
   U_TRACE_TYPE_CSV = 1u << 6,

   /* Combinations: */
   U_TRACE_TYPE_PRINT_CSV = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_CSV,
   U_TRACE_TYPE_PRINT_JSON = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_JSON,
   U_TRACE_TYPE_PERFETTO =
      U_TRACE_TYPE_PERFETTO_ACTIVE | U_TRACE_TYPE_PERFETTO_ENV,

   /* Mask of the trace types that require appending to the tracepoint
    * chunk list.
    */
   U_TRACE_TYPE_REQUIRE_QUEUING = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_PERFETTO,

   /* Mask of the trace types that require processing the tracepoint
    * chunk list.
    */
   U_TRACE_TYPE_REQUIRE_PROCESSING =
      U_TRACE_TYPE_PRINT | U_TRACE_TYPE_PERFETTO_ACTIVE,
};
207
208 /**
209 * The trace context provides tracking for "in-flight" traces, once the
210 * cmdstream that records timestamps has been flushed.
211 */
212 struct u_trace_context {
213 /* All traces enabled in this context */
214 enum u_trace_type enabled_traces;
215
216 void *pctx;
217
218 u_trace_create_buffer create_buffer;
219 u_trace_delete_buffer delete_buffer;
220 u_trace_capture_data capture_data;
221 u_trace_get_data get_data;
222 u_trace_record_ts record_timestamp;
223 u_trace_read_ts read_timestamp;
224 u_trace_delete_flush_data delete_flush_data;
225
226 uint64_t timestamp_size_bytes;
227 uint64_t max_indirect_size_bytes;
228
229 FILE *out;
230 struct u_trace_printer *out_printer;
231
232 /* Once u_trace_flush() is called u_trace_chunk's are queued up to
233 * render tracepoints on a queue. The per-chunk queue jobs block until
234 * timestamps are available.
235 */
236 struct util_queue queue;
237
238 #ifdef HAVE_PERFETTO
239 /* node in global list of trace contexts. */
240 struct list_head node;
241 #endif
242
243 /* State to accumulate time across N chunks associated with a single
244 * batch (u_trace).
245 */
246 uint64_t last_time_ns;
247 uint64_t first_time_ns;
248
249 uint32_t frame_nr;
250 uint32_t batch_nr;
251 uint32_t event_nr;
252 bool start_of_frame;
253
254 void *dummy_indirect_data;
255
256 /* list of unprocessed trace chunks in fifo order: */
257 struct list_head flushed_trace_chunks;
258 };
259
260 /**
261 * The u_trace ptr is passed as the first arg to generated tracepoints.
262 * It provides buffering for tracepoint payload until the corresponding
263 * driver cmdstream containing the emitted commands to capture is
264 * flushed.
265 *
266 * Individual tracepoints emitted to u_trace are expected to be "executed"
267 * (ie. timestamp captured) in FIFO order with respect to other tracepoints
268 * emitted to the same u_trace. But the order WRT other u_trace instances
269 * is undefined util u_trace_flush().
270 */
271 struct u_trace {
272 struct u_trace_context *utctx;
273
274 uint32_t num_traces;
275
276 struct list_head
277 trace_chunks; /* list of unflushed trace chunks in fifo order */
278 };
279
280 void u_trace_context_init(struct u_trace_context *utctx,
281 void *pctx,
282 uint32_t timestamp_size_bytes,
283 uint32_t max_indirect_size_bytes,
284 u_trace_create_buffer create_buffer,
285 u_trace_delete_buffer delete_buffer,
286 u_trace_record_ts record_timestamp,
287 u_trace_read_ts read_timestamp,
288 u_trace_capture_data capture_data,
289 u_trace_get_data get_data,
290 u_trace_delete_flush_data delete_flush_data);
291 void u_trace_context_fini(struct u_trace_context *utctx);
292
/**
 * Trigger processing of the traces that u_trace_flush() has previously
 * flushed to this trace-context.
 *
 * Drivers should typically call this from their pctx->flush().
 *
 * NOTE(review): 'eof' presumably marks an end-of-frame; confirm against
 * the implementation.
 */
void u_trace_context_process(
   struct u_trace_context *utctx, bool eof);
300
/* Per-u_trace setup/teardown; a u_trace is bound to a trace context at
 * init time.
 */
void u_trace_init(struct u_trace *ut, struct u_trace_context *utctx);
void u_trace_fini(struct u_trace *ut);

/* Queries/initialization that take no trace context.
 * NOTE(review): presumably these operate on process-global state; confirm.
 */
void u_trace_state_init(void);
bool u_trace_is_enabled(enum u_trace_type type);

/* NOTE(review): presumably true when 'ut' holds at least one tracepoint. */
bool u_trace_has_points(struct u_trace *ut);
308
/**
 * Position inside a u_trace: the owning u_trace, a chunk, and an event
 * index.
 */
struct u_trace_iterator {
   struct u_trace *ut;
   struct u_trace_chunk *chunk;
   uint32_t event_idx;
};

/* Iterators for the begin and end of 'ut'. */
struct u_trace_iterator u_trace_begin_iterator(struct u_trace *ut);
struct u_trace_iterator u_trace_end_iterator(struct u_trace *ut);

/* Compare two iterators for equality. */
bool u_trace_iterator_equal(
   struct u_trace_iterator a, struct u_trace_iterator b);
321
/**
 * Driver callback used by u_trace_clone_append() to copy timestamps on the
 * GPU from one buffer (ts_from at from_offset) to another (ts_to at
 * to_offset); size_B is the amount to copy.
 */
typedef void (*u_trace_copy_buffer)(
   struct u_trace_context *utctx, void *cmdstream,
   void *ts_from, uint64_t from_offset,
   void *ts_to, uint64_t to_offset,
   uint64_t size_B);
329
330 /**
331 * Clones tracepoints range into target u_trace.
332 * Provides callback for driver to copy timestamps on GPU from
333 * one buffer to another.
334 *
335 * It allows:
336 * - Tracing re-usable command buffer in Vulkan, by copying tracepoints
337 * each time it is submitted.
338 * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints
339 * corresponding to a tile.
340 */
341 void u_trace_clone_append(struct u_trace_iterator begin_it,
342 struct u_trace_iterator end_it,
343 struct u_trace *into,
344 void *cmdstream,
345 u_trace_copy_buffer copy_buffer);
346
/* Disable the events between the two iterators.
 * NOTE(review): exact range semantics (inclusive/exclusive end) are not
 * visible in this header; confirm against the implementation.
 */
void u_trace_disable_event_range(
   struct u_trace_iterator begin_it, struct u_trace_iterator end_it);
349
/* Placeholder frame number.  u_trace_flush() takes frame_nr as a uint32_t,
 * so this converts to UINT32_MAX when passed there.  The replacement list
 * is parenthesized so the macro always expands as a single operand.
 */
#define U_TRACE_FRAME_UNKNOWN (-1)
/**
 * Flush the traces of 'ut' to its parent trace-context.  From this point
 * on, all of its tracepoints are expected to be "executed" by the GPU
 * after any previously flushed u_trace batch.
 *
 * flush_data lets the driver pass along additional data that only becomes
 * available at flush time; it is handed to the u_trace_read_ts callback
 * and to perfetto.  Typical examples are a fence to wait on in
 * u_trace_read_ts and a submission_id to pass into perfetto.  The data is
 * destroyed through u_trace_delete_flush_data.
 *
 * Typically called when the corresponding cmdstream (the one containing
 * the timestamp reads) is flushed to the kernel.
 *
 * NOTE(review): frame_nr presumably accepts U_TRACE_FRAME_UNKNOWN, and
 * free_data presumably controls whether flush_data is destroyed on the
 * caller's behalf; neither is spelled out in this header, so confirm.
 */
void u_trace_flush(
   struct u_trace *ut, void *flush_data, uint32_t frame_nr, bool free_data);
369
370 #ifdef HAVE_PERFETTO
371 static ALWAYS_INLINE bool
u_trace_perfetto_active(struct u_trace_context * utctx)372 u_trace_perfetto_active(struct u_trace_context *utctx)
373 {
374 return p_atomic_read_relaxed(&utctx->enabled_traces) &
375 U_TRACE_TYPE_PERFETTO_ACTIVE;
376 }
377
378 void u_trace_perfetto_start(void);
379 void u_trace_perfetto_stop(void);
380 #else
381 static ALWAYS_INLINE bool
u_trace_perfetto_active(UNUSED struct u_trace_context * utctx)382 u_trace_perfetto_active(UNUSED struct u_trace_context *utctx)
383 {
384 return false;
385 }
386 #endif
387
388 /**
389 * Return whether utrace is enabled at all or not, this can be used to
390 * gate any expensive traces.
391 */
392 static ALWAYS_INLINE bool
u_trace_enabled(struct u_trace_context * utctx)393 u_trace_enabled(struct u_trace_context *utctx)
394 {
395 return p_atomic_read_relaxed(&utctx->enabled_traces) != 0;
396 }
397
398 /**
399 * Return whether chunks should be processed or not.
400 */
401 static ALWAYS_INLINE bool
u_trace_should_process(struct u_trace_context * utctx)402 u_trace_should_process(struct u_trace_context *utctx)
403 {
404 return p_atomic_read_relaxed(&utctx->enabled_traces) &
405 U_TRACE_TYPE_REQUIRE_PROCESSING;
406 }
407
408 /**
409 * Return whether to emit markers into the command stream even if the queue
410 * isn't active.
411 */
412 static ALWAYS_INLINE bool
u_trace_markers_enabled(struct u_trace_context * utctx)413 u_trace_markers_enabled(struct u_trace_context *utctx)
414 {
415 return p_atomic_read_relaxed(&utctx->enabled_traces) &
416 U_TRACE_TYPE_MARKERS;
417 }
418
419 #ifdef __cplusplus
420 }
421 #endif
422
423 #endif /* _U_TRACE_H */
424