xref: /aosp_15_r20/external/mesa3d/src/util/perf/u_trace.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2020 Google, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #ifndef _U_TRACE_H
25 #define _U_TRACE_H
26 
27 #include <stdbool.h>
28 #include <stdint.h>
29 #include <stdio.h>
30 
31 #include "util/macros.h"
32 #include "util/u_atomic.h"
33 #include "util/u_queue.h"
34 
35 #ifdef __cplusplus
36 extern "C" {
37 #endif
38 
/* A trace mechanism (very) loosely inspired by the linux kernel tracepoint
 * mechanism, in that it allows for defining driver specific (or common)
 * tracepoints, which generate 'trace_$name()' functions that can be
 * called at various points in commandstream emit.
 *
 * Currently a printf backend is implemented, but the expectation is to
 * also implement a perfetto backend for shipping out traces to a tool like
 * AGI.
 *
 * Notable differences:
 *
 *  - GPU timestamps!  A driver provided callback is used to emit timestamps
 *    to a buffer.  At a later point in time (when stalling to wait for the
 *    GPU is not required), the timestamps are re-united with the trace
 *    payload.  This makes the trace mechanism suitable for profiling.
 *
 *  - Instead of a systemwide trace ringbuffer, buffering of un-retired
 *    tracepoints is split into two stages.  Traces are emitted to a
 *    'u_trace' instance, and at a later time flushed to a 'u_trace_context'
 *    instance.  This avoids the requirement that commandstream containing
 *    tracepoints is emitted in the same order as it is generated.
 *
 *    If the hw has multiple parallel "engines" (for example, 3d/blit/compute)
 *    then a `u_trace_context` per-engine should be used.
 *
 *  - Unlike kernel tracepoints, u_trace tracepoints are defined in py
 *    from which header and src files are generated.  Since we already have
 *    a build dependency on python+mako, this gives more flexibility than
 *    clunky preprocessor macro magic.
 *
 */
70 
/* Forward declarations so the callback typedefs below can take pointers to
 * these types.  u_trace_context and u_trace are defined later in this
 * header; u_trace_chunk and u_trace_printer are only used through pointers
 * here.
 */
struct u_trace_context;
struct u_trace;
struct u_trace_chunk;
struct u_trace_printer;
75 
/**
 * Special reserved value to indicate that no timestamp was captured,
 * and that the timestamp of the previous trace should be reused.
 */
#define U_TRACE_NO_TIMESTAMP ((uint64_t) 0)
81 
/**
 * Address representation
 */
struct u_trace_address {
   /**
    * Pointer to a buffer object
    */
   void *bo;
   /**
    * Offset inside the buffer object, or the address itself if bo is NULL
    */
   uint64_t offset;
};
95 
/**
 * Driver provided callback to create a buffer which will be read by
 * u_trace_read_ts function.
 *
 * The requested size is passed in size_B; the callback returns an opaque
 * pointer to the new buffer.
 */
typedef void *(*u_trace_create_buffer)(struct u_trace_context *utctx,
                                       uint64_t size_B);
102 
/**
 * Driver provided callback to delete a buffer.
 */
typedef void (*u_trace_delete_buffer)(struct u_trace_context *utctx,
                                      void *buffer);
108 
/**
 * Driver provided callback to emit commands into the specified command
 * stream to capture a 64b timestamp into the specified timestamps buffer,
 * at the specified offset (offset_B).
 *
 * The hw counter that the driver records should be something that runs at
 * a fixed rate, even as the GPU freq changes.  The same source used for
 * GL_TIMESTAMP queries should be appropriate.
 */
typedef void (*u_trace_record_ts)(struct u_trace *ut,
                                  void *cs,
                                  void *timestamps,
                                  uint64_t offset_B,
                                  uint32_t flags);
123 
/**
 * Driver provided callback to capture indirect data.
 *
 * The callback is handed a command stream (cs), a destination buffer and
 * offset, a source buffer and offset, and the size of the data (size_B).
 */
typedef void (*u_trace_capture_data)(struct u_trace *ut,
                                     void *cs,
                                     void *dst_buffer,
                                     uint64_t dst_offset_B,
                                     void *src_buffer,
                                     uint64_t src_offset_B,
                                     uint32_t size_B);
134 
/**
 * Driver provided callback to read back previously recorded indirect data.
 */
typedef const void *(*u_trace_get_data)(struct u_trace_context *utctx,
                                        void *buffer,
                                        uint64_t offset_B,
                                        uint32_t size_B);
/**
 * Driver provided callback to read back a previously recorded timestamp.
 * If necessary, this should block until the GPU has finished writing back
 * the timestamps.  (The timestamps will be read back in order, so it is
 * safe to only synchronize on idx==0.)
 *
 * flush_data is data provided by the driver via u_trace_flush.
 *
 * The returned timestamp should be in units of nanoseconds.  The same
 * timebase as GL_TIMESTAMP queries should be used.
 *
 * The driver can return the special U_TRACE_NO_TIMESTAMP value to indicate
 * that no timestamp was captured and the timestamp from the previous trace
 * will be re-used.  (The first trace in the u_trace buf may not do this.)
 * This allows the driver to detect cases where multiple tracepoints are
 * emitted with no other intervening cmdstream, to avoid pointlessly
 * capturing the same timestamp multiple times in a row.
 */
typedef uint64_t (*u_trace_read_ts)(struct u_trace_context *utctx,
                                    void *timestamps,
                                    uint64_t offset_B,
                                    void *flush_data);
164 
/**
 * Driver provided callback to copy data between two buffers.
 *
 * NOTE(review): the previous comment on this typedef was a verbatim copy of
 * the u_trace_create_buffer description.  Judging by the parameter names,
 * this presumably copies size_B bytes from src (+ src_offset_B) to dst
 * (+ dst_offset_B) using the command stream cs -- confirm against the
 * users of this callback.  The meaning of the returned pointer is not
 * visible from this header.
 */
typedef void *(*u_trace_copy_data)(struct u_trace *ut,
                                   void *cs,
                                   void *dst,
                                   uint64_t dst_offset_B,
                                   void *src,
                                   uint64_t src_offset_B,
                                   uint64_t size_B);
176 
/**
 * Driver provided callback to delete flush data.
 */
typedef void (*u_trace_delete_flush_data)(struct u_trace_context *utctx,
                                          void *flush_data);
182 
/**
 * Bitmask of trace types.  The first group are the individual flag bits,
 * the remaining enumerators are combinations of those bits.
 */
enum u_trace_type {
   /* Individual flag bits: */
   U_TRACE_TYPE_PRINT = 1u << 0,
   U_TRACE_TYPE_JSON = 1u << 1,
   U_TRACE_TYPE_PERFETTO_ACTIVE = 1u << 2,
   U_TRACE_TYPE_PERFETTO_ENV = 1u << 3,
   U_TRACE_TYPE_MARKERS = 1u << 4,
   U_TRACE_TYPE_INDIRECTS = 1u << 5,
   U_TRACE_TYPE_CSV = 1u << 6,

   /* Convenience combinations: */
   U_TRACE_TYPE_PRINT_CSV = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_CSV,
   U_TRACE_TYPE_PRINT_JSON = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_JSON,
   U_TRACE_TYPE_PERFETTO =
      U_TRACE_TYPE_PERFETTO_ACTIVE | U_TRACE_TYPE_PERFETTO_ENV,

   /*
    * A mask of traces that require appending to the tracepoint chunk list.
    */
   U_TRACE_TYPE_REQUIRE_QUEUING = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_PERFETTO,
   /*
    * A mask of traces that require processing the tracepoint chunk list.
    */
   U_TRACE_TYPE_REQUIRE_PROCESSING =
      U_TRACE_TYPE_PRINT | U_TRACE_TYPE_PERFETTO_ACTIVE,
};
207 
208 /**
209  * The trace context provides tracking for "in-flight" traces, once the
210  * cmdstream that records timestamps has been flushed.
211  */
212 struct u_trace_context {
213    /* All traces enabled in this context */
214    enum u_trace_type enabled_traces;
215 
216    void *pctx;
217 
218    u_trace_create_buffer create_buffer;
219    u_trace_delete_buffer delete_buffer;
220    u_trace_capture_data capture_data;
221    u_trace_get_data get_data;
222    u_trace_record_ts record_timestamp;
223    u_trace_read_ts read_timestamp;
224    u_trace_delete_flush_data delete_flush_data;
225 
226    uint64_t timestamp_size_bytes;
227    uint64_t max_indirect_size_bytes;
228 
229    FILE *out;
230    struct u_trace_printer *out_printer;
231 
232    /* Once u_trace_flush() is called u_trace_chunk's are queued up to
233     * render tracepoints on a queue.  The per-chunk queue jobs block until
234     * timestamps are available.
235     */
236    struct util_queue queue;
237 
238 #ifdef HAVE_PERFETTO
239    /* node in global list of trace contexts. */
240    struct list_head node;
241 #endif
242 
243    /* State to accumulate time across N chunks associated with a single
244     * batch (u_trace).
245     */
246    uint64_t last_time_ns;
247    uint64_t first_time_ns;
248 
249    uint32_t frame_nr;
250    uint32_t batch_nr;
251    uint32_t event_nr;
252    bool start_of_frame;
253 
254    void *dummy_indirect_data;
255 
256    /* list of unprocessed trace chunks in fifo order: */
257    struct list_head flushed_trace_chunks;
258 };
259 
260 /**
261  * The u_trace ptr is passed as the first arg to generated tracepoints.
262  * It provides buffering for tracepoint payload until the corresponding
263  * driver cmdstream containing the emitted commands to capture is
264  * flushed.
265  *
266  * Individual tracepoints emitted to u_trace are expected to be "executed"
267  * (ie. timestamp captured) in FIFO order with respect to other tracepoints
268  * emitted to the same u_trace.  But the order WRT other u_trace instances
269  * is undefined util u_trace_flush().
270  */
271 struct u_trace {
272    struct u_trace_context *utctx;
273 
274    uint32_t num_traces;
275 
276    struct list_head
277       trace_chunks; /* list of unflushed trace chunks in fifo order */
278 };
279 
280 void u_trace_context_init(struct u_trace_context *utctx,
281                           void *pctx,
282                           uint32_t timestamp_size_bytes,
283                           uint32_t max_indirect_size_bytes,
284                           u_trace_create_buffer create_buffer,
285                           u_trace_delete_buffer delete_buffer,
286                           u_trace_record_ts record_timestamp,
287                           u_trace_read_ts read_timestamp,
288                           u_trace_capture_data capture_data,
289                           u_trace_get_data get_data,
290                           u_trace_delete_flush_data delete_flush_data);
291 void u_trace_context_fini(struct u_trace_context *utctx);
292 
/**
 * Flush (trigger processing) of traces previously flushed to the
 * trace-context by u_trace_flush().
 *
 * This should typically be called in the driver's pctx->flush().
 */
void u_trace_context_process(struct u_trace_context *utctx, bool eof);
300 
/* Set up / tear down a u_trace instance attached to the given context. */
void u_trace_init(struct u_trace *ut, struct u_trace_context *utctx);
void u_trace_fini(struct u_trace *ut);

/* Global (not per-context) trace state and queries. */
void u_trace_state_init(void);
bool u_trace_is_enabled(enum u_trace_type type);

bool u_trace_has_points(struct u_trace *ut);
308 
/**
 * Position within a u_trace: the owning u_trace, a chunk, and an event
 * index.
 */
struct u_trace_iterator {
   struct u_trace *ut;
   struct u_trace_chunk *chunk;
   uint32_t event_idx;
};
314 
/* Iterators delimiting the tracepoints recorded in a u_trace. */
struct u_trace_iterator u_trace_begin_iterator(struct u_trace *ut);

struct u_trace_iterator u_trace_end_iterator(struct u_trace *ut);

/* Compare two iterators for equality. */
bool u_trace_iterator_equal(struct u_trace_iterator a,
                            struct u_trace_iterator b);
321 
/**
 * Driver provided callback, passed to u_trace_clone_append(), that is
 * given a source buffer + offset (ts_from, from_offset), a destination
 * buffer + offset (ts_to, to_offset), a size (size_B) and the command
 * stream to use.
 */
typedef void (*u_trace_copy_buffer)(struct u_trace_context *utctx,
                                    void *cmdstream,
                                    void *ts_from,
                                    uint64_t from_offset,
                                    void *ts_to,
                                    uint64_t to_offset,
                                    uint64_t size_B);
329 
330 /**
331  * Clones tracepoints range into target u_trace.
332  * Provides callback for driver to copy timestamps on GPU from
333  * one buffer to another.
334  *
335  * It allows:
336  * - Tracing re-usable command buffer in Vulkan, by copying tracepoints
337  *   each time it is submitted.
338  * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints
339  *   corresponding to a tile.
340  */
341 void u_trace_clone_append(struct u_trace_iterator begin_it,
342                           struct u_trace_iterator end_it,
343                           struct u_trace *into,
344                           void *cmdstream,
345                           u_trace_copy_buffer copy_buffer);
346 
347 void u_trace_disable_event_range(struct u_trace_iterator begin_it,
348                                  struct u_trace_iterator end_it);
349 
/* Parenthesized so the negative value stays a single operand wherever the
 * macro is expanded.
 */
#define U_TRACE_FRAME_UNKNOWN (-1)
/**
 * Flush traces to the parent trace-context.  At this point, the expectation
 * is that all the tracepoints are "executed" by the GPU following any
 * previously flushed u_trace batch.
 *
 * flush_data is a way for driver to pass additional data, which becomes
 * available only at the point of flush, to the u_trace_read_ts callback and
 * perfetto. The typical example of such data would be a fence to wait on in
 * u_trace_read_ts, and a submission_id to pass into perfetto. The destruction
 * of the data is done via u_trace_delete_flush_data.
 *
 * This should typically be called when the corresponding cmdstream
 * (containing the timestamp reads) is flushed to the kernel.
 */
void u_trace_flush(struct u_trace *ut,
                   void *flush_data,
                   uint32_t frame_nr,
                   bool free_data);
369 
370 #ifdef HAVE_PERFETTO
371 static ALWAYS_INLINE bool
u_trace_perfetto_active(struct u_trace_context * utctx)372 u_trace_perfetto_active(struct u_trace_context *utctx)
373 {
374    return p_atomic_read_relaxed(&utctx->enabled_traces) &
375           U_TRACE_TYPE_PERFETTO_ACTIVE;
376 }
377 
378 void u_trace_perfetto_start(void);
379 void u_trace_perfetto_stop(void);
380 #else
381 static ALWAYS_INLINE bool
u_trace_perfetto_active(UNUSED struct u_trace_context * utctx)382 u_trace_perfetto_active(UNUSED struct u_trace_context *utctx)
383 {
384    return false;
385 }
386 #endif
387 
388 /**
389  * Return whether utrace is enabled at all or not, this can be used to
390  * gate any expensive traces.
391  */
392 static ALWAYS_INLINE bool
u_trace_enabled(struct u_trace_context * utctx)393 u_trace_enabled(struct u_trace_context *utctx)
394 {
395    return p_atomic_read_relaxed(&utctx->enabled_traces) != 0;
396 }
397 
398 /**
399  * Return whether chunks should be processed or not.
400  */
401 static ALWAYS_INLINE bool
u_trace_should_process(struct u_trace_context * utctx)402 u_trace_should_process(struct u_trace_context *utctx)
403 {
404    return p_atomic_read_relaxed(&utctx->enabled_traces) &
405           U_TRACE_TYPE_REQUIRE_PROCESSING;
406 }
407 
408 /**
409  * Return whether to emit markers into the command stream even if the queue
410  * isn't active.
411  */
412 static ALWAYS_INLINE bool
u_trace_markers_enabled(struct u_trace_context * utctx)413 u_trace_markers_enabled(struct u_trace_context *utctx)
414 {
415    return p_atomic_read_relaxed(&utctx->enabled_traces) &
416           U_TRACE_TYPE_MARKERS;
417 }
418 
419 #ifdef __cplusplus
420 }
421 #endif
422 
423 #endif /* _U_TRACE_H */
424