/*
 * Copyright © 2021 Google, Inc.
 * SPDX-License-Identifier: MIT
 */

#ifndef FREEDRENO_AUTOTUNE_H
#define FREEDRENO_AUTOTUNE_H

#include "util/hash_table.h"
#include "util/list.h"

#include "freedreno_util.h"

struct fd_autotune_results;

/**
 * "autotune" our decisions about bypass vs GMEM rendering, based on historical
 * data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * cannot reasonably be estimated without some additional information:
 *
 *  (1) If you know you are touching every pixel (ie. there is a glClear()),
 *      then the GMEM path will at least not cost more memory bandwidth than
 *      sysmem[1]
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      due to the sysmem->GMEM restore pass.
 *
 *  (3) If you see a high draw count, that is an indication that there will be
 *      enough pixels accessed multiple times to benefit from the reduced
 *      memory bandwidth that GMEM brings
 *
 *  (4) But a high draw count without much overdraw can actually be faster
 *      in bypass mode if it is pushing a lot of state change, due to not
 *      having to go thru the state changes per-tile[2]
 *
 * The approach taken is to measure the samples-passed for the batch, as an
 * estimate of the amount of overdraw, to detect cases where the number of
 * pixels touched is low (see the sketch below).
 *
 * Note, however, that (at least since a5xx) we have PERF_RB_{Z,C}_{READ,WRITE}
 * performance countables, which give a more direct measurement of what we want
 * to know (ie. is framebuffer memory access high enough to prefer GMEM), but
 * with the downside of consuming half of the available RB counters, and with
 * the additional complication that external perfcntr collection (fdperf,
 * perfetto) and the driver could be stomping on each other's feet.  (Also,
 * reading the perfcntrs accurately requires a WFI.)
 *
 * [1] ignoring UBWC
 * [2] ignoring early-tile-exit optimizations, but any draw that touches all/
 *     most of the tiles late in the tile-pass can defeat that
 */
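
/*
 * A minimal sketch of the overdraw estimate described above (not the
 * driver's actual policy; the helper name and the 1.0 cutoff below are
 * hypothetical):
 */
#if 0
static bool
example_prefer_bypass(uint64_t samples_passed, uint32_t width, uint32_t height)
{
   /* Average number of times each pixel passed the depth/stencil test;
    * around 1.0 or less means little overdraw, so GMEM's bandwidth savings
    * are unlikely to pay for the per-tile overhead.
    */
   float avg_samples_per_pixel =
      (float)samples_passed / (float)(width * height);
   return avg_samples_per_pixel < 1.0f;   /* hypothetical cutoff */
}
#endif
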
struct fd_autotune {

   /**
    * Cache to map batch->key (also used for batch-cache) to historical
    * information about rendering to that particular render target.
    */
   struct hash_table *ht;

   /**
    * List of recently used historical results (to age out old results)
    */
   struct list_head lru;

   /**
    * GPU buffer used to communicate back results to the CPU
    */
   struct fd_bo *results_mem;
   struct fd_autotune_results *results;

   /**
    * List of per-batch results that we are waiting for the GPU to finish
    * with before reading back the results.
    */
   struct list_head pending_results;

   uint32_t fence_counter;
   uint32_t idx_counter;
};
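
/*
 * A hedged sketch of how the cache fields above work together (the helper
 * name and the fd_batch_history 'node' member are assumptions; the real
 * lookup lives in freedreno_autotune.c):
 */
#if 0
static struct fd_batch_history *
example_lookup_history(struct fd_autotune *at, uint32_t hash, const void *key)
{
   struct hash_entry *entry =
      _mesa_hash_table_search_pre_hashed(at->ht, hash, key);
   if (!entry)
      return NULL;   /* caller would allocate + insert a new history */

   struct fd_batch_history *history = entry->data;

   /* Move to the tail of the LRU so that stale render targets collect at
    * the head and can be aged out ('node' is an assumed list link):
    */
   list_del(&history->node);
   list_addtail(&history->node, &at->lru);

   return history;
}
#endif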

/**
 * The layout of the memory used to read back per-batch results from the
 * GPU.
 *
 * Note that this struct is intentionally aligned to 4k, and the hw requires
 * the sample start/stop locations to be 128b aligned.
 */
struct fd_autotune_results {

   /**
    * The GPU writes back a "fence" seqno value from the cmdstream after
    * it finishes writing its result slot, so that the CPU knows when
    * results are valid.
    */
   uint32_t fence;

   uint32_t __pad0;
   uint64_t __pad1;

   /**
    * From the cmdstream, the captured samples-passed values are recorded
    * at the start and end of the batch.
    *
    * Note that we do the math on the CPU to avoid a WFI.  But pre-emption
    * may force us to revisit that.
    */
   struct {
      uint64_t samples_start;
      uint64_t samples_result;
      uint64_t samples_end;
      uint64_t __pad1;
   } result[127];
};
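
/*
 * A quick sanity sketch of the layout constraints noted above (assuming
 * "128b" means 128-bit): the 16-byte header and 32-byte result slots keep
 * every samples_start/samples_end 16-byte aligned, and the whole struct
 * fits in a 4k page.
 */
#if 0
#include <stddef.h>
_Static_assert(offsetof(struct fd_autotune_results, result) % 16 == 0,
               "result slots must start 128b aligned");
_Static_assert(sizeof(((struct fd_autotune_results *)0)->result[0]) == 32,
               "each slot is two counters plus result and pad");
_Static_assert(sizeof(struct fd_autotune_results) <= 4096,
               "results buffer fits in one page");
#endif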

#define __offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base))
#define results_ptr(at, member)                                                \
   (at)->results_mem, __offset((at)->results, &(at)->results->member), 0, 0
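
/*
 * For example (illustrative; OUT_RELOC as used elsewhere in freedreno), a
 * backend can point a reloc at one member of the results buffer:
 *
 *    OUT_RELOC(ring, results_ptr(at, result[idx].samples_start));
 *
 * which expands to the results BO, the byte offset of that member within
 * the buffer, and zeros for the remaining reloc arguments.
 */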

struct fd_batch_history;

/**
 * Tracks the results from an individual batch.  Initially created per batch,
 * and appended to the tail of at->pending_results.  At a later time, when
 * the GPU has finished writing the results, they are read back and
 * accumulated into the render target's history.
 *
 * ralloc parent is the associated fd_batch_history
 */
struct fd_batch_result {

   /**
    * The index/slot in fd_autotune_results::result[] to write start/end
    * counter to
    */
   unsigned idx;

   /**
    * Fence value to write back to fd_autotune_results::fence after both
    * start/end values are written
    */
   uint32_t fence;

   /*
    * Below here, only used internally within autotune
    */
   struct fd_batch_history *history;
   struct list_head node;
   uint32_t cost;
   uint64_t samples_passed;
};
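
/*
 * A hedged sketch of the readback flow implied above (the helper name is
 * hypothetical; the real version lives in freedreno_autotune.c): once the
 * GPU-written fence catches up to a result's fence value, the samples
 * delta can be computed on the CPU without a WFI.
 */
#if 0
static void
example_process_results(struct fd_autotune *at)
{
   uint32_t current_fence = at->results->fence;   /* written by the GPU */

   list_for_each_entry_safe (struct fd_batch_result, result,
                             &at->pending_results, node) {
      if (result->fence > current_fence)
         break;   /* results are in fence order, the rest are pending */

      result->samples_passed = at->results->result[result->idx].samples_end -
                               at->results->result[result->idx].samples_start;

      list_delinit(&result->node);
      /* ... accumulate result into result->history ... */
   }
}
#endif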

void fd_autotune_init(struct fd_autotune *at, struct fd_device *dev);
void fd_autotune_fini(struct fd_autotune *at);

struct fd_batch;
bool fd_autotune_use_bypass(struct fd_autotune *at,
                            struct fd_batch *batch) assert_dt;
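
/*
 * Typical usage sketch (the caller and the ctx->autotune embedding are
 * assumptions; the real decision point is wherever a batch is flushed
 * and the gmem vs bypass path is chosen):
 */
#if 0
static void
example_flush_batch(struct fd_context *ctx, struct fd_batch *batch)
{
   if (fd_autotune_use_bypass(&ctx->autotune, batch)) {
      /* draw directly to system memory */
   } else {
      /* tiled rendering through GMEM */
   }
}
#endif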

#endif /* FREEDRENO_AUTOTUNE_H */