xref: /aosp_15_r20/external/mesa3d/src/freedreno/vulkan/tu_autotune.cc (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1*61046927SAndroid Build Coastguard Worker /*
2*61046927SAndroid Build Coastguard Worker  * Copyright © 2021 Igalia S.L.
3*61046927SAndroid Build Coastguard Worker  * SPDX-License-Identifier: MIT
4*61046927SAndroid Build Coastguard Worker  */
5*61046927SAndroid Build Coastguard Worker 
6*61046927SAndroid Build Coastguard Worker #include "tu_autotune.h"
7*61046927SAndroid Build Coastguard Worker 
8*61046927SAndroid Build Coastguard Worker #include "tu_cmd_buffer.h"
9*61046927SAndroid Build Coastguard Worker #include "tu_cs.h"
10*61046927SAndroid Build Coastguard Worker #include "tu_device.h"
11*61046927SAndroid Build Coastguard Worker #include "tu_image.h"
12*61046927SAndroid Build Coastguard Worker #include "tu_pass.h"
13*61046927SAndroid Build Coastguard Worker 
/* How does it work?
 *
 * - For each renderpass we calculate the number of samples passed
 *   by storing the number before and after in GPU memory.
 * - To store the values each command buffer holds GPU memory which
 *   expands with more renderpasses being written.
 * - For each renderpass we create a tu_renderpass_result entry which
 *   points to the results in GPU memory.
 *   - Later on the tu_renderpass_result is added to the
 *     tu_renderpass_history entry which aggregates results for a
 *     given renderpass.
 * - On submission:
 *   - Process results whose fence was signalled.
 *   - Free per-submission data which we no longer need.
 *
 *   - Create a command stream to write a fence value. This way we
 *     know when we can safely read the results.
 *   - We cannot rely on the command buffer's lifetime when referencing
 *     its resources since the buffer could be destroyed before we process
 *     the results.
 *   - For each command buffer:
 *     - Reference its GPU memory.
 *     - Move if ONE_TIME_SUBMIT or copy all tu_renderpass_result to the queue.
 *
 * Since the command buffers could be recorded on different threads
 * we have to maintain some amount of locking around the history table;
 * however, we change the table only in a single thread at submission
 * time, so in most cases there will be no contention on the lock.
 */
43*61046927SAndroid Build Coastguard Worker 
/* Forward declaration; the definition is not in this part of the file.
 * Within this file it is only called from free_history(), whose caller
 * holds dev->autotune_mutex.
 */
void tu_autotune_free_results_locked(struct tu_device *dev,
                                     struct list_head *results);
46*61046927SAndroid Build Coastguard Worker 
/* Set to non-zero to log history-table activity on every submission. */
#define TU_AUTOTUNE_DEBUG_LOG 0

/* Set to non-zero to dump the history entries when the autotuner is
 * finished; could be used to gather data from traces.
 */
#define TU_AUTOTUNE_LOG_AT_FINISH 0

/* Number of most recent renderpass results kept (and averaged) per
 * history entry.
 */
#define MAX_HISTORY_RESULTS 5

/* Number of submissions for which renderpass stats are retained before
 * the history entry is dropped.
 */
#define MAX_HISTORY_LIFETIME 128
57*61046927SAndroid Build Coastguard Worker 
58*61046927SAndroid Build Coastguard Worker 
59*61046927SAndroid Build Coastguard Worker /**
60*61046927SAndroid Build Coastguard Worker  * Tracks results for a given renderpass key
61*61046927SAndroid Build Coastguard Worker  */
62*61046927SAndroid Build Coastguard Worker struct tu_renderpass_history {
63*61046927SAndroid Build Coastguard Worker    uint64_t key;
64*61046927SAndroid Build Coastguard Worker 
65*61046927SAndroid Build Coastguard Worker    /* We would delete old history entries */
66*61046927SAndroid Build Coastguard Worker    uint32_t last_fence;
67*61046927SAndroid Build Coastguard Worker 
68*61046927SAndroid Build Coastguard Worker    /**
69*61046927SAndroid Build Coastguard Worker     * List of recent fd_renderpass_result's
70*61046927SAndroid Build Coastguard Worker     */
71*61046927SAndroid Build Coastguard Worker    struct list_head results;
72*61046927SAndroid Build Coastguard Worker    uint32_t num_results;
73*61046927SAndroid Build Coastguard Worker 
74*61046927SAndroid Build Coastguard Worker    uint32_t avg_samples;
75*61046927SAndroid Build Coastguard Worker };
76*61046927SAndroid Build Coastguard Worker 
77*61046927SAndroid Build Coastguard Worker /* Holds per-submission cs which writes the fence. */
78*61046927SAndroid Build Coastguard Worker struct tu_submission_data {
79*61046927SAndroid Build Coastguard Worker    struct list_head node;
80*61046927SAndroid Build Coastguard Worker    uint32_t fence;
81*61046927SAndroid Build Coastguard Worker 
82*61046927SAndroid Build Coastguard Worker    struct tu_cs fence_cs;
83*61046927SAndroid Build Coastguard Worker };
84*61046927SAndroid Build Coastguard Worker 
/* Returns true when fence `a` is older than fence `b`.
 *
 * Essentially `a < b`, but the comparison is done on the signed
 * difference so that it keeps working after the 32-bit counter wraps.
 */
static bool
fence_before(uint32_t a, uint32_t b)
{
   const int32_t distance = (int32_t)(a - b);

   return distance < 0;
}
91*61046927SAndroid Build Coastguard Worker 
92*61046927SAndroid Build Coastguard Worker static uint32_t
get_autotune_fence(struct tu_autotune * at)93*61046927SAndroid Build Coastguard Worker get_autotune_fence(struct tu_autotune *at)
94*61046927SAndroid Build Coastguard Worker {
95*61046927SAndroid Build Coastguard Worker    return at->device->global_bo_map->autotune_fence;
96*61046927SAndroid Build Coastguard Worker }
97*61046927SAndroid Build Coastguard Worker 
98*61046927SAndroid Build Coastguard Worker template <chip CHIP>
99*61046927SAndroid Build Coastguard Worker static void
create_submission_fence(struct tu_device * dev,struct tu_cs * cs,uint32_t fence)100*61046927SAndroid Build Coastguard Worker create_submission_fence(struct tu_device *dev,
101*61046927SAndroid Build Coastguard Worker                         struct tu_cs *cs,
102*61046927SAndroid Build Coastguard Worker                         uint32_t fence)
103*61046927SAndroid Build Coastguard Worker {
104*61046927SAndroid Build Coastguard Worker    uint64_t dst_iova = dev->global_bo->iova + gb_offset(autotune_fence);
105*61046927SAndroid Build Coastguard Worker    if (CHIP >= A7XX) {
106*61046927SAndroid Build Coastguard Worker       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
107*61046927SAndroid Build Coastguard Worker       tu_cs_emit(cs,
108*61046927SAndroid Build Coastguard Worker          CP_EVENT_WRITE7_0(.event = CACHE_FLUSH_TS,
109*61046927SAndroid Build Coastguard Worker                            .write_src = EV_WRITE_USER_32B,
110*61046927SAndroid Build Coastguard Worker                            .write_dst = EV_DST_RAM,
111*61046927SAndroid Build Coastguard Worker                            .write_enabled = true).value);
112*61046927SAndroid Build Coastguard Worker    } else {
113*61046927SAndroid Build Coastguard Worker       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
114*61046927SAndroid Build Coastguard Worker       tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
115*61046927SAndroid Build Coastguard Worker    }
116*61046927SAndroid Build Coastguard Worker 
117*61046927SAndroid Build Coastguard Worker    tu_cs_emit_qw(cs, dst_iova);
118*61046927SAndroid Build Coastguard Worker    tu_cs_emit(cs, fence);
119*61046927SAndroid Build Coastguard Worker }
120*61046927SAndroid Build Coastguard Worker 
121*61046927SAndroid Build Coastguard Worker static struct tu_submission_data *
create_submission_data(struct tu_device * dev,struct tu_autotune * at,uint32_t fence)122*61046927SAndroid Build Coastguard Worker create_submission_data(struct tu_device *dev, struct tu_autotune *at,
123*61046927SAndroid Build Coastguard Worker                        uint32_t fence)
124*61046927SAndroid Build Coastguard Worker {
125*61046927SAndroid Build Coastguard Worker    struct tu_submission_data *submission_data = NULL;
126*61046927SAndroid Build Coastguard Worker    if (!list_is_empty(&at->submission_data_pool)) {
127*61046927SAndroid Build Coastguard Worker       submission_data = list_first_entry(&at->submission_data_pool,
128*61046927SAndroid Build Coastguard Worker                                          struct tu_submission_data, node);
129*61046927SAndroid Build Coastguard Worker       list_del(&submission_data->node);
130*61046927SAndroid Build Coastguard Worker    } else {
131*61046927SAndroid Build Coastguard Worker       submission_data = (struct tu_submission_data *) calloc(
132*61046927SAndroid Build Coastguard Worker          1, sizeof(struct tu_submission_data));
133*61046927SAndroid Build Coastguard Worker       tu_cs_init(&submission_data->fence_cs, dev, TU_CS_MODE_GROW, 5, "autotune fence cs");
134*61046927SAndroid Build Coastguard Worker    }
135*61046927SAndroid Build Coastguard Worker    submission_data->fence = fence;
136*61046927SAndroid Build Coastguard Worker 
137*61046927SAndroid Build Coastguard Worker    struct tu_cs* fence_cs = &submission_data->fence_cs;
138*61046927SAndroid Build Coastguard Worker    tu_cs_begin(fence_cs);
139*61046927SAndroid Build Coastguard Worker    TU_CALLX(dev, create_submission_fence)(dev, fence_cs, fence);
140*61046927SAndroid Build Coastguard Worker    tu_cs_end(fence_cs);
141*61046927SAndroid Build Coastguard Worker 
142*61046927SAndroid Build Coastguard Worker    list_addtail(&submission_data->node, &at->pending_submission_data);
143*61046927SAndroid Build Coastguard Worker 
144*61046927SAndroid Build Coastguard Worker    return submission_data;
145*61046927SAndroid Build Coastguard Worker }
146*61046927SAndroid Build Coastguard Worker 
147*61046927SAndroid Build Coastguard Worker static void
finish_submission_data(struct tu_autotune * at,struct tu_submission_data * data)148*61046927SAndroid Build Coastguard Worker finish_submission_data(struct tu_autotune *at,
149*61046927SAndroid Build Coastguard Worker                        struct tu_submission_data *data)
150*61046927SAndroid Build Coastguard Worker {
151*61046927SAndroid Build Coastguard Worker    list_del(&data->node);
152*61046927SAndroid Build Coastguard Worker    list_addtail(&data->node, &at->submission_data_pool);
153*61046927SAndroid Build Coastguard Worker    tu_cs_reset(&data->fence_cs);
154*61046927SAndroid Build Coastguard Worker }
155*61046927SAndroid Build Coastguard Worker 
156*61046927SAndroid Build Coastguard Worker static void
free_submission_data(struct tu_submission_data * data)157*61046927SAndroid Build Coastguard Worker free_submission_data(struct tu_submission_data *data)
158*61046927SAndroid Build Coastguard Worker {
159*61046927SAndroid Build Coastguard Worker    list_del(&data->node);
160*61046927SAndroid Build Coastguard Worker    tu_cs_finish(&data->fence_cs);
161*61046927SAndroid Build Coastguard Worker 
162*61046927SAndroid Build Coastguard Worker    free(data);
163*61046927SAndroid Build Coastguard Worker }
164*61046927SAndroid Build Coastguard Worker 
165*61046927SAndroid Build Coastguard Worker static uint64_t
hash_renderpass_instance(const struct tu_render_pass * pass,const struct tu_framebuffer * framebuffer,const struct tu_cmd_buffer * cmd)166*61046927SAndroid Build Coastguard Worker hash_renderpass_instance(const struct tu_render_pass *pass,
167*61046927SAndroid Build Coastguard Worker                          const struct tu_framebuffer *framebuffer,
168*61046927SAndroid Build Coastguard Worker                          const struct tu_cmd_buffer *cmd) {
169*61046927SAndroid Build Coastguard Worker    uint32_t data[3 + pass->attachment_count * 5];
170*61046927SAndroid Build Coastguard Worker    uint32_t* ptr = data;
171*61046927SAndroid Build Coastguard Worker 
172*61046927SAndroid Build Coastguard Worker    *ptr++ = framebuffer->width;
173*61046927SAndroid Build Coastguard Worker    *ptr++ = framebuffer->height;
174*61046927SAndroid Build Coastguard Worker    *ptr++ = framebuffer->layers;
175*61046927SAndroid Build Coastguard Worker 
176*61046927SAndroid Build Coastguard Worker    for (unsigned i = 0; i < pass->attachment_count; i++) {
177*61046927SAndroid Build Coastguard Worker       *ptr++ = cmd->state.attachments[i]->view.width;
178*61046927SAndroid Build Coastguard Worker       *ptr++ = cmd->state.attachments[i]->view.height;
179*61046927SAndroid Build Coastguard Worker       *ptr++ = cmd->state.attachments[i]->image->vk.format;
180*61046927SAndroid Build Coastguard Worker       *ptr++ = cmd->state.attachments[i]->image->vk.array_layers;
181*61046927SAndroid Build Coastguard Worker       *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels;
182*61046927SAndroid Build Coastguard Worker    }
183*61046927SAndroid Build Coastguard Worker 
184*61046927SAndroid Build Coastguard Worker    return XXH64(data, sizeof(data), pass->autotune_hash);
185*61046927SAndroid Build Coastguard Worker }
186*61046927SAndroid Build Coastguard Worker 
187*61046927SAndroid Build Coastguard Worker static void
free_result(struct tu_device * dev,struct tu_renderpass_result * result)188*61046927SAndroid Build Coastguard Worker free_result(struct tu_device *dev, struct tu_renderpass_result *result)
189*61046927SAndroid Build Coastguard Worker {
190*61046927SAndroid Build Coastguard Worker    tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
191*61046927SAndroid Build Coastguard Worker    list_del(&result->node);
192*61046927SAndroid Build Coastguard Worker    free(result);
193*61046927SAndroid Build Coastguard Worker }
194*61046927SAndroid Build Coastguard Worker 
195*61046927SAndroid Build Coastguard Worker static void
free_history(struct tu_device * dev,struct tu_renderpass_history * history)196*61046927SAndroid Build Coastguard Worker free_history(struct tu_device *dev, struct tu_renderpass_history *history)
197*61046927SAndroid Build Coastguard Worker {
198*61046927SAndroid Build Coastguard Worker    tu_autotune_free_results_locked(dev, &history->results);
199*61046927SAndroid Build Coastguard Worker    free(history);
200*61046927SAndroid Build Coastguard Worker }
201*61046927SAndroid Build Coastguard Worker 
202*61046927SAndroid Build Coastguard Worker static bool
get_history(struct tu_autotune * at,uint64_t rp_key,uint32_t * avg_samples)203*61046927SAndroid Build Coastguard Worker get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
204*61046927SAndroid Build Coastguard Worker {
205*61046927SAndroid Build Coastguard Worker    bool has_history = false;
206*61046927SAndroid Build Coastguard Worker 
207*61046927SAndroid Build Coastguard Worker    /* If the lock contantion would be found in the wild -
208*61046927SAndroid Build Coastguard Worker     * we could use try_lock here.
209*61046927SAndroid Build Coastguard Worker     */
210*61046927SAndroid Build Coastguard Worker    u_rwlock_rdlock(&at->ht_lock);
211*61046927SAndroid Build Coastguard Worker    struct hash_entry *entry =
212*61046927SAndroid Build Coastguard Worker       _mesa_hash_table_search(at->ht, &rp_key);
213*61046927SAndroid Build Coastguard Worker    if (entry) {
214*61046927SAndroid Build Coastguard Worker       struct tu_renderpass_history *history =
215*61046927SAndroid Build Coastguard Worker          (struct tu_renderpass_history *) entry->data;
216*61046927SAndroid Build Coastguard Worker       if (history->num_results > 0) {
217*61046927SAndroid Build Coastguard Worker          *avg_samples = p_atomic_read(&history->avg_samples);
218*61046927SAndroid Build Coastguard Worker          has_history = true;
219*61046927SAndroid Build Coastguard Worker       }
220*61046927SAndroid Build Coastguard Worker    }
221*61046927SAndroid Build Coastguard Worker    u_rwlock_rdunlock(&at->ht_lock);
222*61046927SAndroid Build Coastguard Worker 
223*61046927SAndroid Build Coastguard Worker    return has_history;
224*61046927SAndroid Build Coastguard Worker }
225*61046927SAndroid Build Coastguard Worker 
226*61046927SAndroid Build Coastguard Worker static struct tu_renderpass_result *
create_history_result(struct tu_autotune * at,uint64_t rp_key)227*61046927SAndroid Build Coastguard Worker create_history_result(struct tu_autotune *at, uint64_t rp_key)
228*61046927SAndroid Build Coastguard Worker {
229*61046927SAndroid Build Coastguard Worker    struct tu_renderpass_result *result =
230*61046927SAndroid Build Coastguard Worker       (struct tu_renderpass_result *) calloc(1, sizeof(*result));
231*61046927SAndroid Build Coastguard Worker    result->rp_key = rp_key;
232*61046927SAndroid Build Coastguard Worker 
233*61046927SAndroid Build Coastguard Worker    return result;
234*61046927SAndroid Build Coastguard Worker }
235*61046927SAndroid Build Coastguard Worker 
236*61046927SAndroid Build Coastguard Worker static void
history_add_result(struct tu_device * dev,struct tu_renderpass_history * history,struct tu_renderpass_result * result)237*61046927SAndroid Build Coastguard Worker history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
238*61046927SAndroid Build Coastguard Worker                       struct tu_renderpass_result *result)
239*61046927SAndroid Build Coastguard Worker {
240*61046927SAndroid Build Coastguard Worker    list_delinit(&result->node);
241*61046927SAndroid Build Coastguard Worker    list_add(&result->node, &history->results);
242*61046927SAndroid Build Coastguard Worker 
243*61046927SAndroid Build Coastguard Worker    if (history->num_results < MAX_HISTORY_RESULTS) {
244*61046927SAndroid Build Coastguard Worker       history->num_results++;
245*61046927SAndroid Build Coastguard Worker    } else {
246*61046927SAndroid Build Coastguard Worker       /* Once above the limit, start popping old results off the
247*61046927SAndroid Build Coastguard Worker        * tail of the list:
248*61046927SAndroid Build Coastguard Worker        */
249*61046927SAndroid Build Coastguard Worker       struct tu_renderpass_result *old_result =
250*61046927SAndroid Build Coastguard Worker          list_last_entry(&history->results, struct tu_renderpass_result, node);
251*61046927SAndroid Build Coastguard Worker       mtx_lock(&dev->autotune_mutex);
252*61046927SAndroid Build Coastguard Worker       free_result(dev, old_result);
253*61046927SAndroid Build Coastguard Worker       mtx_unlock(&dev->autotune_mutex);
254*61046927SAndroid Build Coastguard Worker    }
255*61046927SAndroid Build Coastguard Worker 
256*61046927SAndroid Build Coastguard Worker    /* Do calculations here to avoid locking history in tu_autotune_use_bypass */
257*61046927SAndroid Build Coastguard Worker    uint32_t total_samples = 0;
258*61046927SAndroid Build Coastguard Worker    list_for_each_entry(struct tu_renderpass_result, result,
259*61046927SAndroid Build Coastguard Worker                        &history->results, node) {
260*61046927SAndroid Build Coastguard Worker       total_samples += result->samples_passed;
261*61046927SAndroid Build Coastguard Worker    }
262*61046927SAndroid Build Coastguard Worker 
263*61046927SAndroid Build Coastguard Worker    float avg_samples = (float)total_samples / (float)history->num_results;
264*61046927SAndroid Build Coastguard Worker    p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
265*61046927SAndroid Build Coastguard Worker }
266*61046927SAndroid Build Coastguard Worker 
267*61046927SAndroid Build Coastguard Worker static void
process_results(struct tu_autotune * at,uint32_t current_fence)268*61046927SAndroid Build Coastguard Worker process_results(struct tu_autotune *at, uint32_t current_fence)
269*61046927SAndroid Build Coastguard Worker {
270*61046927SAndroid Build Coastguard Worker    struct tu_device *dev = at->device;
271*61046927SAndroid Build Coastguard Worker 
272*61046927SAndroid Build Coastguard Worker    list_for_each_entry_safe(struct tu_renderpass_result, result,
273*61046927SAndroid Build Coastguard Worker                             &at->pending_results, node) {
274*61046927SAndroid Build Coastguard Worker       if (fence_before(current_fence, result->fence))
275*61046927SAndroid Build Coastguard Worker          break;
276*61046927SAndroid Build Coastguard Worker 
277*61046927SAndroid Build Coastguard Worker       struct tu_renderpass_history *history = result->history;
278*61046927SAndroid Build Coastguard Worker       result->samples_passed =
279*61046927SAndroid Build Coastguard Worker          result->samples->samples_end - result->samples->samples_start;
280*61046927SAndroid Build Coastguard Worker 
281*61046927SAndroid Build Coastguard Worker       history_add_result(dev, history, result);
282*61046927SAndroid Build Coastguard Worker    }
283*61046927SAndroid Build Coastguard Worker 
284*61046927SAndroid Build Coastguard Worker    list_for_each_entry_safe(struct tu_submission_data, submission_data,
285*61046927SAndroid Build Coastguard Worker                             &at->pending_submission_data, node) {
286*61046927SAndroid Build Coastguard Worker       if (fence_before(current_fence, submission_data->fence))
287*61046927SAndroid Build Coastguard Worker          break;
288*61046927SAndroid Build Coastguard Worker 
289*61046927SAndroid Build Coastguard Worker       finish_submission_data(at, submission_data);
290*61046927SAndroid Build Coastguard Worker    }
291*61046927SAndroid Build Coastguard Worker }
292*61046927SAndroid Build Coastguard Worker 
293*61046927SAndroid Build Coastguard Worker static void
queue_pending_results(struct tu_autotune * at,struct tu_cmd_buffer * cmdbuf)294*61046927SAndroid Build Coastguard Worker queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
295*61046927SAndroid Build Coastguard Worker {
296*61046927SAndroid Build Coastguard Worker    bool one_time_submit = cmdbuf->usage_flags &
297*61046927SAndroid Build Coastguard Worker          VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
298*61046927SAndroid Build Coastguard Worker 
299*61046927SAndroid Build Coastguard Worker    if (one_time_submit) {
300*61046927SAndroid Build Coastguard Worker       /* We can just steal the list since it won't be resubmitted again */
301*61046927SAndroid Build Coastguard Worker       list_splicetail(&cmdbuf->renderpass_autotune_results,
302*61046927SAndroid Build Coastguard Worker                         &at->pending_results);
303*61046927SAndroid Build Coastguard Worker       list_inithead(&cmdbuf->renderpass_autotune_results);
304*61046927SAndroid Build Coastguard Worker    } else {
305*61046927SAndroid Build Coastguard Worker       list_for_each_entry_safe(struct tu_renderpass_result, result,
306*61046927SAndroid Build Coastguard Worker                               &cmdbuf->renderpass_autotune_results, node) {
307*61046927SAndroid Build Coastguard Worker          /* TODO: copying each result isn't nice */
308*61046927SAndroid Build Coastguard Worker          struct tu_renderpass_result *copy =
309*61046927SAndroid Build Coastguard Worker             (struct tu_renderpass_result *) malloc(sizeof(*result));
310*61046927SAndroid Build Coastguard Worker          *copy = *result;
311*61046927SAndroid Build Coastguard Worker          tu_bo_get_ref(copy->bo.bo);
312*61046927SAndroid Build Coastguard Worker          list_addtail(&copy->node, &at->pending_results);
313*61046927SAndroid Build Coastguard Worker       }
314*61046927SAndroid Build Coastguard Worker    }
315*61046927SAndroid Build Coastguard Worker }
316*61046927SAndroid Build Coastguard Worker 
317*61046927SAndroid Build Coastguard Worker struct tu_cs *
tu_autotune_on_submit(struct tu_device * dev,struct tu_autotune * at,struct tu_cmd_buffer ** cmd_buffers,uint32_t cmd_buffer_count)318*61046927SAndroid Build Coastguard Worker tu_autotune_on_submit(struct tu_device *dev,
319*61046927SAndroid Build Coastguard Worker                       struct tu_autotune *at,
320*61046927SAndroid Build Coastguard Worker                       struct tu_cmd_buffer **cmd_buffers,
321*61046927SAndroid Build Coastguard Worker                       uint32_t cmd_buffer_count)
322*61046927SAndroid Build Coastguard Worker {
323*61046927SAndroid Build Coastguard Worker    /* We are single-threaded here */
324*61046927SAndroid Build Coastguard Worker 
325*61046927SAndroid Build Coastguard Worker    const uint32_t gpu_fence = get_autotune_fence(at);
326*61046927SAndroid Build Coastguard Worker    const uint32_t new_fence = at->fence_counter++;
327*61046927SAndroid Build Coastguard Worker 
328*61046927SAndroid Build Coastguard Worker    process_results(at, gpu_fence);
329*61046927SAndroid Build Coastguard Worker 
330*61046927SAndroid Build Coastguard Worker    /* Create history entries here to minimize work and locking being
331*61046927SAndroid Build Coastguard Worker     * done on renderpass end.
332*61046927SAndroid Build Coastguard Worker     */
333*61046927SAndroid Build Coastguard Worker    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
334*61046927SAndroid Build Coastguard Worker       struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
335*61046927SAndroid Build Coastguard Worker       list_for_each_entry_safe(struct tu_renderpass_result, result,
336*61046927SAndroid Build Coastguard Worker                           &cmdbuf->renderpass_autotune_results, node) {
337*61046927SAndroid Build Coastguard Worker          struct tu_renderpass_history *history;
338*61046927SAndroid Build Coastguard Worker          struct hash_entry *entry =
339*61046927SAndroid Build Coastguard Worker             _mesa_hash_table_search(at->ht, &result->rp_key);
340*61046927SAndroid Build Coastguard Worker          if (!entry) {
341*61046927SAndroid Build Coastguard Worker             history =
342*61046927SAndroid Build Coastguard Worker                (struct tu_renderpass_history *) calloc(1, sizeof(*history));
343*61046927SAndroid Build Coastguard Worker             history->key = result->rp_key;
344*61046927SAndroid Build Coastguard Worker             list_inithead(&history->results);
345*61046927SAndroid Build Coastguard Worker 
346*61046927SAndroid Build Coastguard Worker             u_rwlock_wrlock(&at->ht_lock);
347*61046927SAndroid Build Coastguard Worker             _mesa_hash_table_insert(at->ht, &history->key, history);
348*61046927SAndroid Build Coastguard Worker             u_rwlock_wrunlock(&at->ht_lock);
349*61046927SAndroid Build Coastguard Worker          } else {
350*61046927SAndroid Build Coastguard Worker             history = (struct tu_renderpass_history *) entry->data;
351*61046927SAndroid Build Coastguard Worker          }
352*61046927SAndroid Build Coastguard Worker 
353*61046927SAndroid Build Coastguard Worker          history->last_fence = new_fence;
354*61046927SAndroid Build Coastguard Worker 
355*61046927SAndroid Build Coastguard Worker          result->fence = new_fence;
356*61046927SAndroid Build Coastguard Worker          result->history = history;
357*61046927SAndroid Build Coastguard Worker       }
358*61046927SAndroid Build Coastguard Worker    }
359*61046927SAndroid Build Coastguard Worker 
360*61046927SAndroid Build Coastguard Worker    struct tu_submission_data *submission_data =
361*61046927SAndroid Build Coastguard Worker       create_submission_data(dev, at, new_fence);
362*61046927SAndroid Build Coastguard Worker 
363*61046927SAndroid Build Coastguard Worker    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
364*61046927SAndroid Build Coastguard Worker       struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
365*61046927SAndroid Build Coastguard Worker       if (list_is_empty(&cmdbuf->renderpass_autotune_results))
366*61046927SAndroid Build Coastguard Worker          continue;
367*61046927SAndroid Build Coastguard Worker 
368*61046927SAndroid Build Coastguard Worker       queue_pending_results(at, cmdbuf);
369*61046927SAndroid Build Coastguard Worker    }
370*61046927SAndroid Build Coastguard Worker 
371*61046927SAndroid Build Coastguard Worker    if (TU_AUTOTUNE_DEBUG_LOG)
372*61046927SAndroid Build Coastguard Worker       mesa_logi("Total history entries: %u", at->ht->entries);
373*61046927SAndroid Build Coastguard Worker 
374*61046927SAndroid Build Coastguard Worker    /* Cleanup old entries from history table. The assumption
375*61046927SAndroid Build Coastguard Worker     * here is that application doesn't hold many old unsubmitted
376*61046927SAndroid Build Coastguard Worker     * command buffers, otherwise this table may grow big.
377*61046927SAndroid Build Coastguard Worker     */
378*61046927SAndroid Build Coastguard Worker    hash_table_foreach(at->ht, entry) {
379*61046927SAndroid Build Coastguard Worker       struct tu_renderpass_history *history =
380*61046927SAndroid Build Coastguard Worker          (struct tu_renderpass_history *) entry->data;
381*61046927SAndroid Build Coastguard Worker       if (fence_before(gpu_fence, history->last_fence + MAX_HISTORY_LIFETIME))
382*61046927SAndroid Build Coastguard Worker          continue;
383*61046927SAndroid Build Coastguard Worker 
384*61046927SAndroid Build Coastguard Worker       if (TU_AUTOTUNE_DEBUG_LOG)
385*61046927SAndroid Build Coastguard Worker          mesa_logi("Removed old history entry %016" PRIx64 "", history->key);
386*61046927SAndroid Build Coastguard Worker 
387*61046927SAndroid Build Coastguard Worker       u_rwlock_wrlock(&at->ht_lock);
388*61046927SAndroid Build Coastguard Worker       _mesa_hash_table_remove_key(at->ht, &history->key);
389*61046927SAndroid Build Coastguard Worker       u_rwlock_wrunlock(&at->ht_lock);
390*61046927SAndroid Build Coastguard Worker 
391*61046927SAndroid Build Coastguard Worker       mtx_lock(&dev->autotune_mutex);
392*61046927SAndroid Build Coastguard Worker       free_history(dev, history);
393*61046927SAndroid Build Coastguard Worker       mtx_unlock(&dev->autotune_mutex);
394*61046927SAndroid Build Coastguard Worker    }
395*61046927SAndroid Build Coastguard Worker 
396*61046927SAndroid Build Coastguard Worker    return &submission_data->fence_cs;
397*61046927SAndroid Build Coastguard Worker }
398*61046927SAndroid Build Coastguard Worker 
/* Hash-table key comparison: both arguments point at 64-bit renderpass
 * keys; returns true when the keys are equal.
 */
static bool
renderpass_key_equals(const void *_a, const void *_b)
{
   const uint64_t lhs = *(const uint64_t *)_a;
   const uint64_t rhs = *(const uint64_t *)_b;

   return lhs == rhs;
}
404*61046927SAndroid Build Coastguard Worker 
/* Hash-table hash callback: the argument points at a 64-bit renderpass
 * key and the hash is simply its low 32 bits.
 *
 * The key is read through a const-qualified pointer and the narrowing to
 * 32 bits is spelled out explicitly.
 */
static uint32_t
renderpass_key_hash(const void *_a)
{
   const uint64_t key = *(const uint64_t *) _a;

   return (uint32_t) (key & 0xffffffff);
}
410*61046927SAndroid Build Coastguard Worker 
411*61046927SAndroid Build Coastguard Worker VkResult
tu_autotune_init(struct tu_autotune * at,struct tu_device * dev)412*61046927SAndroid Build Coastguard Worker tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
413*61046927SAndroid Build Coastguard Worker {
414*61046927SAndroid Build Coastguard Worker    at->enabled = true;
415*61046927SAndroid Build Coastguard Worker    at->device = dev;
416*61046927SAndroid Build Coastguard Worker    at->ht = _mesa_hash_table_create(NULL,
417*61046927SAndroid Build Coastguard Worker                                     renderpass_key_hash,
418*61046927SAndroid Build Coastguard Worker                                     renderpass_key_equals);
419*61046927SAndroid Build Coastguard Worker    u_rwlock_init(&at->ht_lock);
420*61046927SAndroid Build Coastguard Worker 
421*61046927SAndroid Build Coastguard Worker    list_inithead(&at->pending_results);
422*61046927SAndroid Build Coastguard Worker    list_inithead(&at->pending_submission_data);
423*61046927SAndroid Build Coastguard Worker    list_inithead(&at->submission_data_pool);
424*61046927SAndroid Build Coastguard Worker 
425*61046927SAndroid Build Coastguard Worker    /* start from 1 because tu6_global::autotune_fence is initialized to 0 */
426*61046927SAndroid Build Coastguard Worker    at->fence_counter = 1;
427*61046927SAndroid Build Coastguard Worker 
428*61046927SAndroid Build Coastguard Worker    return VK_SUCCESS;
429*61046927SAndroid Build Coastguard Worker }
430*61046927SAndroid Build Coastguard Worker 
431*61046927SAndroid Build Coastguard Worker void
tu_autotune_fini(struct tu_autotune * at,struct tu_device * dev)432*61046927SAndroid Build Coastguard Worker tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
433*61046927SAndroid Build Coastguard Worker {
434*61046927SAndroid Build Coastguard Worker    if (TU_AUTOTUNE_LOG_AT_FINISH) {
435*61046927SAndroid Build Coastguard Worker       while (!list_is_empty(&at->pending_results)) {
436*61046927SAndroid Build Coastguard Worker          const uint32_t gpu_fence = get_autotune_fence(at);
437*61046927SAndroid Build Coastguard Worker          process_results(at, gpu_fence);
438*61046927SAndroid Build Coastguard Worker       }
439*61046927SAndroid Build Coastguard Worker 
440*61046927SAndroid Build Coastguard Worker       hash_table_foreach(at->ht, entry) {
441*61046927SAndroid Build Coastguard Worker          struct tu_renderpass_history *history =
442*61046927SAndroid Build Coastguard Worker             (struct tu_renderpass_history *) entry->data;
443*61046927SAndroid Build Coastguard Worker 
444*61046927SAndroid Build Coastguard Worker          mesa_logi("%016" PRIx64 " \tavg_passed=%u results=%u",
445*61046927SAndroid Build Coastguard Worker                    history->key, history->avg_samples, history->num_results);
446*61046927SAndroid Build Coastguard Worker       }
447*61046927SAndroid Build Coastguard Worker    }
448*61046927SAndroid Build Coastguard Worker 
449*61046927SAndroid Build Coastguard Worker    tu_autotune_free_results(dev, &at->pending_results);
450*61046927SAndroid Build Coastguard Worker 
451*61046927SAndroid Build Coastguard Worker    mtx_lock(&dev->autotune_mutex);
452*61046927SAndroid Build Coastguard Worker    hash_table_foreach(at->ht, entry) {
453*61046927SAndroid Build Coastguard Worker       struct tu_renderpass_history *history =
454*61046927SAndroid Build Coastguard Worker          (struct tu_renderpass_history *) entry->data;
455*61046927SAndroid Build Coastguard Worker       free_history(dev, history);
456*61046927SAndroid Build Coastguard Worker    }
457*61046927SAndroid Build Coastguard Worker    mtx_unlock(&dev->autotune_mutex);
458*61046927SAndroid Build Coastguard Worker 
459*61046927SAndroid Build Coastguard Worker    list_for_each_entry_safe(struct tu_submission_data, submission_data,
460*61046927SAndroid Build Coastguard Worker                             &at->pending_submission_data, node) {
461*61046927SAndroid Build Coastguard Worker       free_submission_data(submission_data);
462*61046927SAndroid Build Coastguard Worker    }
463*61046927SAndroid Build Coastguard Worker 
464*61046927SAndroid Build Coastguard Worker    list_for_each_entry_safe(struct tu_submission_data, submission_data,
465*61046927SAndroid Build Coastguard Worker                             &at->submission_data_pool, node) {
466*61046927SAndroid Build Coastguard Worker       free_submission_data(submission_data);
467*61046927SAndroid Build Coastguard Worker    }
468*61046927SAndroid Build Coastguard Worker 
469*61046927SAndroid Build Coastguard Worker    _mesa_hash_table_destroy(at->ht, NULL);
470*61046927SAndroid Build Coastguard Worker    u_rwlock_destroy(&at->ht_lock);
471*61046927SAndroid Build Coastguard Worker }
472*61046927SAndroid Build Coastguard Worker 
473*61046927SAndroid Build Coastguard Worker bool
tu_autotune_submit_requires_fence(struct tu_cmd_buffer ** cmd_buffers,uint32_t cmd_buffer_count)474*61046927SAndroid Build Coastguard Worker tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
475*61046927SAndroid Build Coastguard Worker                                   uint32_t cmd_buffer_count)
476*61046927SAndroid Build Coastguard Worker {
477*61046927SAndroid Build Coastguard Worker    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
478*61046927SAndroid Build Coastguard Worker       struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
479*61046927SAndroid Build Coastguard Worker       if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
480*61046927SAndroid Build Coastguard Worker          return true;
481*61046927SAndroid Build Coastguard Worker    }
482*61046927SAndroid Build Coastguard Worker 
483*61046927SAndroid Build Coastguard Worker    return false;
484*61046927SAndroid Build Coastguard Worker }
485*61046927SAndroid Build Coastguard Worker 
486*61046927SAndroid Build Coastguard Worker void
tu_autotune_free_results_locked(struct tu_device * dev,struct list_head * results)487*61046927SAndroid Build Coastguard Worker tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
488*61046927SAndroid Build Coastguard Worker {
489*61046927SAndroid Build Coastguard Worker    list_for_each_entry_safe(struct tu_renderpass_result, result,
490*61046927SAndroid Build Coastguard Worker                             results, node) {
491*61046927SAndroid Build Coastguard Worker       free_result(dev, result);
492*61046927SAndroid Build Coastguard Worker    }
493*61046927SAndroid Build Coastguard Worker }
494*61046927SAndroid Build Coastguard Worker 
495*61046927SAndroid Build Coastguard Worker void
tu_autotune_free_results(struct tu_device * dev,struct list_head * results)496*61046927SAndroid Build Coastguard Worker tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
497*61046927SAndroid Build Coastguard Worker {
498*61046927SAndroid Build Coastguard Worker    mtx_lock(&dev->autotune_mutex);
499*61046927SAndroid Build Coastguard Worker    tu_autotune_free_results_locked(dev, results);
500*61046927SAndroid Build Coastguard Worker    mtx_unlock(&dev->autotune_mutex);
501*61046927SAndroid Build Coastguard Worker }
502*61046927SAndroid Build Coastguard Worker 
503*61046927SAndroid Build Coastguard Worker static bool
fallback_use_bypass(const struct tu_render_pass * pass,const struct tu_framebuffer * framebuffer,const struct tu_cmd_buffer * cmd_buffer)504*61046927SAndroid Build Coastguard Worker fallback_use_bypass(const struct tu_render_pass *pass,
505*61046927SAndroid Build Coastguard Worker                     const struct tu_framebuffer *framebuffer,
506*61046927SAndroid Build Coastguard Worker                     const struct tu_cmd_buffer *cmd_buffer)
507*61046927SAndroid Build Coastguard Worker {
508*61046927SAndroid Build Coastguard Worker    if (cmd_buffer->state.rp.drawcall_count > 5)
509*61046927SAndroid Build Coastguard Worker       return false;
510*61046927SAndroid Build Coastguard Worker 
511*61046927SAndroid Build Coastguard Worker    for (unsigned i = 0; i < pass->subpass_count; i++) {
512*61046927SAndroid Build Coastguard Worker       if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
513*61046927SAndroid Build Coastguard Worker          return false;
514*61046927SAndroid Build Coastguard Worker    }
515*61046927SAndroid Build Coastguard Worker 
516*61046927SAndroid Build Coastguard Worker    return true;
517*61046927SAndroid Build Coastguard Worker }
518*61046927SAndroid Build Coastguard Worker 
519*61046927SAndroid Build Coastguard Worker static uint32_t
get_render_pass_pixel_count(const struct tu_cmd_buffer * cmd)520*61046927SAndroid Build Coastguard Worker get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
521*61046927SAndroid Build Coastguard Worker {
522*61046927SAndroid Build Coastguard Worker    const VkExtent2D *extent = &cmd->state.render_area.extent;
523*61046927SAndroid Build Coastguard Worker    return extent->width * extent->height;
524*61046927SAndroid Build Coastguard Worker }
525*61046927SAndroid Build Coastguard Worker 
526*61046927SAndroid Build Coastguard Worker static uint64_t
estimate_drawcall_bandwidth(const struct tu_cmd_buffer * cmd,uint32_t avg_renderpass_sample_count)527*61046927SAndroid Build Coastguard Worker estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
528*61046927SAndroid Build Coastguard Worker                             uint32_t avg_renderpass_sample_count)
529*61046927SAndroid Build Coastguard Worker {
530*61046927SAndroid Build Coastguard Worker    const struct tu_cmd_state *state = &cmd->state;
531*61046927SAndroid Build Coastguard Worker 
532*61046927SAndroid Build Coastguard Worker    if (!state->rp.drawcall_count)
533*61046927SAndroid Build Coastguard Worker       return 0;
534*61046927SAndroid Build Coastguard Worker 
535*61046927SAndroid Build Coastguard Worker    /* sample count times drawcall_bandwidth_per_sample */
536*61046927SAndroid Build Coastguard Worker    return (uint64_t)avg_renderpass_sample_count *
537*61046927SAndroid Build Coastguard Worker       state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
538*61046927SAndroid Build Coastguard Worker }
539*61046927SAndroid Build Coastguard Worker 
540*61046927SAndroid Build Coastguard Worker bool
tu_autotune_use_bypass(struct tu_autotune * at,struct tu_cmd_buffer * cmd_buffer,struct tu_renderpass_result ** autotune_result)541*61046927SAndroid Build Coastguard Worker tu_autotune_use_bypass(struct tu_autotune *at,
542*61046927SAndroid Build Coastguard Worker                        struct tu_cmd_buffer *cmd_buffer,
543*61046927SAndroid Build Coastguard Worker                        struct tu_renderpass_result **autotune_result)
544*61046927SAndroid Build Coastguard Worker {
545*61046927SAndroid Build Coastguard Worker    const struct tu_render_pass *pass = cmd_buffer->state.pass;
546*61046927SAndroid Build Coastguard Worker    const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
547*61046927SAndroid Build Coastguard Worker 
548*61046927SAndroid Build Coastguard Worker    /* If a feedback loop in the subpass caused one of the pipelines used to set
549*61046927SAndroid Build Coastguard Worker     * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even
550*61046927SAndroid Build Coastguard Worker     * SINGLE_PRIM_MODE(FLUSH), then that should cause significantly increased
551*61046927SAndroid Build Coastguard Worker     * sysmem bandwidth (though we haven't quantified it).
552*61046927SAndroid Build Coastguard Worker     */
553*61046927SAndroid Build Coastguard Worker    if (cmd_buffer->state.rp.sysmem_single_prim_mode)
554*61046927SAndroid Build Coastguard Worker       return false;
555*61046927SAndroid Build Coastguard Worker 
556*61046927SAndroid Build Coastguard Worker    /* If the user is using a fragment density map, then this will cause less
557*61046927SAndroid Build Coastguard Worker     * FS invocations with GMEM, which has a hard-to-measure impact on
558*61046927SAndroid Build Coastguard Worker     * performance because it depends on how heavy the FS is in addition to how
559*61046927SAndroid Build Coastguard Worker     * many invocations there were and the density. Let's assume the user knows
560*61046927SAndroid Build Coastguard Worker     * what they're doing when they added the map, because if sysmem is
561*61046927SAndroid Build Coastguard Worker     * actually faster then they could've just not used the fragment density
562*61046927SAndroid Build Coastguard Worker     * map.
563*61046927SAndroid Build Coastguard Worker     */
564*61046927SAndroid Build Coastguard Worker    if (pass->has_fdm)
565*61046927SAndroid Build Coastguard Worker       return false;
566*61046927SAndroid Build Coastguard Worker 
567*61046927SAndroid Build Coastguard Worker    /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
568*61046927SAndroid Build Coastguard Worker     * we would have to allocate GPU memory at the submit time and copy
569*61046927SAndroid Build Coastguard Worker     * results into it.
570*61046927SAndroid Build Coastguard Worker     * Native games ususally don't use it, Zink and DXVK don't use it,
571*61046927SAndroid Build Coastguard Worker     * D3D12 doesn't have such concept.
572*61046927SAndroid Build Coastguard Worker     */
573*61046927SAndroid Build Coastguard Worker    bool simultaneous_use =
574*61046927SAndroid Build Coastguard Worker       cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
575*61046927SAndroid Build Coastguard Worker 
576*61046927SAndroid Build Coastguard Worker    if (!at->enabled || simultaneous_use)
577*61046927SAndroid Build Coastguard Worker       return fallback_use_bypass(pass, framebuffer, cmd_buffer);
578*61046927SAndroid Build Coastguard Worker 
579*61046927SAndroid Build Coastguard Worker    /* We use 64bit hash as a key since we don't fear rare hash collision,
580*61046927SAndroid Build Coastguard Worker     * the worst that would happen is sysmem being selected when it should
581*61046927SAndroid Build Coastguard Worker     * have not, and with 64bit it would be extremely rare.
582*61046927SAndroid Build Coastguard Worker     *
583*61046927SAndroid Build Coastguard Worker     * Q: Why not make the key from framebuffer + renderpass pointers?
584*61046927SAndroid Build Coastguard Worker     * A: At least DXVK creates new framebuffers each frame while keeping
585*61046927SAndroid Build Coastguard Worker     *    renderpasses the same. Also we want to support replaying a single
586*61046927SAndroid Build Coastguard Worker     *    frame in a loop for testing.
587*61046927SAndroid Build Coastguard Worker     */
588*61046927SAndroid Build Coastguard Worker    uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);
589*61046927SAndroid Build Coastguard Worker 
590*61046927SAndroid Build Coastguard Worker    *autotune_result = create_history_result(at, renderpass_key);
591*61046927SAndroid Build Coastguard Worker 
592*61046927SAndroid Build Coastguard Worker    uint32_t avg_samples = 0;
593*61046927SAndroid Build Coastguard Worker    if (get_history(at, renderpass_key, &avg_samples)) {
594*61046927SAndroid Build Coastguard Worker       const uint32_t pass_pixel_count =
595*61046927SAndroid Build Coastguard Worker          get_render_pass_pixel_count(cmd_buffer);
596*61046927SAndroid Build Coastguard Worker       uint64_t sysmem_bandwidth =
597*61046927SAndroid Build Coastguard Worker          (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
598*61046927SAndroid Build Coastguard Worker       uint64_t gmem_bandwidth =
599*61046927SAndroid Build Coastguard Worker          (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;
600*61046927SAndroid Build Coastguard Worker 
601*61046927SAndroid Build Coastguard Worker       const uint64_t total_draw_call_bandwidth =
602*61046927SAndroid Build Coastguard Worker          estimate_drawcall_bandwidth(cmd_buffer, avg_samples);
603*61046927SAndroid Build Coastguard Worker 
604*61046927SAndroid Build Coastguard Worker       /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
605*61046927SAndroid Build Coastguard Worker       sysmem_bandwidth += total_draw_call_bandwidth;
606*61046927SAndroid Build Coastguard Worker 
607*61046927SAndroid Build Coastguard Worker       /* drawcalls access gmem in gmem rendering, but we do not want to ignore
608*61046927SAndroid Build Coastguard Worker        * them completely.  The state changes between tiles also have an
609*61046927SAndroid Build Coastguard Worker        * overhead.  The magic numbers of 11 and 10 are randomly chosen.
610*61046927SAndroid Build Coastguard Worker        */
611*61046927SAndroid Build Coastguard Worker       gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;
612*61046927SAndroid Build Coastguard Worker 
613*61046927SAndroid Build Coastguard Worker       const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
614*61046927SAndroid Build Coastguard Worker       if (TU_AUTOTUNE_DEBUG_LOG) {
615*61046927SAndroid Build Coastguard Worker          const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
616*61046927SAndroid Build Coastguard Worker          const float drawcall_bandwidth_per_sample =
617*61046927SAndroid Build Coastguard Worker             (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
618*61046927SAndroid Build Coastguard Worker             cmd_buffer->state.rp.drawcall_count;
619*61046927SAndroid Build Coastguard Worker 
620*61046927SAndroid Build Coastguard Worker          mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
621*61046927SAndroid Build Coastguard Worker                renderpass_key,
622*61046927SAndroid Build Coastguard Worker                cmd_buffer->state.rp.drawcall_count,
623*61046927SAndroid Build Coastguard Worker                select_sysmem ? "sysmem" : "gmem");
624*61046927SAndroid Build Coastguard Worker          mesa_logi("   avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
625*61046927SAndroid Build Coastguard Worker                avg_samples,
626*61046927SAndroid Build Coastguard Worker                drawcall_bandwidth_per_sample,
627*61046927SAndroid Build Coastguard Worker                total_draw_call_bandwidth);
628*61046927SAndroid Build Coastguard Worker          mesa_logi("   render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
629*61046927SAndroid Build Coastguard Worker                extent->width, extent->height,
630*61046927SAndroid Build Coastguard Worker                pass->sysmem_bandwidth_per_pixel,
631*61046927SAndroid Build Coastguard Worker                pass->gmem_bandwidth_per_pixel);
632*61046927SAndroid Build Coastguard Worker          mesa_logi("   sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
633*61046927SAndroid Build Coastguard Worker                sysmem_bandwidth, gmem_bandwidth);
634*61046927SAndroid Build Coastguard Worker       }
635*61046927SAndroid Build Coastguard Worker 
636*61046927SAndroid Build Coastguard Worker       return select_sysmem;
637*61046927SAndroid Build Coastguard Worker    }
638*61046927SAndroid Build Coastguard Worker 
639*61046927SAndroid Build Coastguard Worker    return fallback_use_bypass(pass, framebuffer, cmd_buffer);
640*61046927SAndroid Build Coastguard Worker }
641*61046927SAndroid Build Coastguard Worker 
642*61046927SAndroid Build Coastguard Worker template <chip CHIP>
643*61046927SAndroid Build Coastguard Worker void
tu_autotune_begin_renderpass(struct tu_cmd_buffer * cmd,struct tu_cs * cs,struct tu_renderpass_result * autotune_result)644*61046927SAndroid Build Coastguard Worker tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
645*61046927SAndroid Build Coastguard Worker                              struct tu_cs *cs,
646*61046927SAndroid Build Coastguard Worker                              struct tu_renderpass_result *autotune_result)
647*61046927SAndroid Build Coastguard Worker {
648*61046927SAndroid Build Coastguard Worker    if (!autotune_result)
649*61046927SAndroid Build Coastguard Worker       return;
650*61046927SAndroid Build Coastguard Worker 
651*61046927SAndroid Build Coastguard Worker    struct tu_device *dev = cmd->device;
652*61046927SAndroid Build Coastguard Worker 
653*61046927SAndroid Build Coastguard Worker    static const uint32_t size = sizeof(struct tu_renderpass_samples);
654*61046927SAndroid Build Coastguard Worker 
655*61046927SAndroid Build Coastguard Worker    mtx_lock(&dev->autotune_mutex);
656*61046927SAndroid Build Coastguard Worker    VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
657*61046927SAndroid Build Coastguard Worker    mtx_unlock(&dev->autotune_mutex);
658*61046927SAndroid Build Coastguard Worker    if (ret != VK_SUCCESS) {
659*61046927SAndroid Build Coastguard Worker       autotune_result->bo.iova = 0;
660*61046927SAndroid Build Coastguard Worker       return;
661*61046927SAndroid Build Coastguard Worker    }
662*61046927SAndroid Build Coastguard Worker 
663*61046927SAndroid Build Coastguard Worker    uint64_t result_iova = autotune_result->bo.iova;
664*61046927SAndroid Build Coastguard Worker 
665*61046927SAndroid Build Coastguard Worker    autotune_result->samples =
666*61046927SAndroid Build Coastguard Worker       (struct tu_renderpass_samples *) tu_suballoc_bo_map(
667*61046927SAndroid Build Coastguard Worker          &autotune_result->bo);
668*61046927SAndroid Build Coastguard Worker 
669*61046927SAndroid Build Coastguard Worker    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
670*61046927SAndroid Build Coastguard Worker    if (cmd->device->physical_device->info->a7xx.has_event_write_sample_count) {
671*61046927SAndroid Build Coastguard Worker       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
672*61046927SAndroid Build Coastguard Worker       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
673*61046927SAndroid Build Coastguard Worker                                        .write_sample_count = true).value);
674*61046927SAndroid Build Coastguard Worker       tu_cs_emit_qw(cs, result_iova);
675*61046927SAndroid Build Coastguard Worker 
676*61046927SAndroid Build Coastguard Worker       /* If the renderpass contains an occlusion query with its own ZPASS_DONE,
677*61046927SAndroid Build Coastguard Worker        * we have to provide a fake ZPASS_DONE event here to logically close the
678*61046927SAndroid Build Coastguard Worker        * previous one, preventing firmware from misbehaving due to nested events.
679*61046927SAndroid Build Coastguard Worker        * This writes into the samples_end field, which will be overwritten in
680*61046927SAndroid Build Coastguard Worker        * tu_autotune_end_renderpass.
681*61046927SAndroid Build Coastguard Worker        */
682*61046927SAndroid Build Coastguard Worker       if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) {
683*61046927SAndroid Build Coastguard Worker          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
684*61046927SAndroid Build Coastguard Worker          tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
685*61046927SAndroid Build Coastguard Worker                                           .write_sample_count = true,
686*61046927SAndroid Build Coastguard Worker                                           .sample_count_end_offset = true,
687*61046927SAndroid Build Coastguard Worker                                           .write_accum_sample_count_diff = true).value);
688*61046927SAndroid Build Coastguard Worker          tu_cs_emit_qw(cs, result_iova);
689*61046927SAndroid Build Coastguard Worker       }
690*61046927SAndroid Build Coastguard Worker    } else {
691*61046927SAndroid Build Coastguard Worker       tu_cs_emit_regs(cs,
692*61046927SAndroid Build Coastguard Worker                         A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
693*61046927SAndroid Build Coastguard Worker       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
694*61046927SAndroid Build Coastguard Worker       tu_cs_emit(cs, ZPASS_DONE);
695*61046927SAndroid Build Coastguard Worker    }
696*61046927SAndroid Build Coastguard Worker }
697*61046927SAndroid Build Coastguard Worker TU_GENX(tu_autotune_begin_renderpass);
698*61046927SAndroid Build Coastguard Worker 
699*61046927SAndroid Build Coastguard Worker template <chip CHIP>
tu_autotune_end_renderpass(struct tu_cmd_buffer * cmd,struct tu_cs * cs,struct tu_renderpass_result * autotune_result)700*61046927SAndroid Build Coastguard Worker void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
701*61046927SAndroid Build Coastguard Worker                                 struct tu_cs *cs,
702*61046927SAndroid Build Coastguard Worker                                 struct tu_renderpass_result *autotune_result)
703*61046927SAndroid Build Coastguard Worker {
704*61046927SAndroid Build Coastguard Worker    if (!autotune_result)
705*61046927SAndroid Build Coastguard Worker       return;
706*61046927SAndroid Build Coastguard Worker 
707*61046927SAndroid Build Coastguard Worker    if (!autotune_result->bo.iova)
708*61046927SAndroid Build Coastguard Worker       return;
709*61046927SAndroid Build Coastguard Worker 
710*61046927SAndroid Build Coastguard Worker    uint64_t result_iova = autotune_result->bo.iova;
711*61046927SAndroid Build Coastguard Worker 
712*61046927SAndroid Build Coastguard Worker    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
713*61046927SAndroid Build Coastguard Worker 
714*61046927SAndroid Build Coastguard Worker    if (cmd->device->physical_device->info->a7xx.has_event_write_sample_count) {
715*61046927SAndroid Build Coastguard Worker       /* If the renderpass contains ZPASS_DONE events we emit a fake ZPASS_DONE
716*61046927SAndroid Build Coastguard Worker        * event here, composing a pair of these events that firmware handles without
717*61046927SAndroid Build Coastguard Worker        * issue. This first event writes into the samples_end field and the second
718*61046927SAndroid Build Coastguard Worker        * event overwrites it. The second event also enables the accumulation flag
719*61046927SAndroid Build Coastguard Worker        * even when we don't use that result because the blob always sets it.
720*61046927SAndroid Build Coastguard Worker        */
721*61046927SAndroid Build Coastguard Worker       if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) {
722*61046927SAndroid Build Coastguard Worker          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
723*61046927SAndroid Build Coastguard Worker          tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
724*61046927SAndroid Build Coastguard Worker                                           .write_sample_count = true).value);
725*61046927SAndroid Build Coastguard Worker          tu_cs_emit_qw(cs, result_iova + offsetof(struct tu_renderpass_samples, samples_end));
726*61046927SAndroid Build Coastguard Worker       }
727*61046927SAndroid Build Coastguard Worker 
728*61046927SAndroid Build Coastguard Worker       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
729*61046927SAndroid Build Coastguard Worker       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
730*61046927SAndroid Build Coastguard Worker                                        .write_sample_count = true,
731*61046927SAndroid Build Coastguard Worker                                        .sample_count_end_offset = true,
732*61046927SAndroid Build Coastguard Worker                                        .write_accum_sample_count_diff = true).value);
733*61046927SAndroid Build Coastguard Worker       tu_cs_emit_qw(cs, result_iova);
734*61046927SAndroid Build Coastguard Worker    } else {
735*61046927SAndroid Build Coastguard Worker       result_iova += offsetof(struct tu_renderpass_samples, samples_end);
736*61046927SAndroid Build Coastguard Worker 
737*61046927SAndroid Build Coastguard Worker       tu_cs_emit_regs(cs,
738*61046927SAndroid Build Coastguard Worker                         A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
739*61046927SAndroid Build Coastguard Worker       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
740*61046927SAndroid Build Coastguard Worker       tu_cs_emit(cs, ZPASS_DONE);
741*61046927SAndroid Build Coastguard Worker    }
742*61046927SAndroid Build Coastguard Worker }
743*61046927SAndroid Build Coastguard Worker TU_GENX(tu_autotune_end_renderpass);
744