/*
 * Copyright © 2021 Google, Inc.
 * SPDX-License-Identifier: MIT
 */

#ifndef FREEDRENO_AUTOTUNE_H
#define FREEDRENO_AUTOTUNE_H

#include "util/hash_table.h"
#include "util/list.h"

#include "freedreno_util.h"

struct fd_autotune_results;

/**
 * "Autotune" our decisions about bypass vs GMEM rendering, based on
 * historical data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * cannot reasonably be estimated without some additional information:
 *
 *  (1) If you know you are touching every pixel (ie. there is a glClear()),
 *      then the GMEM path will at least not cost more memory bandwidth than
 *      sysmem[1]
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      due to the sysmem->GMEM restore pass.
 *
 *  (3) If you see a high draw count, that is an indication that there will
 *      be enough pixels accessed multiple times to benefit from the reduced
 *      memory bandwidth that GMEM brings
 *
 *  (4) But a high draw count without much overdraw can actually be faster
 *      in bypass mode if it is pushing a lot of state change, due to not
 *      having to go through the state changes per-tile[2]
 *
 * The approach taken is to measure the samples-passed for the batch to
 * estimate the amount of overdraw, detecting cases where the number of
 * pixels touched is low (see the sketch following the struct below).
 *
 * Note however, that (at least since a5xx) we have PERF_RB_{Z,C}_{READ,WRITE}
 * performance countables, which give a more direct measurement of what we
 * want to know (ie. is framebuffer memory access high enough to prefer
 * GMEM), but with the downside of consuming half of the available RB
 * counters, and the additional complication that external perfcntr
 * collection (fdperf, perfetto) and the driver could be stomping on each
 * other's feet.  (Also, reading the perfcntrs accurately requires a WFI.)
 *
 * [1] ignoring UBWC
 * [2] ignoring early-tile-exit optimizations, but any draw that touches all/
 *     most of the tiles late in the tile-pass can defeat that
 */
struct fd_autotune {

   /**
    * Cache to map batch->key (also used for batch-cache) to historical
    * information about rendering to that particular render target.
    */
   struct hash_table *ht;

   /**
    * List of recently used historical results (to age out old results)
    */
   struct list_head lru;

   /**
    * GPU buffer used to communicate back results to the CPU
    */
   struct fd_bo *results_mem;
   struct fd_autotune_results *results;

   /**
    * List of per-batch results that we are waiting for the GPU to finish
    * with before reading back the results.
    */
   struct list_head pending_results;

   uint32_t fence_counter;
   uint32_t idx_counter;
};
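/*
 * A minimal sketch (not part of this API) of how the samples-passed
 * measurement described above could feed the bypass-vs-GMEM decision; the
 * history/pfb fields used here are hypothetical stand-ins for the
 * per-render-target state the driver accumulates, and at least one
 * recorded result is assumed:
 *
 *    // Average samples-passed across the recorded results for this RT:
 *    uint64_t avg_samples = history->total_samples / history->num_results;
 *    uint64_t fb_pixels   = (uint64_t)pfb->width * pfb->height;
 *
 *    // Few samples relative to the fb size implies little overdraw, so
 *    // the bandwidth savings of GMEM are unlikely to pay for the
 *    // per-tile overhead -> prefer bypass:
 *    if (avg_samples < fb_pixels / 2)
 *       return true;
 */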
/**
 * The layout of the memory used to read back per-batch results from the
 * GPU
 *
 * Note this struct is intentionally aligned to 4k.  And the hw requires
 * the sample start/stop locations to be 128b aligned.
 */
struct fd_autotune_results {

   /**
    * The GPU writes back a "fence" seqno value from the cmdstream after
    * it finishes writing its result slot, so that the CPU knows when
    * results are valid
    */
   uint32_t fence;

   uint32_t __pad0;
   uint64_t __pad1;

   /**
    * From the cmdstream, the captured samples-passed values are recorded
    * at the start and end of the batch.
    *
    * Note that we do the math on the CPU to avoid a WFI.  But pre-emption
    * may force us to revisit that.
    */
   struct {
      uint64_t samples_start;
      uint64_t __pad0;
      uint64_t samples_end;
      uint64_t __pad1;
   } result[127];
};

#define __offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base))
#define results_ptr(at, member)                                               \
   (at)->results_mem, __offset((at)->results, &(at)->results->member), 0, 0

struct fd_batch_history;

/**
 * Tracks the results from an individual batch.  Initially created per
 * batch, and appended to the tail of at->pending_results.  Later, once the
 * GPU has finished writing back the results, they are read back and
 * accumulated into the associated fd_batch_history.
 *
 * ralloc parent is the associated fd_batch_history
 */
struct fd_batch_result {

   /**
    * The index/slot in fd_autotune_results::result[] to write the
    * start/end counters to
    */
   unsigned idx;

   /**
    * Fence value to write back to fd_autotune_results::fence after both
    * start/end values are written
    */
   uint32_t fence;

   /*
    * Below here, only used internally within autotune
    */
   struct fd_batch_history *history;
   struct list_head node;
   uint32_t cost;
   uint64_t samples_passed;
};

void fd_autotune_init(struct fd_autotune *at, struct fd_device *dev);
void fd_autotune_fini(struct fd_autotune *at);

struct fd_batch;
bool fd_autotune_use_bypass(struct fd_autotune *at,
                            struct fd_batch *batch) assert_dt;

#endif /* FREEDRENO_AUTOTUNE_H */
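/*
 * Usage sketch (illustrative, assuming the OUT_RELOC()-style emit helpers
 * used elsewhere in the driver): results_ptr() expands to the
 * (bo, offset, or, shift) argument list those helpers expect, so emitting
 * the address of a batch's start counter might look like:
 *
 *    OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_start));
 *
 * On the CPU side, a result slot is safe to read once the fence seqno the
 * GPU writes back has caught up with the value in fd_batch_result::fence:
 *
 *    if (result->fence <= at->results->fence)
 *       // ... samples_start/samples_end are now valid to read ...
 */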