/*
 * Copyright © 2021 Google, Inc.
 * SPDX-License-Identifier: MIT
 */

#ifndef FREEDRENO_AUTOTUNE_H
#define FREEDRENO_AUTOTUNE_H

#include "util/hash_table.h"
#include "util/list.h"

#include "freedreno_util.h"

struct fd_autotune_results;

/**
 * "autotune" our decisions about bypass vs GMEM rendering, based on historical
 * data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * cannot reasonably be estimated without some additional information:
 *
 *  (1) If you know you are touching every pixel (ie. there is a glClear()),
 *      then the GMEM path will at least not cost more memory bandwidth than
 *      sysmem[1]
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      due to the sysmem->GMEM restore pass.
 *
 *  (3) If you see a high draw count, that is an indication that there will be
 *      enough pixels accessed multiple times to benefit from the reduced
 *      memory bandwidth that GMEM brings
 *
 *  (4) But a high draw count without much overdraw can actually be faster
 *      in bypass mode if it is pushing a lot of state change, due to not
 *      having to go thru the state changes per-tile[2]
 *
 * The approach taken is to measure the samples-passed for the batch, as an
 * estimate of the amount of overdraw, to detect cases where the number of
 * pixels touched is low (see the sketch below).
 *
 * Note, however, that (at least since a5xx) we have PERF_RB_{Z,C}_{READ,WRITE}
 * performance countables, which give a more direct measurement of what we want
 * to know (ie. is framebuffer memory access high enough to prefer GMEM), but
 * with the downside of consuming half of the available RB counters, and with
 * the additional complication that external perfcntr collection (fdperf,
 * perfetto) and the driver could be stomping on each other's feet.  (Also,
 * reading the perfcntrs accurately requires a WFI.)
 *
 * [1] ignoring UBWC
 * [2] ignoring early-tile-exit optimizations, but any draw that touches all/
 *     most of the tiles late in the tile-pass can defeat that
 */
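
/*
 * A minimal sketch of the overdraw estimate described above (not the
 * driver's actual policy; the helper name and the 1.0 cutoff below are
 * hypothetical):
 */
#if 0
static bool
example_prefer_bypass(uint64_t samples_passed, uint32_t width, uint32_t height)
{
   /* Average number of times each pixel passed the depth/stencil test;
    * around 1.0 or less means little overdraw, so GMEM's bandwidth savings
    * are unlikely to pay for the per-tile overhead.
    */
   float avg_samples_per_pixel =
      (float)samples_passed / (float)(width * height);
   return avg_samples_per_pixel < 1.0f;   /* hypothetical cutoff */
}
#endif
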
struct fd_autotune {

   /**
    * Cache to map batch->key (also used for batch-cache) to historical
    * information about rendering to that particular render target.
    */
   struct hash_table *ht;

   /**
    * List of recently used historical results (to age out old results)
    */
   struct list_head lru;

   /**
    * GPU buffer used to communicate back results to the CPU
    */
   struct fd_bo *results_mem;
   struct fd_autotune_results *results;

   /**
    * List of per-batch results that we are waiting for the GPU to finish
    * with before reading back the results.
    */
   struct list_head pending_results;

   uint32_t fence_counter;
   uint32_t idx_counter;
};
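
/*
 * A hedged sketch of how the cache fields above work together (the helper
 * name and the fd_batch_history 'node' member are assumptions; the real
 * lookup lives in freedreno_autotune.c):
 */
#if 0
static struct fd_batch_history *
example_lookup_history(struct fd_autotune *at, uint32_t hash, const void *key)
{
   struct hash_entry *entry =
      _mesa_hash_table_search_pre_hashed(at->ht, hash, key);
   if (!entry)
      return NULL;   /* caller would allocate + insert a new history */

   struct fd_batch_history *history = entry->data;

   /* Move to the tail of the LRU so that stale render targets collect at
    * the head and can be aged out ('node' is an assumed list link):
    */
   list_del(&history->node);
   list_addtail(&history->node, &at->lru);

   return history;
}
#endif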

/**
 * The layout of the memory used to read back per-batch results from the
 * GPU.
 *
 * Note that this struct is intentionally aligned to 4k, and the hw requires
 * the sample start/stop locations to be 128b aligned.
 */
struct fd_autotune_results {

   /**
    * The GPU writes back a "fence" seqno value from the cmdstream after
    * it finishes writing its result slot, so that the CPU knows when
    * results are valid.
    */
   uint32_t fence;

   uint32_t __pad0;
   uint64_t __pad1;

   /**
    * From the cmdstream, the captured samples-passed values are recorded
    * at the start and end of the batch.
    *
    * Note that we do the math on the CPU to avoid a WFI.  But pre-emption
    * may force us to revisit that.
    */
   struct {
      uint64_t samples_start;
      uint64_t samples_result;
      uint64_t samples_end;
      uint64_t __pad1;
   } result[127];
};
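
/*
 * A quick sanity sketch of the layout constraints noted above (assuming
 * "128b" means 128-bit): the 16-byte header and 32-byte result slots keep
 * every samples_start/samples_end 16-byte aligned, and the whole struct
 * fits in a 4k page.
 */
#if 0
#include <stddef.h>
_Static_assert(offsetof(struct fd_autotune_results, result) % 16 == 0,
               "result slots must start 128b aligned");
_Static_assert(sizeof(((struct fd_autotune_results *)0)->result[0]) == 32,
               "each slot is two counters plus result and pad");
_Static_assert(sizeof(struct fd_autotune_results) <= 4096,
               "results buffer fits in one page");
#endif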

#define __offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base))
#define results_ptr(at, member)                                                \
   (at)->results_mem, __offset((at)->results, &(at)->results->member), 0, 0
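
/*
 * For example (illustrative; OUT_RELOC as used elsewhere in freedreno), a
 * backend can point a reloc at one member of the results buffer:
 *
 *    OUT_RELOC(ring, results_ptr(at, result[idx].samples_start));
 *
 * which expands to the results BO, the byte offset of that member within
 * the buffer, and zeros for the remaining reloc arguments.
 */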

struct fd_batch_history;

/**
 * Tracks the results from an individual batch.  Initially created per batch,
 * and appended to the tail of at->pending_results.  At a later time, when
 * the GPU has finished writing the results, they are read back and
 * accumulated into the render target's history.
 *
 * ralloc parent is the associated fd_batch_history
 */
struct fd_batch_result {

   /**
    * The index/slot in fd_autotune_results::result[] to write start/end
    * counter to
    */
   unsigned idx;

   /**
    * Fence value to write back to fd_autotune_results::fence after both
    * start/end values are written
    */
   uint32_t fence;

   /*
    * Below here, only used internally within autotune
    */
   struct fd_batch_history *history;
   struct list_head node;
   uint32_t cost;
   uint64_t samples_passed;
};
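
/*
 * A hedged sketch of the readback flow implied above (the helper name is
 * hypothetical; the real version lives in freedreno_autotune.c): once the
 * GPU-written fence catches up to a result's fence value, the samples
 * delta can be computed on the CPU without a WFI.
 */
#if 0
static void
example_process_results(struct fd_autotune *at)
{
   uint32_t current_fence = at->results->fence;   /* written by the GPU */

   list_for_each_entry_safe (struct fd_batch_result, result,
                             &at->pending_results, node) {
      if (result->fence > current_fence)
         break;   /* results are in fence order, the rest are pending */

      result->samples_passed = at->results->result[result->idx].samples_end -
                               at->results->result[result->idx].samples_start;

      list_delinit(&result->node);
      /* ... accumulate result into result->history ... */
   }
}
#endif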

void fd_autotune_init(struct fd_autotune *at, struct fd_device *dev);
void fd_autotune_fini(struct fd_autotune *at);

struct fd_batch;
bool fd_autotune_use_bypass(struct fd_autotune *at,
                            struct fd_batch *batch) assert_dt;
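
/*
 * Typical usage sketch (the caller and the ctx->autotune embedding are
 * assumptions; the real decision point is wherever a batch is flushed
 * and the gmem vs bypass path is chosen):
 */
#if 0
static void
example_flush_batch(struct fd_context *ctx, struct fd_batch *batch)
{
   if (fd_autotune_use_bypass(&ctx->autotune, batch)) {
      /* draw directly to system memory */
   } else {
      /* tiled rendering through GMEM */
   }
}
#endif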

#endif /* FREEDRENO_AUTOTUNE_H */