/*
 * Copyright © 2016 Rob Clark
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark
 */

#ifndef FREEDRENO_BATCH_H_
#define FREEDRENO_BATCH_H_

#include "util/list.h"
#include "util/simple_mtx.h"
#include "util/u_inlines.h"
#include "util/u_queue.h"
#include "util/perf/u_trace.h"

#include "freedreno_context.h"
#include "freedreno_fence.h"
#include "freedreno_util.h"

#ifdef __cplusplus
extern "C" {
#endif

struct fd_resource;
struct fd_batch_key;
struct fd_batch_result;

/**
 * A subpass is a fragment of a batch potentially starting with a clear.
 * If the app does a mid-batch clear, that clear and subsequent draws
 * can be split out into another sub-pass.  At gmem time, the appropriate
 * sysmem or gmem clears can be interleaved with the CP_INDIRECT_BUFFER
 * to the subpass's draw cmdstream.
 *
 * For depth clears, a replacement LRZ buffer can be allocated (the clear
 * is still inserted into the prologue cmdstream since it needs to be
 * executed even in sysmem or if we aren't binning, because later batches
 * could depend on the LRZ state).  The alternative would be to invalidate
 * LRZ for draws after the start of the new subpass.
 */
struct fd_batch_subpass {
   struct list_head node;

   /** draw pass cmdstream: */
   struct fd_ringbuffer *draw;

   /** for the gmem code to stash per tile per subpass clears */
   struct fd_ringbuffer *subpass_clears;

   BITMASK_ENUM(fd_buffer_mask) fast_cleared;

   union pipe_color_union clear_color[MAX_RENDER_TARGETS];
   double clear_depth;
   unsigned clear_stencil;

   /**
    * The number of draws emitted to this subpass.  If it is greater than
    * zero, a clear triggers creating a new subpass (because clears must
    * always come at the start of a subpass).
    */
   unsigned num_draws;

   /**
    * If a subpass starts with an LRZ clear, it gets a new LRZ buffer.
    * fd_resource::lrz always tracks the current LRZ buffer, but at
    * binning/gmem time we need to know which LRZ buffer was current at
    * the time draws were emitted to the subpass.  That is tracked here.
    */
   struct fd_bo *lrz;
};
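
/*
 * Illustrative sketch (an assumption, not code from this driver): the
 * intended flow is that a clear arriving after draws have already been
 * emitted starts a new subpass rather than forcing a flush, roughly:
 *
 *    if (batch->subpass->num_draws > 0)
 *       batch->subpass = fd_batch_create_subpass(batch);
 *    // record the clear color/depth/stencil in the new subpass, then
 *    // continue emitting draws to batch->subpass->draw
 *
 * (fd_batch_create_subpass() is declared further below; the real logic
 * lives in the .c files.)
 */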
/**
 * A batch tracks everything about a cmdstream batch/submit, including the
 * ringbuffers used for binning, draw, and gmem cmds, list of associated
 * fd_resource-s, etc.
 */
struct fd_batch {
   struct pipe_reference reference;
   unsigned seqno;
   unsigned idx; /* index into cache->batches[] */

   struct u_trace trace;

   /* To detect cases where we can skip cmdstream to record timestamp: */
   uint32_t *last_timestamp_cmd;

   int in_fence_fd;
   struct pipe_fence_handle *fence;

   struct fd_context *ctx;

   /* update seqno of most recent draw/etc to the batch. */
   uint32_t update_seqno;

   /* do we need to mem2gmem before rendering.  We don't, if for example,
    * there was a glClear() that invalidated the entire previous buffer
    * contents.  Keep track of which buffer(s) are cleared, or need
    * restore.  Masks of PIPE_CLEAR_*
    *
    * The 'cleared' bits will be set for buffers which are *entirely*
    * cleared.
    *
    * The 'invalidated' bits are set for cleared buffers, and buffers
    * where the contents are undefined, ie. what we don't need to restore
    * to gmem.
    */
   BITMASK_ENUM(fd_buffer_mask) invalidated, cleared, restore, resolve;

   /* is this a non-draw batch (ie. compute/blit which has no pfb state)? */
   bool nondraw : 1;
   bool needs_flush : 1;
   bool flushed : 1;
   bool tessellation : 1; /* tessellation used in batch */

   /* Keep track of whether WAIT_FOR_IDLE is needed for registers we need
    * to update via RMW:
    */
   bool needs_wfi : 1;

   /* To decide whether to render to system memory, keep track of the
    * number of draws, and whether any of them require multisample,
    * depth_test (or depth write), stencil_test, blending, and
    * color_logic_op (since those functions are disabled when bypassing
    * GMEM).
    */
   BITMASK_ENUM(fd_gmem_reason) gmem_reason;

   /* At submit time, once we've decided that this batch will use GMEM
    * rendering, the appropriate gmem state is looked up:
    */
   const struct fd_gmem_stateobj *gmem_state;

   /* Driver specific barrier/flush flags: */
   unsigned barrier;

   /* A calculated "draw cost" value for the batch, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *       cost += num_mrt;
    *       if (blend_enabled)
    *          cost += num_mrt;
    *       if (depth_test_enabled)
    *          cost++;
    *       if (depth_write_enabled)
    *          cost++;
    *    }
    *
    * The idea is that each sample-passed minimally does one write
    * per MRT.  If blend is enabled, the hw will additionally do
    * a framebuffer read per sample-passed (for each MRT with blend
    * enabled).  If depth-test is enabled, the hw will additionally
    * do a depth buffer read.  If depth-write is enabled, the hw will
    * additionally do a depth buffer write.
    *
    * This ignores depth buffer traffic for samples which do not pass
    * due to depth-test fail, and some other details.  But it is just
    * intended to be a rough estimate that is easy to calculate.
    */
   unsigned cost;

   /* Tells the gen specific backend where to write stats used for
    * the autotune module.
    *
    * Pointer only valid during gmem emit code.
    */
   struct fd_batch_result *autotune_result;

   unsigned num_draws;    /* number of draws in current batch */
   unsigned num_vertices; /* number of vertices in current batch */

   /* Currently only used on a6xx, to calculate vsc prim/draw stream
    * sizes:
    */
   unsigned num_bins_per_pipe;
   unsigned prim_strm_bits;
   unsigned draw_strm_bits;

   /* Track the maximal bounds of the scissor of all the draws within a
    * batch.  Used at the tile rendering step (fd_gmem_render_tiles(),
    * mem2gmem/gmem2mem) to avoid needlessly moving data in/out of gmem.
    *
    * Note that unlike gallium state, maxx/maxy are inclusive (for a
    * fully covered 512x512 the scissor would be 0,0+511,511)
    */
   struct pipe_scissor_state max_scissor;

   /* Keep track of DRAW initiators that need to be patched up depending
    * on whether we are using binning or not:
    */
   struct util_dynarray draw_patches;

   /* texture state that needs patching for fb_read: */
   struct util_dynarray fb_read_patches;

   /* Keep track of writes to RB_RENDER_CONTROL which need to be patched
    * once we know whether or not to use GMEM, and GMEM tile pitch.
    *
    * (only for a3xx.. but having gen specific subclasses of fd_batch
    * seemed overkill for now)
    */
   struct util_dynarray rbrc_patches;

   /* Keep track of GMEM related values that need to be patched up once we
    * know the gmem layout:
    */
   struct util_dynarray gmem_patches;

   /* Keep track of the pointer to the start of MEM exports for a20x binning
    * shaders, so the end of the shader can be cut off at the right point
    * depending on the GMEM configuration.
    */
   struct util_dynarray shader_patches;

   struct pipe_framebuffer_state framebuffer;

   struct fd_submit *submit;

   /**
    * List of fd_batch_subpass.
    */
   struct list_head subpasses;

#define foreach_subpass(subpass, batch)                                        \
   list_for_each_entry (struct fd_batch_subpass, subpass, &batch->subpasses, node)
#define foreach_subpass_safe(subpass, batch)                                   \
   list_for_each_entry_safe (struct fd_batch_subpass, subpass, &batch->subpasses, node)

   /**
    * The current subpass.
    */
   struct fd_batch_subpass *subpass;

   /**
    * just a reference to the current subpass's draw cmds for backwards compat.
    */
   struct fd_ringbuffer *draw;
   /** binning pass cmdstream: */
   struct fd_ringbuffer *binning;
   /** tiling/gmem (IB0) cmdstream: */
   struct fd_ringbuffer *gmem;

   /** preamble cmdstream (executed once before first tile): */
   struct fd_ringbuffer *prologue;

   /** epilogue cmdstream (executed after each tile): */
   struct fd_ringbuffer *tile_epilogue;

   /** epilogue cmdstream (executed after all tiles): */
   struct fd_ringbuffer *epilogue;

   struct fd_ringbuffer *tile_loads;
   struct fd_ringbuffer *tile_store;

   /**
    * hw query related state:
    */
   /*@{*/
   /* next sample offset.. incremented for each sample in the batch/
    * submit, reset to zero on next submit.
    */
   uint32_t next_sample_offset;

   /* The # of pipeline-stats queries running.  In case of nested
    * queries using {START/STOP}_{PRIMITIVE,FRAGMENT,COMPUTE}_CNTRS,
    * we need to start only on the first one and stop only on the
    * last one.
    */
   uint8_t pipeline_stats_queries_active[3];

   /* cached samples (in case multiple queries need to reference
    * the same sample snapshot)
    */
   struct fd_hw_sample *sample_cache[MAX_HW_SAMPLE_PROVIDERS];

   /* which sample providers were used in the current batch: */
   uint32_t query_providers_used;

   /* which sample providers are currently enabled in the batch: */
   uint32_t query_providers_active;

   /* list of samples in current batch: */
   struct util_dynarray samples;

   /* current query result bo and tile stride: */
   struct pipe_resource *query_buf;
   uint32_t query_tile_stride;
   /*@}*/

   /* Set of resources used by currently-unsubmitted batch (read or
    * write).. does not hold a reference to the resource.
    */
   struct set *resources;

   /** key in batch-cache (if not null): */
   struct fd_batch_key *key;
   uint32_t hash;

   /** set of dependent batches.. holds refs to dependent batches: */
   uint32_t dependents_mask;
};
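
/*
 * Illustrative sketch (an assumption, not code from this driver): the
 * foreach_subpass() helper defined above is how per-subpass state can be
 * walked, e.g. at gmem time:
 *
 *    struct fd_batch_subpass *subpass;
 *    foreach_subpass (subpass, batch) {
 *       // emit any per-tile clears stashed in subpass->subpass_clears,
 *       // then CP_INDIRECT_BUFFER to subpass->draw
 *    }
 */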
struct fd_batch *fd_batch_create(struct fd_context *ctx, bool nondraw);
struct fd_batch_subpass *fd_batch_create_subpass(struct fd_batch *batch) assert_dt;

void fd_batch_set_fb(struct fd_batch *batch,
                     const struct pipe_framebuffer_state *pfb) assert_dt;

void fd_batch_flush(struct fd_batch *batch) assert_dt;
bool fd_batch_has_dep(struct fd_batch *batch, struct fd_batch *dep) assert_dt;
void fd_batch_add_dep(struct fd_batch *batch, struct fd_batch *dep) assert_dt;
void fd_batch_resource_write(struct fd_batch *batch,
                             struct fd_resource *rsc) assert_dt;
void fd_batch_resource_read_slowpath(struct fd_batch *batch,
                                     struct fd_resource *rsc) assert_dt;
void fd_batch_check_size(struct fd_batch *batch) assert_dt;

uint32_t fd_batch_key_hash(const void *_key);
bool fd_batch_key_equals(const void *_a, const void *_b);
struct fd_batch_key *fd_batch_key_clone(void *mem_ctx,
                                        const struct fd_batch_key *key);

/* not called directly: */
void __fd_batch_describe(char *buf, const struct fd_batch *batch) assert_dt;
void __fd_batch_destroy_locked(struct fd_batch *batch);
void __fd_batch_destroy(struct fd_batch *batch);
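
/*
 * Illustrative sketch (an assumption, not code from this driver): a minimal
 * lifecycle for a standalone non-draw batch, using the declarations above
 * and fd_batch_reference() defined further below:
 *
 *    struct fd_batch *batch = fd_batch_create(ctx, true);  // nondraw
 *    // ... record compute/blit cmds into batch->draw ...
 *    fd_batch_flush(batch);
 *    fd_batch_reference(&batch, NULL);  // drop our reference
 */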

/*
 * NOTE the rule is, you need to hold the screen->lock when destroying
 * a batch..  so either use fd_batch_reference() (which grabs the lock
 * for you) if you don't hold the lock, or fd_batch_reference_locked()
 * if you do hold the lock.
 *
 * WARNING the _locked() version can briefly drop the lock.  Without
 * recursive mutexes, I'm not sure there is much else we can do (since
 * __fd_batch_destroy() needs to unref resources)
 *
 * WARNING you must acquire the screen->lock and use the _locked()
 * version in case the batch being ref'd can disappear under you.
 */

static inline void
fd_batch_reference_locked(struct fd_batch **ptr, struct fd_batch *batch)
{
   struct fd_batch *old_batch = *ptr;

   /* only need lock if a reference is dropped: */
   if (old_batch)
      fd_screen_assert_locked(old_batch->ctx->screen);

   if (pipe_reference_described(
          &(*ptr)->reference, &batch->reference,
          (debug_reference_descriptor)__fd_batch_describe))
      __fd_batch_destroy_locked(old_batch);

   *ptr = batch;
}

static inline void
fd_batch_reference(struct fd_batch **ptr, struct fd_batch *batch)
{
   struct fd_batch *old_batch = *ptr;

   if (pipe_reference_described(
          &(*ptr)->reference, &batch->reference,
          (debug_reference_descriptor)__fd_batch_describe))
      __fd_batch_destroy(old_batch);

   *ptr = batch;
}

/**
 * Mark the batch as having something worth flushing (rendering, blit, query,
 * etc)
 */
static inline void
fd_batch_needs_flush(struct fd_batch *batch)
{
   batch->needs_flush = true;
   batch->update_seqno = ++batch->ctx->update_count;
   fd_pipe_fence_ref(&batch->ctx->last_fence, NULL);
}

/* Since we reorder batches and can pause/resume queries (notably for disabling
 * queries during some meta operations), we update the current query state for
 * the batch before each draw.
 */
static inline void
fd_batch_update_queries(struct fd_batch *batch) assert_dt
{
   struct fd_context *ctx = batch->ctx;

   if (!(ctx->dirty & FD_DIRTY_QUERY))
      return;

   ctx->query_update_batch(batch, false);
}

static inline void
fd_batch_finish_queries(struct fd_batch *batch) assert_dt
{
   struct fd_context *ctx = batch->ctx;

   ctx->query_update_batch(batch, true);
}

static inline void
fd_reset_wfi(struct fd_batch *batch)
{
   batch->needs_wfi = true;
}

void fd_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt;

/* emit a CP_EVENT_WRITE:
 * (a6xx+ cannot use this, use fd6_event_write.)
 */
#ifndef __cplusplus
static inline void
fd_event_write(struct fd_batch *batch, struct fd_ringbuffer *ring,
               enum vgt_event_type evt)
{
   OUT_PKT3(ring, CP_EVENT_WRITE, 1);
   OUT_RING(ring, evt);
   fd_reset_wfi(batch);
}
#endif

/* Get per-tile epilogue */
static inline struct fd_ringbuffer *
fd_batch_get_tile_epilogue(struct fd_batch *batch)
{
   if (batch->tile_epilogue == NULL) {
      batch->tile_epilogue =
         fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_GROWABLE);
   }

   return batch->tile_epilogue;
}

/* Get epilogue run after all tiles */
static inline struct fd_ringbuffer *
fd_batch_get_epilogue(struct fd_batch *batch)
{
   if (batch->epilogue == NULL) {
      batch->epilogue =
         fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_GROWABLE);
   }

   return batch->epilogue;
}

struct fd_ringbuffer *fd_batch_get_prologue(struct fd_batch *batch);

#ifdef __cplusplus
}
#endif

#endif /* FREEDRENO_BATCH_H_ */