/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef IRIS_BUFMGR_H
#define IRIS_BUFMGR_H

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>

#include "c11/threads.h"
#include "common/intel_bind_timeline.h"
#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
#include "util/list.h"
#include "util/simple_mtx.h"
#include "pipe/p_defines.h"
#include "pipebuffer/pb_slab.h"
#include "intel/dev/intel_device_info.h"

struct intel_device_info;
struct util_debug_callback;
struct isl_surf;
struct iris_syncobj;

/**
 * Memory zones.  When allocating a buffer, you can request that it is
 * placed into a specific region of the virtual address space (PPGTT).
 *
 * Most buffers can go anywhere (IRIS_MEMZONE_OTHER).  Some buffers are
 * accessed via an offset from a base address.  STATE_BASE_ADDRESS has
 * a maximum 4GB size for each region, so we need to restrict those
 * buffers to be within 4GB of the base.  Each memory zone corresponds
 * to a particular base address.
 *
 * We lay out the virtual address space as follows:
 *
 * - [0,   4K): Nothing             (empty page for null address)
 * - [4K,  4G): Shaders             (Instruction Base Address)
 * - [4G,  8G): Surfaces & Binders  (Surface State Base Address, Bindless ...)
 * - [8G, 12G): Dynamic             (Dynamic State Base Address)
 * - [12G, *):  Other               (everything else in the full 48-bit VMA)
 *
 * A special buffer for border color lives at the start of the dynamic state
 * memory zone.  This unfortunately has to be handled specially because the
 * SAMPLER_STATE "Indirect State Pointer" field is only a 24-bit pointer.
 *
 * Each GL context uses a separate GEM context, which technically gives them
 * each a separate VMA.  However, we assign addresses globally, so buffers
 * will have the same address in all GEM contexts.  This lets us have a
 * single BO field for the address, which is easy and cheap.
 */
enum iris_memory_zone {
   IRIS_MEMZONE_SHADER,
   IRIS_MEMZONE_BINDER,
   IRIS_MEMZONE_SCRATCH,
   IRIS_MEMZONE_SURFACE,
   IRIS_MEMZONE_DYNAMIC,
   IRIS_MEMZONE_OTHER,

   IRIS_MEMZONE_BORDER_COLOR_POOL,
};

/* Intentionally exclude single buffer "zones" */
#define IRIS_MEMZONE_COUNT (IRIS_MEMZONE_OTHER + 1)

#define IRIS_SCRATCH_ZONE_SIZE (8 * 1024 * 1024)
#define IRIS_BINDER_ZONE_SIZE ((1ull << 30) - IRIS_SCRATCH_ZONE_SIZE)

#define IRIS_MEMZONE_SHADER_START   (0ull * (1ull << 32))
#define IRIS_MEMZONE_BINDER_START   (1ull * (1ull << 32))
#define IRIS_MEMZONE_SCRATCH_START  IRIS_MEMZONE_BINDER_START
#define IRIS_MEMZONE_SURFACE_START  (IRIS_MEMZONE_BINDER_START + IRIS_BINDER_ZONE_SIZE)
#define IRIS_MEMZONE_DYNAMIC_START  (2ull * (1ull << 32))
#define IRIS_MEMZONE_OTHER_START    (3ull * (1ull << 32))

#define IRIS_BORDER_COLOR_POOL_ADDRESS IRIS_MEMZONE_DYNAMIC_START
#define IRIS_BORDER_COLOR_POOL_SIZE (64 * 4096)
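
/* Illustrative sketch (hypothetical values, not part of the API): because
 * each base address is programmed to the start of a 4GB-aligned zone, the
 * offset a hardware packet needs is simply the low 32 bits of a BO's
 * virtual address.  For example, a BO placed at 0x200040000 lies in the
 * [8G, 12G) dynamic zone, so it would be referenced as offset 0x00040000
 * from Dynamic State Base Address -- see
 * iris_bo_offset_from_base_address() later in this header:
 *
 *    uint64_t address = 0x200040000ull;       // example address
 *    uint32_t offset  = (uint32_t) address;   // 0x00040000
 *    assert(iris_memzone_for_address(address) == IRIS_MEMZONE_DYNAMIC);
 */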

/**
 * Classification of the various incoherent caches of the GPU into a number of
 * caching domains.
 */
enum iris_domain {
   /** Render color cache. */
   IRIS_DOMAIN_RENDER_WRITE = 0,
   /** (Hi)Z/stencil cache. */
   IRIS_DOMAIN_DEPTH_WRITE,
   /** Data port (HDC) cache. */
   IRIS_DOMAIN_DATA_WRITE,
   /** Any other read-write cache. */
   IRIS_DOMAIN_OTHER_WRITE,
   /** Vertex cache. */
   IRIS_DOMAIN_VF_READ,
   /** Texture cache. */
   IRIS_DOMAIN_SAMPLER_READ,
   /** Pull-style shader constant loads. */
   IRIS_DOMAIN_PULL_CONSTANT_READ,
   /** Any other read-only cache, including reads from non-L3 clients. */
   IRIS_DOMAIN_OTHER_READ,
   /** Number of caching domains. */
   NUM_IRIS_DOMAINS,
   /** Not a real cache, used to opt out of the cache tracking mechanism. */
   IRIS_DOMAIN_NONE = NUM_IRIS_DOMAINS
};

/**
 * Whether a caching domain is guaranteed not to write any data to memory.
 */
static inline bool
iris_domain_is_read_only(enum iris_domain access)
{
   return access >= IRIS_DOMAIN_VF_READ &&
          access <= IRIS_DOMAIN_OTHER_READ;
}

static inline bool
iris_domain_is_l3_coherent(const struct intel_device_info *devinfo,
                           enum iris_domain access)
{
   /* VF reads are coherent with the L3 on Tigerlake+ because we set
    * the "L3 Bypass Disable" bit in the vertex/index buffer packets.
    */
   if (access == IRIS_DOMAIN_VF_READ)
      return devinfo->ver >= 12;

   return access != IRIS_DOMAIN_OTHER_WRITE &&
          access != IRIS_DOMAIN_OTHER_READ;
}
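
/* Illustrative sketch (assumptions marked, not part of the API): the domains
 * above index the per-BO last_seqnos[] array declared in struct iris_bo
 * below.  Recording a render-target write on a buffer might look roughly
 * like this, where `next_seqno` is a hypothetical per-batch counter:
 *
 *    iris_bo_bump_seqno(bo, next_seqno, IRIS_DOMAIN_RENDER_WRITE);
 *
 * Read-only domains (IRIS_DOMAIN_VF_READ .. IRIS_DOMAIN_OTHER_READ) never
 * dirty memory, which is what iris_domain_is_read_only() reports.
 */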

enum iris_mmap_mode {
   IRIS_MMAP_NONE, /**< Cannot be mapped */
   IRIS_MMAP_UC,   /**< Fully uncached memory map */
   IRIS_MMAP_WC,   /**< Write-combining map with no caching of reads */
   IRIS_MMAP_WB,   /**< Write-back mapping with CPU caches enabled */
};

enum iris_heap {
   /**
    * System memory which is CPU-cached and (at least 1-way) coherent.
    *
    * This will use WB (write-back) CPU mappings.
    *
    * LLC systems and discrete cards (which enable snooping) will mostly use
    * this heap.  Non-LLC systems will only use it when explicit coherency is
    * required, as snooping is expensive there.
    */
   IRIS_HEAP_SYSTEM_MEMORY_CACHED_COHERENT,

   /**
    * System memory which is not CPU cached.
    *
    * This will use WC (write-combining) CPU mappings, which have uncached
    * read performance.  This can be used for scanout on integrated GPUs
    * (which is never coherent with CPU caches).  It will be used for most
    * buffers on non-LLC platforms, where cache coherency is expensive.
    */
   IRIS_HEAP_SYSTEM_MEMORY_UNCACHED,

   /** IRIS_HEAP_SYSTEM_MEMORY_UNCACHED + compressed, only supported in Xe2 */
   IRIS_HEAP_SYSTEM_MEMORY_UNCACHED_COMPRESSED,

   /** Device-local memory (VRAM).  Cannot be placed in system memory! */
   IRIS_HEAP_DEVICE_LOCAL,

   IRIS_HEAP_MAX_NO_VRAM = IRIS_HEAP_DEVICE_LOCAL,

   /** Device-local compressed memory, only supported in Xe2 */
   IRIS_HEAP_DEVICE_LOCAL_COMPRESSED,

   /** Device-local memory that may be evicted to system memory if needed. */
   IRIS_HEAP_DEVICE_LOCAL_PREFERRED,

   /**
    * Device-local memory (VRAM) + guarantee that it is CPU visible.
    *
    * To be used in cases that cannot be placed in system memory!
    * This will only be used when running in small PCIe bar systems.
    */
   IRIS_HEAP_DEVICE_LOCAL_CPU_VISIBLE_SMALL_BAR,

   IRIS_HEAP_MAX_LARGE_BAR = IRIS_HEAP_DEVICE_LOCAL_CPU_VISIBLE_SMALL_BAR,

   IRIS_HEAP_MAX,
};

extern const char *iris_heap_to_string[];

static inline bool
iris_heap_is_device_local(enum iris_heap heap)
{
   return heap == IRIS_HEAP_DEVICE_LOCAL ||
          heap == IRIS_HEAP_DEVICE_LOCAL_PREFERRED ||
          heap == IRIS_HEAP_DEVICE_LOCAL_CPU_VISIBLE_SMALL_BAR ||
          heap == IRIS_HEAP_DEVICE_LOCAL_COMPRESSED;
}

#define IRIS_BATCH_COUNT 3

struct iris_bo_screen_deps {
   struct iris_syncobj *write_syncobjs[IRIS_BATCH_COUNT];
   struct iris_syncobj *read_syncobjs[IRIS_BATCH_COUNT];
};

struct iris_bo {
   /**
    * Size in bytes of the buffer object.
    *
    * The size may be larger than the size originally requested for the
    * allocation, such as being aligned to page size.
    */
   uint64_t size;

   /** Buffer manager context associated with this buffer object */
   struct iris_bufmgr *bufmgr;

   /** Pre-computed hash using _mesa_hash_pointer for cache tracking sets */
   uint32_t hash;

   /** The GEM handle for this buffer object. */
   uint32_t gem_handle;

   /**
    * Canonical virtual address of the buffer inside the PPGTT
    * (Per-Process Graphics Translation Table).
    *
    * Although each hardware context has its own VMA, we assign BO's to the
    * same address in all contexts, for simplicity.
    */
   uint64_t address;

   /**
    * If non-zero, then this bo has an aux-map translation to this address.
    */
   uint64_t aux_map_address;

   /**
    * If this BO is referenced by a batch, this _may_ be the index into the
    * batch->exec_bos[] list.
    *
    * Note that a single buffer may be used by multiple batches/contexts,
    * and thus appear in multiple lists, but we only track one index here.
    * In the common case one can guess that batch->exec_bos[bo->index] == bo
    * and double check if that's true to avoid a linear list walk.
    *
    * XXX: this is not ideal now that we have more than one batch per context,
    * XXX: as the index will flop back and forth between the render index and
    * XXX: compute index...
    */
   unsigned index;

   int refcount;
   const char *name;

   /** BO cache list */
   struct list_head head;

   /**
    * Synchronization sequence number of most recent access of this BO from
    * each caching domain.
    *
    * Although this is a global field, use in multiple contexts should be
    * safe, see iris_emit_buffer_barrier_for() for details.
    *
    * Also align it to 64 bits.  This will make atomic operations faster on
    * 32 bit platforms.
    */
   alignas(8) uint64_t last_seqnos[NUM_IRIS_DOMAINS];

   /** Up to one per screen, may need realloc. */
   struct iris_bo_screen_deps *deps;
   int deps_size;

   /**
    * Boolean of whether the GPU is definitely not accessing the buffer.
    *
    * This is only valid when reusable, since non-reusable
    * buffers are those that have been shared with other
    * processes, so we don't know their state.
    */
   bool idle;

   /** Was this buffer zeroed at allocation time? */
   bool zeroed;

   union {
      struct {
         time_t free_time;

         /** Mapped address for the buffer, saved across map/unmap cycles */
         void *map;

         /** List of GEM handle exports of this buffer (bo_export) */
         struct list_head exports;

         /**
          * Kernel-assigned global name for this object
          *
          * List contains both flink named and prime fd'd objects
          */
         unsigned global_name;

         /** Prime fd used for shared buffers, -1 otherwise. */
         int prime_fd;

         /** The mmap coherency mode selected at BO allocation time */
         enum iris_mmap_mode mmap_mode;

         /** The heap selected at BO allocation time */
         enum iris_heap heap;

         /** Was this buffer imported from an external client? */
         bool imported;

         /** Has this buffer been exported to external clients? */
         bool exported;

         /** Boolean of whether this buffer can be re-used */
         bool reusable;

         /** Boolean of whether this buffer points into user memory */
         bool userptr;

         /** Boolean of whether this buffer is protected (HW encryption) */
         bool protected;

         /** Boolean of whether this buffer needs to be captured in error
          * dump.  Xe KMD requires this to be set before vm bind while i915
          * needs this set before batch_submit().
          */
         bool capture;
      } real;
      struct {
         struct pb_slab_entry entry;
         struct iris_bo *real;
      } slab;
   };
};

/* No special attributes. */
#define BO_ALLOC_PLAIN       0
/* Content is set to 0, only done in cache and slabs code paths. */
#define BO_ALLOC_ZEROED      (1<<0)
/* Allocate a cached and coherent BO; this has a performance cost on
 * integrated platforms without LLC.  Should only be used for BOs that the
 * CPU will write and read often.
 */
#define BO_ALLOC_COHERENT    (1<<1)
/* Place BO only on smem. */
#define BO_ALLOC_SMEM        (1<<2)
/* BO can be sent to display. */
#define BO_ALLOC_SCANOUT     (1<<3)
/* No sub-allocation (slabs). */
#define BO_ALLOC_NO_SUBALLOC (1<<4)
/* Place BO only on lmem. */
#define BO_ALLOC_LMEM        (1<<5)
/* Content is protected, can't be mapped and needs special handling. */
#define BO_ALLOC_PROTECTED   (1<<6)
/* BO can be exported to other applications. */
#define BO_ALLOC_SHARED      (1<<7)
/* BO will be captured in the KMD error dump. */
#define BO_ALLOC_CAPTURE     (1<<8)
/* Can be mapped. */
#define BO_ALLOC_CPU_VISIBLE (1<<9)
/* BO content is compressed. */
#define BO_ALLOC_COMPRESSED  (1<<10)

/**
 * Allocate a buffer object.
 *
 * Buffer objects are not necessarily initially mapped into CPU virtual
 * address space or graphics device aperture.  They must be mapped
 * using iris_bo_map() to be used by the CPU.
 */
struct iris_bo *iris_bo_alloc(struct iris_bufmgr *bufmgr,
                              const char *name,
                              uint64_t size,
                              uint32_t alignment,
                              enum iris_memory_zone memzone,
                              unsigned flags);

struct iris_bo *
iris_bo_create_userptr(struct iris_bufmgr *bufmgr, const char *name,
                       void *ptr, size_t size,
                       enum iris_memory_zone memzone);
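
/* Illustrative sketch (hypothetical names, not part of the API): a typical
 * allocation of a small constant buffer in system memory might look like
 * the following, where `bufmgr` is assumed to come from
 * iris_bufmgr_get_for_fd() (declared later in this header) and `size` is a
 * caller-chosen byte count:
 *
 *    struct iris_bo *bo =
 *       iris_bo_alloc(bufmgr, "example constants", size, 64,
 *                     IRIS_MEMZONE_OTHER, BO_ALLOC_PLAIN);
 *
 * Flags can be combined, e.g. BO_ALLOC_SMEM | BO_ALLOC_COHERENT for a
 * buffer the CPU will read back frequently.
 */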

/** Takes a reference on a buffer object */
static inline void
iris_bo_reference(struct iris_bo *bo)
{
   p_atomic_inc(&bo->refcount);
}

/**
 * Releases a reference on a buffer object, freeing the data if
 * no references remain.
 */
void iris_bo_unreference(struct iris_bo *bo);

#define MAP_READ          PIPE_MAP_READ
#define MAP_WRITE         PIPE_MAP_WRITE
#define MAP_ASYNC         PIPE_MAP_UNSYNCHRONIZED
#define MAP_PERSISTENT    PIPE_MAP_PERSISTENT
#define MAP_COHERENT      PIPE_MAP_COHERENT
/* internal */
#define MAP_RAW           (PIPE_MAP_DRV_PRV << 0)
#define MAP_INTERNAL_MASK (MAP_RAW)

#define MAP_FLAGS         (MAP_READ | MAP_WRITE | MAP_ASYNC | \
                           MAP_PERSISTENT | MAP_COHERENT | MAP_INTERNAL_MASK)

/**
 * Maps the buffer into userspace.
 *
 * This function will block waiting for any existing execution on the
 * buffer to complete, first.  The resulting mapping is returned.
 */
MUST_CHECK void *iris_bo_map(struct util_debug_callback *dbg,
                             struct iris_bo *bo, unsigned flags);
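
/* Illustrative sketch (not part of the API): uploading data through a CPU
 * mapping, assuming `bo` was allocated as above and `data`/`data_size` are
 * caller-provided.  MAP_ASYNC skips the implicit wait when the caller knows
 * the GPU is not using the destination range:
 *
 *    void *map = iris_bo_map(NULL, bo, MAP_WRITE | MAP_ASYNC);
 *    if (map)
 *       memcpy(map, data, data_size);
 *    iris_bo_unmap(bo);   // currently a no-op stub, see below
 */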

/**
 * Reduces the refcount on the userspace mapping of the buffer
 * object.
 */
static inline int iris_bo_unmap(struct iris_bo *bo) { return 0; }

/**
 * Waits for rendering to an object by the GPU to have completed.
 *
 * This is not required for any access to the BO by bo_map,
 * bo_subdata, etc.  It is merely a way for the driver to implement
 * glFinish.
 */
void iris_bo_wait_rendering(struct iris_bo *bo);

/**
 * Unref a buffer manager instance.
 */
void iris_bufmgr_unref(struct iris_bufmgr *bufmgr);

/**
 * Create a visible name for a buffer which can be used by other apps
 *
 * \param bo Buffer to create a name for
 * \param name Returned name
 */
int iris_bo_flink(struct iris_bo *bo, uint32_t *name);

/**
 * Returns true if the BO is backed by a real GEM object, false if it's
 * a wrapper that's suballocated from a larger BO.
 */
static inline bool
iris_bo_is_real(struct iris_bo *bo)
{
   return bo->gem_handle != 0;
}

/**
 * Unwrap any slab-allocated wrapper BOs to get the BO for the underlying
 * backing storage, which is a real BO associated with a GEM object.
 */
static inline struct iris_bo *
iris_get_backing_bo(struct iris_bo *bo)
{
   if (!iris_bo_is_real(bo))
      bo = bo->slab.real;

   /* We only allow one level of wrapping. */
   assert(iris_bo_is_real(bo));

   return bo;
}

/**
 * Is this buffer shared with external clients (imported or exported)?
 */
static inline bool
iris_bo_is_external(const struct iris_bo *bo)
{
   bo = iris_get_backing_bo((struct iris_bo *) bo);
   return bo->real.exported || bo->real.imported;
}

static inline bool
iris_bo_is_imported(const struct iris_bo *bo)
{
   bo = iris_get_backing_bo((struct iris_bo *) bo);
   return bo->real.imported;
}

static inline bool
iris_bo_is_exported(const struct iris_bo *bo)
{
   bo = iris_get_backing_bo((struct iris_bo *) bo);
   return bo->real.exported;
}

/**
 * True if the BO prefers to reside in device-local memory.
 *
 * We don't consider eviction here; this is meant to be a performance hint.
 * It will return true for BOs allocated from the LMEM or LMEM+SMEM heaps,
 * even if the buffer has been temporarily evicted to system memory.
 */
static inline bool
iris_bo_likely_local(const struct iris_bo *bo)
{
   if (!bo)
      return false;

   bo = iris_get_backing_bo((struct iris_bo *) bo);
   return iris_heap_is_device_local(bo->real.heap);
}

static inline enum iris_mmap_mode
iris_bo_mmap_mode(const struct iris_bo *bo)
{
   bo = iris_get_backing_bo((struct iris_bo *) bo);
   return bo->real.mmap_mode;
}

/**
 * Mark a buffer as being shared with other external clients.
 */
void iris_bo_mark_exported(struct iris_bo *bo);

/**
 * Returns true if mapping the buffer for write could cause the process
 * to block, due to the object being active in the GPU.
 */
bool iris_bo_busy(struct iris_bo *bo);
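
/* Illustrative sketch (not part of the API): iris_bo_busy() lets a caller
 * decide between an unsynchronized map and waiting on the GPU.  For example,
 * a hypothetical streaming-upload path might do:
 *
 *    unsigned flags = MAP_WRITE;
 *    if (!iris_bo_busy(bo))
 *       flags |= MAP_ASYNC;   // safe: no GPU work pending on this BO
 *    void *map = iris_bo_map(NULL, bo, flags);
 */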

struct iris_bufmgr *iris_bufmgr_get_for_fd(int fd, bool bo_reuse);
int iris_bufmgr_get_fd(struct iris_bufmgr *bufmgr);

struct iris_bo *iris_bo_gem_create_from_name(struct iris_bufmgr *bufmgr,
                                             const char *name,
                                             unsigned handle);

void *iris_bufmgr_get_aux_map_context(struct iris_bufmgr *bufmgr);

int iris_gem_get_tiling(struct iris_bo *bo, uint32_t *tiling);
int iris_gem_set_tiling(struct iris_bo *bo, const struct isl_surf *surf);

int iris_bo_export_dmabuf(struct iris_bo *bo, int *prime_fd);
struct iris_bo *iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd,
                                      const uint64_t modifier);

/**
 * Exports a bo as a GEM handle into a given DRM file descriptor
 *
 * \param bo Buffer to export
 * \param drm_fd File descriptor where the new handle is created
 * \param out_handle Pointer to store the new handle
 *
 * Returns 0 if the buffer was successfully exported, a non-zero error code
 * otherwise.
 */
int iris_bo_export_gem_handle_for_device(struct iris_bo *bo, int drm_fd,
                                         uint32_t *out_handle);
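
/* Illustrative sketch (hypothetical names, not part of the API): sharing a
 * buffer with another process or device via dma-buf.  The modifier passed
 * to the importer is whatever was negotiated for the image;
 * DRM_FORMAT_MOD_LINEAR is used here purely as an example:
 *
 *    int prime_fd = -1;
 *    if (iris_bo_export_dmabuf(bo, &prime_fd) == 0) {
 *       // ...pass prime_fd to the other side, which could then do...
 *       struct iris_bo *imported =
 *          iris_bo_import_dmabuf(other_bufmgr, prime_fd,
 *                                DRM_FORMAT_MOD_LINEAR);
 *    }
 */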

/**
 * Returns the BO's address relative to the appropriate base address.
 *
 * All of our base addresses are programmed to the start of a 4GB region,
 * so simply returning the bottom 32 bits of the BO address will give us
 * the offset from whatever base address corresponds to that memory region.
 */
static inline uint32_t
iris_bo_offset_from_base_address(struct iris_bo *bo)
{
   /* This only works for buffers in the memory zones corresponding to a
    * base address - the top, unbounded memory zone doesn't have a base.
    */
   assert(bo->address < IRIS_MEMZONE_OTHER_START);
   return bo->address;
}

/**
 * Track access of a BO from the specified caching domain and sequence number.
 *
 * Can be used without locking.  Only the most recent access (i.e. highest
 * seqno) is tracked.
 */
static inline void
iris_bo_bump_seqno(struct iris_bo *bo, uint64_t seqno,
                   enum iris_domain type)
{
   uint64_t *const last_seqno = &bo->last_seqnos[type];
   uint64_t tmp, prev_seqno = p_atomic_read(last_seqno);

   while (prev_seqno < seqno &&
          prev_seqno != (tmp = p_atomic_cmpxchg(last_seqno,
                                                prev_seqno, seqno)))
      prev_seqno = tmp;
}

/**
 * Return the PAT entry for the given heap.
 */
const struct intel_device_info_pat_entry *
iris_heap_to_pat_entry(const struct intel_device_info *devinfo,
                       enum iris_heap heap);

enum iris_memory_zone iris_memzone_for_address(uint64_t address);

int iris_bufmgr_create_screen_id(struct iris_bufmgr *bufmgr);

simple_mtx_t *iris_bufmgr_get_bo_deps_lock(struct iris_bufmgr *bufmgr);

/**
 * A pool containing SAMPLER_BORDER_COLOR_STATE entries.
 *
 * See iris_border_color.c for more information.
 */
struct iris_border_color_pool {
   struct iris_bo *bo;
   void *map;
   unsigned insert_point;

   /** Map from border colors to offsets in the buffer. */
   struct hash_table *ht;

   /** Protects insert_point and the hash table. */
   simple_mtx_t lock;
};

struct iris_border_color_pool *iris_bufmgr_get_border_color_pool(
      struct iris_bufmgr *bufmgr);

/* iris_border_color.c */
void iris_init_border_color_pool(struct iris_bufmgr *bufmgr,
                                 struct iris_border_color_pool *pool);
void iris_destroy_border_color_pool(struct iris_border_color_pool *pool);
uint32_t iris_upload_border_color(struct iris_border_color_pool *pool,
                                  union pipe_color_union *color);

uint64_t iris_bufmgr_vram_size(struct iris_bufmgr *bufmgr);
uint64_t iris_bufmgr_sram_size(struct iris_bufmgr *bufmgr);
const struct intel_device_info *
iris_bufmgr_get_device_info(struct iris_bufmgr *bufmgr);
const struct iris_kmd_backend *
iris_bufmgr_get_kernel_driver_backend(struct iris_bufmgr *bufmgr);
uint32_t iris_bufmgr_get_global_vm_id(struct iris_bufmgr *bufmgr);
bool iris_bufmgr_use_global_vm_id(struct iris_bufmgr *bufmgr);
struct intel_bind_timeline *iris_bufmgr_get_bind_timeline(struct iris_bufmgr *bufmgr);
bool iris_bufmgr_compute_engine_supported(struct iris_bufmgr *bufmgr);
uint64_t iris_bufmgr_get_dummy_aux_address(struct iris_bufmgr *bufmgr);

enum iris_madvice {
   IRIS_MADVICE_WILL_NEED = 0,
   IRIS_MADVICE_DONT_NEED = 1,
};

void iris_bo_import_sync_state(struct iris_bo *bo, int sync_file_fd);
struct iris_syncobj *iris_bo_export_sync_state(struct iris_bo *bo);

#endif /* IRIS_BUFMGR_H */