/*
 * Copyright © 2012-2018 Rob Clark
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark
 */

#ifndef FREEDRENO_RINGBUFFER_H_
#define FREEDRENO_RINGBUFFER_H_

#include <stdio.h>

#include "util/u_atomic.h"
#include "util/u_debug.h"

#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "freedreno_drmif.h"
#include "freedreno_pm4.h"

#ifdef __cplusplus
extern "C" {
#endif

struct fd_submit;
struct fd_ringbuffer;

enum fd_ringbuffer_flags {

   /* Primary ringbuffer for a submit, ie. an IB1 level rb
    * which kernel must setup RB->IB1 CP_INDIRECT_BRANCH
    * packets.
    */
   FD_RINGBUFFER_PRIMARY = 0x1,

   /* Hint that the stateobj will be used for streaming state
    * that is used once or a few times and then discarded.
    *
    * For sub-allocation, non-streaming stateobj's should be
    * sub-allocated from a page size buffer, so one long lived
    * state obj doesn't prevent other pages from being freed.
    * (Ie. it would be no worse than allocating a page sized
    * bo for each small non-streaming stateobj).
    *
    * But streaming stateobj's could be sub-allocated from a
    * larger buffer to reduce the alloc/del overhead.
    */
   FD_RINGBUFFER_STREAMING = 0x2,

   /* Indicates that "growable" cmdstream can be used,
    * consisting of multiple physical cmdstream buffers.
    */
   FD_RINGBUFFER_GROWABLE = 0x4,

   /* Internal use only: */
   _FD_RINGBUFFER_OBJECT = 0x8,
};

/* A submit object manages/tracks all the state buildup for a "submit"
 * ioctl to the kernel.  Additionally, with the exception of long-lived
 * non-STREAMING stateobj rb's, rb's are allocated from the submit.
 */
struct fd_submit *fd_submit_new(struct fd_pipe *pipe);

/* NOTE: all ringbuffers created from the submit should be unref'd
 * before destroying the submit.
 */
void fd_submit_del(struct fd_submit *submit);

struct fd_submit *fd_submit_ref(struct fd_submit *submit);

/* Allocate a new rb from the submit. */
struct fd_ringbuffer *fd_submit_new_ringbuffer(struct fd_submit *submit,
                                               uint32_t size,
                                               enum fd_ringbuffer_flags flags);

/* in_fence_fd: -1 for no in-fence, else fence fd.
 * If use_fence_fd is true the output fence will be dma_fence fd backed.
 */
struct fd_fence *fd_submit_flush(struct fd_submit *submit, int in_fence_fd,
                                 bool use_fence_fd);

struct fd_ringbuffer;
struct fd_reloc;

struct fd_ringbuffer_funcs {
   void (*grow)(struct fd_ringbuffer *ring, uint32_t size);

   /**
    * Alternative to emit_reloc for the softpin case, where we only need
    * to track that the bo is used (and not track all the extra info that
    * the kernel would need to do a legacy reloc).
    */
   void (*emit_bo)(struct fd_ringbuffer *ring, struct fd_bo *bo);
   void (*assert_attached)(struct fd_ringbuffer *ring, struct fd_bo *bo);

   void (*emit_reloc)(struct fd_ringbuffer *ring, const struct fd_reloc *reloc);
   uint32_t (*emit_reloc_ring)(struct fd_ringbuffer *ring,
                               struct fd_ringbuffer *target, uint32_t cmd_idx);
   uint32_t (*cmd_count)(struct fd_ringbuffer *ring);
   bool (*check_size)(struct fd_ringbuffer *ring);
   void (*destroy)(struct fd_ringbuffer *ring);
};

/* the ringbuffer object is not opaque so that OUT_RING() type stuff
 * can be inlined.  Note that users should not make assumptions about
 * the size of this struct.
 */
struct fd_ringbuffer {
   uint32_t *cur, *end, *start;
   const struct fd_ringbuffer_funcs *funcs;

   // size or end could probably go away
   int size;
   int32_t refcnt;
   enum fd_ringbuffer_flags flags;
};

/* Allocate a new long-lived state object, not associated with
 * a submit:
 */
struct fd_ringbuffer *fd_ringbuffer_new_object(struct fd_pipe *pipe,
                                               uint32_t size);
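/* A rough usage sketch of the submit/ringbuffer lifecycle declared above
 * (illustrative only: the 'pipe' setup, buffer size and the cmdstream
 * emitted are placeholders, and error handling is omitted):
 *
 *    struct fd_submit *submit = fd_submit_new(pipe);
 *    struct fd_ringbuffer *ring =
 *       fd_submit_new_ringbuffer(submit, 0x1000,
 *                                FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
 *
 *    // ... emit cmdstream into ring, ie. OUT_PKT7()/OUT_RING()/OUT_RELOC() ...
 *
 *    struct fd_fence *fence = fd_submit_flush(submit, -1, false);
 *    // ... wait on / dispose of the returned fence as appropriate ...
 *
 *    fd_ringbuffer_del(ring);     // unref rb's before destroying the submit
 *    fd_submit_del(submit);
 */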
/*
 * Helpers for ref/unref with some extra debugging.. unref() returns true
 * once the refcount drops to zero (ie. the object is no longer live and
 * should be destroyed)
 */

static inline void
ref(int32_t *ref)
{
   ASSERTED int32_t count = p_atomic_inc_return(ref);
   /* We should never see a refcnt transition 0->1, this is a sign of a
    * zombie coming back from the dead!
    */
   assert(count != 1);
}

static inline bool
unref(int32_t *ref)
{
   int32_t count = p_atomic_dec_return(ref);
   assert(count != -1);
   return count == 0;
}

static inline void
fd_ringbuffer_del(struct fd_ringbuffer *ring)
{
   if (!unref(&ring->refcnt))
      return;

   ring->funcs->destroy(ring);
}

static inline struct fd_ringbuffer *
fd_ringbuffer_ref(struct fd_ringbuffer *ring)
{
   ref(&ring->refcnt);
   return ring;
}

static inline void
fd_ringbuffer_grow(struct fd_ringbuffer *ring, uint32_t ndwords)
{
   assert(ring->funcs->grow); /* unsupported on kgsl */
   ring->funcs->grow(ring, ring->size);
}

static inline bool
fd_ringbuffer_check_size(struct fd_ringbuffer *ring)
{
   return ring->funcs->check_size(ring);
}

static inline void
fd_ringbuffer_emit(struct fd_ringbuffer *ring, uint32_t data)
{
   (*ring->cur++) = data;
}

struct fd_reloc {
   struct fd_bo *bo;
   uint64_t iova;
   uint64_t orval;
#define FD_RELOC_READ  0x0001
#define FD_RELOC_WRITE 0x0002
#define FD_RELOC_DUMP  0x0004
   uint32_t offset;
   int32_t shift;
};

/* We always mark BOs for write, instead of tracking it across reloc
 * sources in userspace.  On the kernel side, this means we track a single
 * excl fence in the BO instead of a set of read fences, which is cheaper.
 * The downside is that a dmabuf-shared device won't be able to read in
 * parallel with a read-only access by freedreno, but most other drivers
 * have decided that that usecase isn't important enough to do this
 * tracking, as well.
 */
#define FD_RELOC_FLAGS_INIT (FD_RELOC_READ | FD_RELOC_WRITE)

/* NOTE: relocs are 2 dwords on a5xx+ */

static inline void
fd_ringbuffer_attach_bo(struct fd_ringbuffer *ring, struct fd_bo *bo)
{
   ring->funcs->emit_bo(ring, bo);
}

static inline void
fd_ringbuffer_assert_attached(struct fd_ringbuffer *ring, struct fd_bo *bo)
{
#ifndef NDEBUG
   ring->funcs->assert_attached(ring, bo);
#endif
}

static inline void
fd_ringbuffer_reloc(struct fd_ringbuffer *ring, const struct fd_reloc *reloc)
{
   ring->funcs->emit_reloc(ring, reloc);
}

static inline uint32_t
fd_ringbuffer_cmd_count(struct fd_ringbuffer *ring)
{
   if (!ring->funcs->cmd_count)
      return 1;
   return ring->funcs->cmd_count(ring);
}

static inline uint32_t
fd_ringbuffer_emit_reloc_ring_full(struct fd_ringbuffer *ring,
                                   struct fd_ringbuffer *target,
                                   uint32_t cmd_idx)
{
   return ring->funcs->emit_reloc_ring(ring, target, cmd_idx);
}

static inline uint32_t
offset_bytes(void *end, void *start)
{
   return ((char *)end) - ((char *)start);
}
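/* Sketch of how a long-lived state object can be referenced from another
 * ringbuffer via the emit_reloc_ring mechanism above (illustrative only:
 * 'pipe', 'ring', the size and the state emitted are placeholders):
 *
 *    struct fd_ringbuffer *stateobj = fd_ringbuffer_new_object(pipe, 0x100);
 *
 *    // ... emit state into stateobj, ie. OUT_PKT4()/OUT_RING() ...
 *
 *    // reference the stateobj's cmdstream from 'ring' (typically by
 *    // emitting CP_INDIRECT_BUFFER packet(s) pointing at it):
 *    OUT_RB(ring, stateobj);
 *
 *    fd_ringbuffer_del(stateobj);  // drop our reference when done with it
 */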
static inline uint32_t
fd_ringbuffer_size(struct fd_ringbuffer *ring)
{
   /* only really needed for stateobj ringbuffers, and won't really
    * do what you expect for growable rb's.. so let's just restrict
    * this to stateobj's for now:
    */
   assert(!(ring->flags & FD_RINGBUFFER_GROWABLE));
   return offset_bytes(ring->cur, ring->start);
}

static inline bool
fd_ringbuffer_empty(struct fd_ringbuffer *ring)
{
   return (fd_ringbuffer_cmd_count(ring) == 1) &&
          (offset_bytes(ring->cur, ring->start) == 0);
}

#define LOG_DWORDS 0

static inline void
OUT_RING(struct fd_ringbuffer *ring, uint32_t data)
{
   if (LOG_DWORDS) {
      fprintf(stderr, "ring[%p]: OUT_RING %04x: %08x", ring,
              (uint32_t)(ring->cur - ring->start), data);
   }
   fd_ringbuffer_emit(ring, data);
}

static inline uint64_t
__reloc_iova(struct fd_bo *bo, uint32_t offset, uint64_t orval, int32_t shift)
{
   uint64_t iova = fd_bo_get_iova(bo) + offset;

   if (shift < 0)
      iova >>= -shift;
   else
      iova <<= shift;

   iova |= orval;

   return iova;
}

/*
 * NOTE: OUT_RELOC() is 2 dwords (64b) on a5xx+
 */
static inline void
OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo, uint32_t offset,
          uint64_t orval, int32_t shift)
{
   if (LOG_DWORDS) {
      fprintf(stderr, "ring[%p]: OUT_RELOC %04x: %p+%u << %d", ring,
              (uint32_t)(ring->cur - ring->start), bo, offset, shift);
   }
   assert(offset < fd_bo_size(bo));

   uint64_t iova = __reloc_iova(bo, offset, orval, shift);

#if FD_BO_NO_HARDPIN
   uint64_t *cur = (uint64_t *)ring->cur;
   *cur = iova;
   ring->cur += 2;
   fd_ringbuffer_assert_attached(ring, bo);
#else
   struct fd_reloc reloc = {
      .bo = bo,
      .iova = iova,
      .orval = orval,
      .offset = offset,
      .shift = shift,
   };

   fd_ringbuffer_reloc(ring, &reloc);
#endif
}

static inline void
OUT_RB(struct fd_ringbuffer *ring, struct fd_ringbuffer *target)
{
   fd_ringbuffer_emit_reloc_ring_full(ring, target, 0);
}

static inline void
BEGIN_RING(struct fd_ringbuffer *ring, uint32_t ndwords)
{
   if (unlikely(ring->cur + ndwords > ring->end))
      fd_ringbuffer_grow(ring, ndwords);
}

static inline void
OUT_PKT0(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt0_hdr(regindx, cnt));
}

static inline void
OUT_PKT2(struct fd_ringbuffer *ring)
{
   BEGIN_RING(ring, 1);
   OUT_RING(ring, CP_TYPE2_PKT);
}

static inline void
OUT_PKT3(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, CP_TYPE3_PKT | ((cnt - 1) << 16) | ((opcode & 0xFF) << 8));
}

/*
 * Starting with a5xx, pkt4/pkt7 are used instead of pkt0/pkt3
 */

static inline void
OUT_PKT4(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt4_hdr((uint16_t)regindx, (uint16_t)cnt));
}

static inline void
OUT_PKT7(struct fd_ringbuffer *ring, uint32_t opcode, uint32_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt7_hdr((uint8_t)opcode, (uint16_t)cnt));
}

static inline void
OUT_WFI(struct fd_ringbuffer *ring)
{
   OUT_PKT3(ring, CP_WAIT_FOR_IDLE, 1);
   OUT_RING(ring, 0x00000000);
}

static inline void
OUT_WFI5(struct fd_ringbuffer *ring)
{
   OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
}

#ifdef __cplusplus
} /* end of extern "C" */
#endif

#endif /* FREEDRENO_RINGBUFFER_H_ */