/*
 * Copyright 2022 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_tilebuffer.h"
#include <assert.h>
#include "util/bitscan.h"
#include "util/format/u_format.h"
#include "agx_usc.h"
#include "layout.h"

/* Maximum number of bytes per tile on G13G. This may change in future
 * versions of the architecture.
 */
#define MAX_BYTES_PER_TILE (32768 - 1)

/* Maximum bytes per sample in the tilebuffer. Greater allocations require
 * spilling render targets to memory.
 */
#define MAX_BYTES_PER_SAMPLE (64)

/* Minimum tile size in pixels, architectural. */
#define MIN_TILE_SIZE_PX (16 * 16)

/* Select the largest tile size that fits */
static struct agx_tile_size
agx_select_tile_size(unsigned bytes_per_pixel)
{
   /* clang-format off */
   struct agx_tile_size sizes[] = {
      { 32, 32 },
      { 32, 16 },
      { 16, 16 }
   };
   /* clang-format on */

   for (unsigned i = 0; i < ARRAY_SIZE(sizes); ++i) {
      struct agx_tile_size size = sizes[i];

      if ((bytes_per_pixel * size.width * size.height) <= MAX_BYTES_PER_TILE)
         return size;
   }

   unreachable("No supported tile size meets the bytes per pixel requirement");
}

static unsigned
agx_shared_layout_from_tile_size(struct agx_tile_size t)
{
   if (t.width == 32 && t.height == 32)
      return AGX_SHARED_LAYOUT_32X32;
   else if (t.width == 32 && t.height == 16)
      return AGX_SHARED_LAYOUT_32X16;
   else if (t.width == 16 && t.height == 16)
      return AGX_SHARED_LAYOUT_16X16;
   else
      unreachable("Invalid tile size");
}

struct agx_tilebuffer_layout
agx_build_tilebuffer_layout(const enum pipe_format *formats, uint8_t nr_cbufs,
                            uint8_t nr_samples, bool layered)
{
   struct agx_tilebuffer_layout tib = {
      .nr_samples = nr_samples,
      .layered = layered,
   };

   uint32_t offset_B = 0;

   for (unsigned rt = 0; rt < nr_cbufs; ++rt) {
      tib.logical_format[rt] = formats[rt];

      /* If there are gaps in the layout, don't allocate holes. Obscurely,
       * PIPE_FORMAT_NONE has a size of 1, not 0.
       */
      if (formats[rt] == PIPE_FORMAT_NONE)
         continue;

      /* Require natural alignment for tilebuffer allocations. This could be
       * optimized, but it shouldn't be a problem in practice.
       */
      enum pipe_format physical_fmt = agx_tilebuffer_physical_format(&tib, rt);
      unsigned align_B = util_format_get_blocksize(physical_fmt);
      assert(util_is_power_of_two_nonzero(align_B) &&
             util_is_power_of_two_nonzero(MAX_BYTES_PER_SAMPLE) &&
             align_B < MAX_BYTES_PER_SAMPLE &&
             "max bytes per sample divisible by alignment");

      offset_B = ALIGN_POT(offset_B, align_B);
      assert(offset_B <= MAX_BYTES_PER_SAMPLE && "loop invariant + above");

      /* Determine the size, if we were to allocate this render target to the
       * tilebuffer as desired.
       */
      unsigned nr = util_format_get_nr_components(physical_fmt) == 1
                       ? util_format_get_nr_components(formats[rt])
                       : 1;

      unsigned size_B = align_B * nr;
      unsigned new_offset_B = offset_B + size_B;

      /* If allocating this render target would exceed any tilebuffer limits,
       * we need to spill it to memory. We continue processing in case smaller
       * render targets follow that would still fit. Otherwise, we allocate it
       * to the tilebuffer.
       *
       * TODO: Suboptimal, we might be able to reorder render targets to
       * avoid fragmentation causing spilling.
       */
      bool fits =
         (new_offset_B <= MAX_BYTES_PER_SAMPLE) &&
         (ALIGN_POT(new_offset_B, 8) * MIN_TILE_SIZE_PX * nr_samples) <=
            MAX_BYTES_PER_TILE;

      if (fits) {
         tib._offset_B[rt] = offset_B;
         offset_B = new_offset_B;
      } else {
         tib.spilled[rt] = true;
      }
   }

   assert(offset_B <= MAX_BYTES_PER_SAMPLE && "loop invariant");

   /* Multisampling needs a nonempty allocation.
    * XXX: Check this against hw
    */
   if (nr_samples > 1)
      offset_B = MAX2(offset_B, 1);

   tib.sample_size_B = ALIGN_POT(offset_B, 8);

   tib.tile_size = agx_select_tile_size(tib.sample_size_B * nr_samples);

   agx_tilebuffer_pack_usc(&tib);
   return tib;
}
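/* Worked example (illustrative, not from the original source): consider two
 * render targets at 4x MSAA, assuming both formats are natively renderable so
 * the physical tilebuffer format matches the logical one:
 *
 *    enum pipe_format formats[2] = {
 *       PIPE_FORMAT_R8G8B8A8_UNORM,     // 4 bytes per sample
 *       PIPE_FORMAT_R16G16B16A16_FLOAT, // 8 bytes per sample
 *    };
 *
 *    struct agx_tilebuffer_layout tib =
 *       agx_build_tilebuffer_layout(formats, 2, 4, false);
 *
 * RT0 occupies bytes [0, 4). RT1 is aligned up to offset 8 and ends at 16, so
 * sample_size_B = ALIGN_POT(16, 8) = 16. That is 16 * 4 = 64 bytes per pixel:
 * 64 * 32 * 16 = 32768 exceeds MAX_BYTES_PER_TILE (32767), so only the
 * minimum 16x16 tile fits, giving a per-tile footprint of 64 * 256 = 16384
 * bytes.
 */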
enum pipe_format
agx_tilebuffer_physical_format(struct agx_tilebuffer_layout *tib, unsigned rt)
{
   return ail_pixel_format[tib->logical_format[rt]].renderable;
}

bool
agx_tilebuffer_supports_mask(struct agx_tilebuffer_layout *tib, unsigned rt)
{
   /* We don't bother supporting masking with spilled render targets. This
    * might be optimized in the future, but spilling is so rare anyway that
    * it's not worth it.
    */
   if (tib->spilled[rt])
      return false;

   enum pipe_format fmt = agx_tilebuffer_physical_format(tib, rt);
   return ail_isa_format_supports_mask((enum ail_isa_format)fmt);
}

uint32_t
agx_tilebuffer_total_size(struct agx_tilebuffer_layout *tib)
{
   return tib->sample_size_B * tib->nr_samples * tib->tile_size.width *
          tib->tile_size.height;
}

void
agx_tilebuffer_pack_usc(struct agx_tilebuffer_layout *tib)
{
   agx_pack(&tib->usc, USC_SHARED, cfg) {
      if (tib->nr_samples > 0) {
         cfg.uses_shared_memory = true;
         cfg.layout = agx_shared_layout_from_tile_size(tib->tile_size);
         cfg.sample_stride_in_8_bytes = tib->sample_size_B / 8;
         cfg.sample_count = tib->nr_samples;
         cfg.bytes_per_threadgroup = agx_tilebuffer_total_size(tib);
      } else {
         cfg.layout = AGX_SHARED_LAYOUT_VERTEX_COMPUTE;
         cfg.bytes_per_threadgroup = 65536;
      }
   }
}
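/* Usage sketch (illustrative, not part of the original file): exercises the
 * helpers above on the worked example's layout. The format choices and the
 * asserted values are assumptions that hold only if those formats' physical
 * tilebuffer formats match their logical ones. Guarded out so the sketch does
 * not affect the build.
 */
#if 0
static void
agx_tilebuffer_example(void)
{
   enum pipe_format formats[2] = {
      PIPE_FORMAT_R8G8B8A8_UNORM,     /* 4 bytes per sample */
      PIPE_FORMAT_R16G16B16A16_FLOAT, /* 8 bytes per sample */
   };

   struct agx_tilebuffer_layout tib =
      agx_build_tilebuffer_layout(formats, 2, 4, false);

   /* 4 bytes + 8 bytes (aligned to 8) allocate 16 bytes per sample */
   assert(tib.sample_size_B == 16);

   /* 64 bytes per pixel forces the minimum 16x16 tile */
   assert(tib.tile_size.width == 16 && tib.tile_size.height == 16);

   /* 16 B/sample * 4 samples * 256 px = 16384 bytes per tile */
   assert(agx_tilebuffer_total_size(&tib) == 16384);
}
#endif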