/*
 * Copyright © 2021 Valve Corporation
 * Copyright © 2014 Rob Clark
 * SPDX-License-Identifier: MIT
 */

#include "ir3_ra.h"
#include "ir3_shader.h"
#include "util/u_math.h"

/* Allocating shared registers can pose a challenge, because their live
 * intervals use the physical CFG which has extra edges inserted that are
 * pretty much always critical edges. This causes problems with phi nodes,
 * because copies for phi nodes have to happen "along the edge," and similarly
 * causes problems when reunifying values that have had their live range split.
 * Problematic phi nodes should be relatively rare, so we ban them for now.
 * The solution we choose for live-range splitting is to integrate spilling and
 * register allocation and spill to vector registers rather than split a live
 * range, which negates some of the advantages of SSA-based RA, but it isn't as
 * bad as it seems because the conditions needed (vector shared registers, which
 * only movmsk currently produces, or fixed registers which we don't do) are
 * relatively rare. Spilling is also much cheaper than spilling vector registers
 * to private memory.
 */

struct ra_interval {
   struct ir3_reg_interval interval;

   struct rb_node physreg_node;
   physreg_t physreg_start, physreg_end;

   /* Where the shared register is spilled to. If there were no uses when it's
    * spilled it could be the original defining instruction.
    */
   struct ir3_register *spill_def;

   /* Whether this contains a source of the current instruction that can't be
    * spilled.
    */
   bool src;

   bool needs_reload;
};

struct ra_block_state {
   bool visited;

   /* For blocks whose successors are visited first (i.e. loop backedges), which
    * values should be live at the end.
    */
   BITSET_WORD *live_out;
};

struct ra_ctx {
   struct ir3_reg_ctx reg_ctx;

   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);

   struct rb_tree physreg_intervals;

   struct ra_interval *intervals;

   struct ir3_liveness *live;

   struct hash_table *pcopy_src_map;

   struct ra_block_state *blocks;

   unsigned start;
};

static struct ra_interval *
ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
{
   return rb_node_data(struct ra_interval, interval, interval);
}

static struct ra_interval *
rb_node_to_interval(struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static const struct ra_interval *
rb_node_to_interval_const(const struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static struct ra_interval *
ra_interval_next(struct ra_interval *interval)
{
   struct rb_node *next = rb_node_next(&interval->physreg_node);
   return next ? rb_node_to_interval(next) : NULL;
}

static struct ra_interval *
ra_interval_next_or_null(struct ra_interval *interval)
{
   return interval ? ra_interval_next(interval) : NULL;
}
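
/* In addition to the generic interval tree in ir3_reg_ctx, intervals are kept
 * in a tree sorted by physical register offset (physreg_intervals), so that
 * the allocation and spilling code below can walk the occupied ranges of the
 * shared file from left to right.
 */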

static int
ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ra_interval *a = rb_node_to_interval_const(_a);
   const struct ra_interval *b = rb_node_to_interval_const(_b);
   return b->physreg_start - a->physreg_start;
}

static int
ra_interval_cmp(const struct rb_node *node, const void *data)
{
   physreg_t reg = *(const physreg_t *)data;
   const struct ra_interval *interval = rb_node_to_interval_const(node);
   if (interval->physreg_start > reg)
      return -1;
   else if (interval->physreg_end <= reg)
      return 1;
   else
      return 0;
}

static struct ra_ctx *
ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
{
   return rb_node_data(struct ra_ctx, ctx, reg_ctx);
}

static struct ra_interval *
ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
{
   struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
   return node ? rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ra_interval *
ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
{
   struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
   if (!interval) {
      return NULL;
   } else if (interval->physreg_end > reg) {
      return interval;
   } else {
      /* There is no interval covering reg, and ra_file_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
      return ra_interval_next_or_null(interval);
   }
}

static struct ra_interval *
ra_ctx_search_right(struct ra_ctx *ctx, physreg_t reg)
{
   return ra_interval_search_right(&ctx->physreg_intervals, reg);
}

static void
interval_add(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);

   /* We can assume in this case that physreg_start/physreg_end is already
    * initialized.
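    * (Callers such as handle_dst() and reload_src() fill these in before
    * calling ir3_reg_interval_insert().)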
*/ for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { BITSET_CLEAR(ctx->available, i); } rb_tree_insert(&ctx->physreg_intervals, &interval->physreg_node, ra_interval_insert_cmp); } static void interval_delete(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval) { struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx); for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { BITSET_SET(ctx->available, i); } rb_tree_remove(&ctx->physreg_intervals, &interval->physreg_node); } static void interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent, struct ir3_reg_interval *_child) { struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent); struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child); child->physreg_start = parent->physreg_start + (child->interval.reg->interval_start - parent->interval.reg->interval_start); child->physreg_end = child->physreg_start + (child->interval.reg->interval_end - child->interval.reg->interval_start); interval_add(ctx, _child); } static void ra_ctx_init(struct ra_ctx *ctx) { ctx->reg_ctx.interval_add = interval_add; ctx->reg_ctx.interval_delete = interval_delete; ctx->reg_ctx.interval_readd = interval_readd; } static void ra_ctx_reset_block(struct ra_ctx *ctx) { for (unsigned i = 0; i < RA_SHARED_SIZE; i++) { BITSET_SET(ctx->available, i); } rb_tree_init(&ctx->reg_ctx.intervals); rb_tree_init(&ctx->physreg_intervals); } static void ra_interval_init(struct ra_interval *interval, struct ir3_register *reg) { ir3_reg_interval_init(&interval->interval, reg); } static physreg_t ra_interval_get_physreg(const struct ra_interval *interval) { unsigned child_start = interval->interval.reg->interval_start; while (interval->interval.parent) { interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); } return interval->physreg_start + (child_start - interval->interval.reg->interval_start); } static unsigned ra_interval_get_num(const struct ra_interval *interval) { return ra_physreg_to_num(ra_interval_get_physreg(interval), interval->interval.reg->flags); } static void ra_interval_dump(struct log_stream *stream, struct ra_interval *interval) { mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start); ir3_reg_interval_dump(stream, &interval->interval); } static void ra_ctx_dump(struct ra_ctx *ctx) { struct log_stream *stream = mesa_log_streami(); mesa_log_stream_printf(stream, "shared:\n"); rb_tree_foreach (struct ra_interval, interval, &ctx->physreg_intervals, physreg_node) { ra_interval_dump(stream, interval); } unsigned start, end; mesa_log_stream_printf(stream, "available:\n"); BITSET_FOREACH_RANGE (start, end, ctx->available, RA_SHARED_SIZE) { mesa_log_stream_printf(stream, "%u-%u ", start, end); } mesa_log_stream_printf(stream, "\n"); mesa_log_stream_printf(stream, "start: %u\n", ctx->start); } static bool get_reg_specified(struct ra_ctx *ctx, struct ir3_register *reg, physreg_t physreg) { for (unsigned i = 0; i < reg_size(reg); i++) { if (!BITSET_TEST(ctx->available, physreg + i)) return false; } return true; } static unsigned reg_file_size(struct ir3_register *reg) { if (reg->flags & IR3_REG_HALF) return RA_SHARED_HALF_SIZE; else return RA_SHARED_SIZE; } static physreg_t find_best_gap(struct ra_ctx *ctx, struct ir3_register *dst, unsigned size, unsigned align) { unsigned file_size = reg_file_size(dst); /* This can happen if we create a very large merge set. 
    * Just bail out in that case.
    */
   if (size > file_size)
      return (physreg_t)~0;

   unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
   unsigned candidate = start;
   do {
      bool is_available = true;
      for (unsigned i = 0; i < size; i++) {
         if (!BITSET_TEST(ctx->available, candidate + i)) {
            is_available = false;
            break;
         }
      }

      if (is_available) {
         ctx->start = (candidate + size) % file_size;
         return candidate;
      }

      candidate += align;
      if (candidate + size > file_size)
         candidate = 0;
   } while (candidate != start);

   return (physreg_t)~0;
}

static physreg_t
find_best_spill_reg(struct ra_ctx *ctx, struct ir3_register *reg, unsigned size,
                    unsigned align)
{
   unsigned file_size = reg_file_size(reg);
   unsigned min_cost = UINT_MAX;
   unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
   physreg_t candidate = start;
   physreg_t best_reg = (physreg_t)~0;

   do {
      unsigned cost = 0;

      /* Iterate through intervals we'd need to spill to use this reg. */
      for (struct ra_interval *interval = ra_ctx_search_right(ctx, candidate);
           interval && interval->physreg_start < candidate + size;
           interval = ra_interval_next_or_null(interval)) {
         /* We can't spill sources of the current instruction when reloading
          * sources.
          */
         if (interval->src) {
            cost = UINT_MAX;
            break;
         }

         /* We prefer spilling intervals that already have been spilled, so we
          * don't have to emit another mov.
          */
         if (!interval->spill_def)
            cost += (interval->physreg_end - interval->physreg_start);
      }

      if (cost < min_cost) {
         min_cost = cost;
         best_reg = candidate;
      }

      candidate += align;
      if (candidate + size > file_size)
         candidate = 0;
   } while (candidate != start);

   return best_reg;
}

static struct ir3_register *
split(struct ir3_register *def, unsigned offset, struct ir3_instruction *before)
{
   if (reg_elems(def) == 1) {
      assert(offset == 0);
      return def;
   }

   struct ir3_instruction *split =
      ir3_instr_create(before->block, OPC_META_SPLIT, 1, 1);
   split->split.off = offset;
   struct ir3_register *dst = __ssa_dst(split);
   struct ir3_register *src =
      ir3_src_create(split, INVALID_REG, def->flags & (IR3_REG_HALF | IR3_REG_SSA));
   src->wrmask = def->wrmask;
   src->def = def;
   ir3_instr_move_after(split, before);
   return dst;
}

static struct ir3_register *
extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
        struct ir3_instruction *before)
{
   if (offset == 0 && elems == reg_elems(parent_def))
      return parent_def;

   if (elems == 1)
      return split(parent_def, offset, before);

   struct ir3_instruction *collect =
      ir3_instr_create(before->block, OPC_META_COLLECT, 1, elems);
   struct ir3_register *dst = __ssa_dst(collect);
   dst->flags |= parent_def->flags & IR3_REG_HALF;
   dst->wrmask = MASK(elems);
   ir3_instr_move_after(collect, before);

   for (unsigned i = 0; i < elems; i++) {
      ir3_src_create(collect, INVALID_REG,
                     parent_def->flags & (IR3_REG_HALF | IR3_REG_SSA))->def =
         split(parent_def, offset + i, before);
   }

   return dst;
}

static void
spill_interval_children(struct ra_interval *interval,
                        struct ir3_instruction *before)
{
   rb_tree_foreach (struct ra_interval, child, &interval->interval.children,
                    interval.node) {
      if (!child->spill_def) {
         child->spill_def =
            extract(interval->spill_def,
                    (child->interval.reg->interval_start -
                     interval->interval.reg->interval_start) /
                    reg_elem_size(interval->interval.reg),
                    reg_elems(child->interval.reg), before);
      }
      spill_interval_children(child, before);
   }
}
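
/* "Spilling" a shared register in this pass just means demoting it to a
 * regular (non-shared) register with a mov and remembering that mov's
 * destination as spill_def. Roughly (register names are illustrative only):
 *
 *    shared_rN.c = ...
 *    ...
 *    rX.y = mov shared_rN.c    // spill_def, tagged IR3_INSTR_SHARED_SPILL
 *    ...                       // shared_rN.c may now be reused
 */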

static void
spill_interval(struct ra_ctx *ctx, struct ra_interval *interval)
{
   struct ir3_instruction *before = interval->interval.reg->instr;

   d("spilling ssa_%u:%u", before->serialno, interval->interval.reg->name);

   if (!interval->spill_def) {
      /* If this is a phi node or input, we need to insert the demotion to a
       * regular register after the last phi or input in the block.
       */
      if (before->opc == OPC_META_PHI || before->opc == OPC_META_INPUT) {
         struct ir3_block *block = before->block;
         struct ir3_instruction *last_phi_input = NULL;
         foreach_instr_from (instr, before, &block->instr_list) {
            if (instr->opc != before->opc)
               break;
            last_phi_input = instr;
         }
         before = last_phi_input;
      }

      struct ir3_instruction *mov = ir3_instr_create(before->block, OPC_MOV, 1, 1);
      mov->flags |= IR3_INSTR_SHARED_SPILL;
      struct ir3_register *dst = __ssa_dst(mov);
      dst->flags |= (interval->interval.reg->flags & IR3_REG_HALF);
      dst->wrmask = interval->interval.reg->wrmask;
      mov->repeat = reg_elems(dst) - 1;
      ir3_src_create(mov, interval->interval.reg->num,
                     IR3_REG_SHARED | (mov->repeat ? IR3_REG_R : 0) |
                     (interval->interval.reg->flags & IR3_REG_HALF))->wrmask =
         interval->interval.reg->wrmask;
      mov->cat1.src_type = mov->cat1.dst_type =
         (interval->interval.reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      ir3_instr_move_after(mov, before);

      interval->spill_def = dst;
   }

   spill_interval_children(interval, interval->spill_def->instr);

   ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
}

/* Try to demote a scalar ALU instruction to a normal ALU instruction, using the
 * spilled sources. We have to take into account restrictions on the number of
 * shared sources that only exist for normal ALU instructions.
 */
static bool
try_demote_instruction(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   /* First, check restrictions. */
   switch (opc_cat(instr->opc)) {
   case 1:
      /* MOVMSK is special and can't be demoted. It also has no sources so must
       * go before the check below.
       */
      if (instr->opc == OPC_MOVMSK)
         return false;

      assert(instr->srcs_count >= 1);
      if (!(instr->srcs[0]->flags & (IR3_REG_CONST | IR3_REG_IMMED)))
         return false;
      break;
   case 2: {
      /* We need one source to either be demotable or an immediate. */
      if (instr->srcs_count > 1) {
         struct ra_interval *src0_interval =
            (instr->srcs[0]->flags & IR3_REG_SSA) ?
            &ctx->intervals[instr->srcs[0]->def->name] : NULL;
         struct ra_interval *src1_interval =
            (instr->srcs[1]->flags & IR3_REG_SSA) ?
            &ctx->intervals[instr->srcs[1]->def->name] : NULL;
         if (!(src0_interval && src0_interval->spill_def) &&
             !(src1_interval && src1_interval->spill_def) &&
             !(instr->srcs[0]->flags & IR3_REG_IMMED) &&
             !(instr->srcs[1]->flags & IR3_REG_IMMED))
            return false;
      }
      break;
   }
   case 3: {
      struct ra_interval *src0_interval =
         (instr->srcs[0]->flags & IR3_REG_SSA) ?
         &ctx->intervals[instr->srcs[0]->def->name] : NULL;
      struct ra_interval *src1_interval =
         (instr->srcs[1]->flags & IR3_REG_SSA) ?
         &ctx->intervals[instr->srcs[1]->def->name] : NULL;

      /* src1 cannot be shared */
      if (src1_interval && !src1_interval->spill_def) {
         /* Try to swap src0 and src1, similar to what copy prop does.
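          * (Swapping is only legal for mad-like opcodes, whose first two
          * sources commute, hence the is_mad() check below.)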
*/ if (!is_mad(instr->opc)) return false; if ((src0_interval && src0_interval->spill_def) || (instr->srcs[0]->flags & IR3_REG_IMMED)) { struct ir3_register *src0 = instr->srcs[0]; instr->srcs[0] = instr->srcs[1]; instr->srcs[1] = src0; } else { return false; } } break; } case 4: { assert(instr->srcs[0]->flags & IR3_REG_SSA); struct ra_interval *src_interval = &ctx->intervals[instr->srcs[0]->def->name]; if (!src_interval->spill_def) return false; break; } default: return false; } d("demoting instruction"); /* If the instruction is already not a scalar ALU instruction, we should've * skipped reloading and just demoted sources directly, so we should never * get here. */ assert(instr->dsts[0]->flags & IR3_REG_SHARED); /* Now we actually demote the instruction */ ra_foreach_src (src, instr) { assert(src->flags & IR3_REG_SHARED); struct ra_interval *interval = &ctx->intervals[src->def->name]; if (interval->spill_def) { src->def = interval->spill_def; src->flags &= ~IR3_REG_SHARED; interval->needs_reload = false; if (interval->interval.inserted) ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval); while (interval->interval.parent) interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); interval->src = false; } } struct ra_interval *dst_interval = &ctx->intervals[instr->dsts[0]->name]; instr->dsts[0]->flags &= ~IR3_REG_SHARED; ra_interval_init(dst_interval, instr->dsts[0]); dst_interval->spill_def = instr->dsts[0]; instr->flags |= IR3_INSTR_SHARED_SPILL; return true; } /* Free up [start, start + size) by spilling live intervals. */ static void free_space(struct ra_ctx *ctx, physreg_t start, unsigned size) { struct ra_interval *interval = ra_ctx_search_right(ctx, start); while (interval && interval->physreg_start < start + size) { struct ra_interval *next = ra_interval_next_or_null(interval); spill_interval(ctx, interval); interval = next; } } static physreg_t get_reg(struct ra_ctx *ctx, struct ir3_register *reg, bool src) { if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) { physreg_t preferred_reg = reg->merge_set->preferred_reg + reg->merge_set_offset; if (preferred_reg < reg_file_size(reg) && preferred_reg % reg_elem_size(reg) == 0 && get_reg_specified(ctx, reg, preferred_reg)) return preferred_reg; } /* If this register is a subset of a merge set which we have not picked a * register for, first try to allocate enough space for the entire merge * set. */ unsigned size = reg_size(reg); if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 && size < reg->merge_set->size) { physreg_t best_reg = find_best_gap(ctx, reg, reg->merge_set->size, reg->merge_set->alignment); if (best_reg != (physreg_t)~0u) { best_reg += reg->merge_set_offset; return best_reg; } } /* For ALU and SFU instructions, if the src reg is avail to pick, use it. 
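    * (That is, we may allocate the destination on top of a shared source that
    * this instruction kills, whose space has just been freed.)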
* Because this doesn't introduce unnecessary dependencies, and it * potentially avoids needing (ss) syncs for write after read hazards for * SFU instructions: */ if (!src && (is_sfu(reg->instr) || is_alu(reg->instr))) { for (unsigned i = 0; i < reg->instr->srcs_count; i++) { struct ir3_register *src = reg->instr->srcs[i]; if (!ra_reg_is_src(src)) continue; if ((src->flags & IR3_REG_SHARED) && reg_size(src) >= size) { struct ra_interval *src_interval = &ctx->intervals[src->def->name]; physreg_t src_physreg = ra_interval_get_physreg(src_interval); if (src_physreg % reg_elem_size(reg) == 0 && src_physreg + size <= reg_file_size(reg) && get_reg_specified(ctx, reg, src_physreg)) return src_physreg; } } } return find_best_gap(ctx, reg, size, reg_elem_size(reg)); } /* The reload process is split in two, first we allocate a register to reload to * for all sources that need a reload and then we actually execute the reload. * This is to allow us to demote shared ALU instructions to non-shared whenever * we would otherwise need to spill to reload, without leaving dangling unused * reload mov's from previously processed sources. So, for example, we could * need to reload both sources of an add, but after reloading the first source * we realize that we would need to spill to reload the second source and we * should demote the add instead, which means cancelling the first reload. */ static void reload_src(struct ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { struct ir3_register *reg = src->def; struct ra_interval *interval = &ctx->intervals[reg->name]; unsigned size = reg_size(reg); physreg_t best_reg = get_reg(ctx, reg, true); if (best_reg == (physreg_t)~0u) { if (try_demote_instruction(ctx, instr)) return; best_reg = find_best_spill_reg(ctx, reg, size, reg_elem_size(reg)); assert(best_reg != (physreg_t)~0u); free_space(ctx, best_reg, size); } d("reload src %u physreg %u", reg->name, best_reg); interval->physreg_start = best_reg; interval->physreg_end = best_reg + size; interval->needs_reload = true; ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); interval->src = true; } static void reload_interval(struct ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_block *block, struct ra_interval *interval) { struct ir3_register *def = interval->interval.reg; struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); mov->flags |= IR3_INSTR_SHARED_SPILL; unsigned flags = IR3_REG_SHARED | (def->flags & IR3_REG_HALF); ir3_dst_create(mov, ra_physreg_to_num(interval->physreg_start, flags), flags)->wrmask = def->wrmask; mov->repeat = reg_elems(def) - 1; struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | (def->flags & IR3_REG_HALF) | (mov->repeat ? IR3_REG_R : 0)); assert(interval->spill_def); mov_src->def = interval->spill_def; mov_src->wrmask = def->wrmask; mov->cat1.src_type = mov->cat1.dst_type = (def->flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; if (instr) ir3_instr_move_before(mov, instr); } static void reload_src_finalize(struct ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { struct ir3_register *reg = src->def; struct ra_interval *interval = &ctx->intervals[reg->name]; if (!interval->needs_reload) return; reload_interval(ctx, instr, instr->block, interval); interval->needs_reload = false; } static bool can_demote_src(struct ir3_instruction *instr) { switch (instr->opc) { case OPC_SCAN_MACRO: case OPC_META_COLLECT: return false; case OPC_MOV: /* non-shared -> shared floating-point conversions and * 8-bit sign extension don't work. */ return (!(instr->dsts[0]->flags & IR3_REG_SHARED) || !((full_type(instr->cat1.src_type) == TYPE_F32 || full_type(instr->cat1.dst_type) == TYPE_F32) || (instr->cat1.src_type == TYPE_U8 && full_type(instr->cat1.dst_type) == TYPE_S32))); default: return (!is_alu(instr) && !is_sfu(instr)) || !(instr->dsts[0]->flags & IR3_REG_SHARED); } } /* Ensure that this source is never spilled while reloading other sources. */ static void mark_src(struct ra_ctx *ctx, struct ir3_register *src) { if (!(src->flags & IR3_REG_SHARED)) return; struct ra_interval *interval = &ctx->intervals[src->def->name]; if (interval->interval.inserted) { while (interval->interval.parent) interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); interval->src = true; } } static void ensure_src_live(struct ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { if (!(src->flags & IR3_REG_SHARED)) return; struct ra_interval *interval = &ctx->intervals[src->def->name]; if (!interval->interval.inserted) { /* In some cases we cannot demote shared reg sources to non-shared regs, * then we have to reload it. */ assert(interval->spill_def); if (!can_demote_src(instr)) { reload_src(ctx, instr, src); } else { if (instr->opc == OPC_META_PARALLEL_COPY) { /* Stash away the original def to use later in case we actually have * to insert a reload. 
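             * lower_pcopy() later looks the stashed def back up to restore the
             * corresponding phi source, so that RA validation still sees the
             * original SSA def.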
             */
            _mesa_hash_table_insert(ctx->pcopy_src_map, src, src->def);
         }
         src->def = interval->spill_def;
         src->flags &= ~IR3_REG_SHARED;
      }
   }
}

static void
assign_src(struct ra_ctx *ctx, struct ir3_register *src)
{
   if (!(src->flags & IR3_REG_SHARED))
      return;

   struct ra_interval *interval = &ctx->intervals[src->def->name];
   assert(interval->interval.inserted);

   src->num = ra_physreg_to_num(ra_interval_get_physreg(interval), src->flags);

   if ((src->flags & IR3_REG_FIRST_KILL) && !interval->interval.parent &&
       rb_tree_is_empty(&interval->interval.children))
      ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);

   while (interval->interval.parent)
      interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
   interval->src = false;
}

static void
handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *dst)
{
   if (!(dst->flags & IR3_REG_SHARED))
      return;

   struct ra_interval *interval = &ctx->intervals[dst->name];
   ra_interval_init(interval, dst);
   interval->spill_def = NULL;

   if (dst->tied) {
      struct ir3_register *tied_def = dst->tied->def;
      struct ra_interval *tied_interval = &ctx->intervals[tied_def->name];
      if ((dst->tied->flags & IR3_REG_KILL) && !tied_interval->interval.parent &&
          rb_tree_is_empty(&tied_interval->interval.children)) {
         dst->num = dst->tied->num;
         interval->physreg_start = tied_interval->physreg_start;
         interval->physreg_end = tied_interval->physreg_end;
         ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
         return;
      }
   }

   physreg_t physreg = get_reg(ctx, dst, false);
   if (physreg == (physreg_t)~0u) {
      if (try_demote_instruction(ctx, instr))
         return;

      unsigned size = reg_size(dst);
      physreg = find_best_spill_reg(ctx, dst, size, reg_elem_size(dst));
      assert(physreg != (physreg_t)~0u);
      free_space(ctx, physreg, size);
   }

   ra_update_affinity(reg_file_size(dst), dst, physreg);
   interval->physreg_start = physreg;
   interval->physreg_end = physreg + reg_size(dst);
   dst->num = ra_physreg_to_num(physreg, dst->flags);
   ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);

   d("insert dst %u physreg %u", dst->name, physreg);

   if (dst->tied) {
      struct ir3_instruction *mov =
         ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY, 1, 1);
      unsigned flags = IR3_REG_SHARED | (dst->flags & IR3_REG_HALF);
      ir3_dst_create(mov, dst->num, flags)->wrmask = dst->wrmask;
      ir3_src_create(mov, dst->tied->num, flags)->wrmask = dst->wrmask;
      mov->cat1.src_type = mov->cat1.dst_type =
         (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      ir3_instr_move_before(mov, instr);
      dst->tied->num = dst->num;
   }
}

static void
handle_src_late(struct ra_ctx *ctx, struct ir3_instruction *instr,
                struct ir3_register *src)
{
   if (!(src->flags & IR3_REG_SHARED))
      return;

   struct ra_interval *interval = &ctx->intervals[src->def->name];

   reload_src_finalize(ctx, instr, src);

   /* Remove killed sources that have to be killed late due to being merged with
    * other defs.
*/ if (!(src->flags & IR3_REG_KILL)) return; if (interval->interval.inserted) ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval); } static void handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) { ra_foreach_src (src, instr) mark_src(ctx, src); ra_foreach_src (src, instr) ensure_src_live(ctx, instr, src); ra_foreach_src_rev (src, instr) assign_src(ctx, src); ra_foreach_dst (dst, instr) handle_dst(ctx, instr, dst); ra_foreach_src (src, instr) handle_src_late(ctx, instr, src); } static void handle_split(struct ra_ctx *ctx, struct ir3_instruction *split) { struct ir3_register *src = split->srcs[0]; struct ir3_register *dst = split->dsts[0]; if (!(dst->flags & IR3_REG_SHARED)) return; if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) { handle_normal_instr(ctx, split); return; } struct ra_interval *src_interval = &ctx->intervals[src->def->name]; struct ra_interval *dst_interval = &ctx->intervals[dst->name]; ra_interval_init(dst_interval, dst); dst_interval->spill_def = NULL; if (src_interval->spill_def) { struct ir3_instruction *spill_split = ir3_instr_create(split->block, OPC_META_SPLIT, 1, 1); struct ir3_register *dst = __ssa_dst(spill_split); struct ir3_register *src = ir3_src_create(spill_split, INVALID_REG, IR3_REG_SSA); src->def = src_interval->spill_def; src->wrmask = src_interval->spill_def->wrmask; spill_split->split.off = split->split.off; ir3_instr_move_after(spill_split, split); dst_interval->spill_def = dst; list_del(&split->node); return; } dst_interval->physreg_start = src_interval->physreg_start + dst->merge_set_offset - src->def->merge_set_offset; dst_interval->physreg_end = dst_interval->physreg_start + reg_size(dst); ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval); src->num = ra_interval_get_num(src_interval); dst->num = ra_interval_get_num(dst_interval); d("insert dst %u physreg %u", dst->name, dst_interval->physreg_start); if (src->flags & IR3_REG_KILL) ir3_reg_interval_remove(&ctx->reg_ctx, &src_interval->interval); } static void handle_phi(struct ra_ctx *ctx, struct ir3_instruction *phi) { struct ir3_register *dst = phi->dsts[0]; if (!(dst->flags & IR3_REG_SHARED)) return; struct ra_interval *dst_interval = &ctx->intervals[dst->name]; ra_interval_init(dst_interval, dst); /* In some rare cases, it's possible to have a phi node with a physical-only * source. Here's a contrived example: * * loop { * if non-uniform { * if uniform { * x_1 = ...; * continue; * } * x_2 = ...; * } else { * break; * } * // continue block * x_3 = phi(x_1, x_2) * } * * Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all * threads that stay in the loop take the same branch to the continue block, * however execution may fall through from the assignment to x_2 to the * break statement because the outer if is non-uniform, and then it will fall * through again to the continue block, so if x_3 is to be in a shared reg * then the phi needs an extra source pointing to the break statement, which * itself needs a phi node: * * loop { * if non-uniform { * if uniform { * x_1 = ...; * continue; * } * x_2 = ...; * } else { * x_4 = phi(undef, x_2) * break; * } * // continue block * x_3 = phi(x_1, x_2, x_4) * } */ /* phi nodes are special because we cannot spill them normally, instead we * have to spill the parallel copies that their sources point to and make the * entire phi not shared anymore. 
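    * When we do that, we simply clear IR3_REG_SHARED on the phi destination and
    * all of its sources (and the defs they point to), and the destination then
    * serves as its own spill_def.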
*/ physreg_t physreg = get_reg(ctx, dst, false); if (physreg == (physreg_t) ~0u) { d("spilling phi destination"); dst->flags &= ~IR3_REG_SHARED; dst_interval->spill_def = dst; phi->flags |= IR3_INSTR_SHARED_SPILL; foreach_src (src, phi) { src->flags &= ~IR3_REG_SHARED; if (src->def) src->def->flags &= ~IR3_REG_SHARED; } return; } dst->num = ra_physreg_to_num(physreg, dst->flags); dst_interval->spill_def = NULL; dst_interval->physreg_start = physreg; dst_interval->physreg_end = physreg + reg_size(dst); ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval); ra_foreach_src_n (src, i, phi) { /* We assume that any phis with non-logical sources aren't promoted. */ assert(i < phi->block->predecessors_count); src->num = dst->num; src->def->num = dst->num; } } static void handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *pcopy) { /* For parallel copies, we only handle the source. The destination is handled * later when processing phi nodes. */ ra_foreach_src (src, pcopy) mark_src(ctx, src); ra_foreach_src (src, pcopy) ensure_src_live(ctx, pcopy, src); ra_foreach_src_rev (src, pcopy) assign_src(ctx, src); ra_foreach_src (src, pcopy) handle_src_late(ctx, pcopy, src); } static void handle_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) { instr->flags &= ~IR3_INSTR_SHARED_SPILL; switch (instr->opc) { case OPC_META_SPLIT: handle_split(ctx, instr); break; case OPC_META_PHI: handle_phi(ctx, instr); break; case OPC_META_PARALLEL_COPY: handle_pcopy(ctx, instr); break; default: handle_normal_instr(ctx, instr); } } /* In case we define a value outside a loop, use it inside the loop, then spill * it afterwards inside the same loop, we could lose the value so we have to * reload it. We have to reload it after any parallel copy instruction, when the * live shared registers equal the live-in of the backedge. lower_pcopy() will * then move any non-shared parallel copies down past the reload. 
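 *
 * Schematically (register names are illustrative only):
 *
 *    shared_rN.c = ...             // defined before the loop
 *    loop {
 *       ... = shared_rN.c          // used inside the loop
 *       rX.y = mov shared_rN.c     // spilled later in the loop body
 *       ...
 *       shared_rN.c = mov rX.y     // reloaded at the end of the block so the
 *    }                             // value is live again across the backedge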
*/ static void reload_live_outs(struct ra_ctx *ctx, struct ir3_block *block) { struct ra_block_state *state = &ctx->blocks[block->index]; unsigned name; BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) { struct ir3_register *reg = ctx->live->definitions[name]; struct ra_interval *interval = &ctx->intervals[name]; if (!interval->interval.inserted) { d("reloading %d at end of backedge", reg->name); reload_interval(ctx, NULL, block, interval); } } } static void record_pred_live_out(struct ra_ctx *ctx, struct ra_interval *interval, struct ir3_block *pred) { struct ra_block_state *state = &ctx->blocks[pred->index]; struct ir3_register *def = interval->interval.reg; BITSET_SET(state->live_out, def->name); rb_tree_foreach (struct ra_interval, child, &interval->interval.children, interval.node) { record_pred_live_out(ctx, child, pred); } } static void record_pred_live_outs(struct ra_ctx *ctx, struct ir3_block *block) { for (unsigned i = 0; i < block->predecessors_count; i++) { struct ir3_block *pred = block->predecessors[i]; struct ra_block_state *state = &ctx->blocks[pred->index]; if (state->visited) continue; state->live_out = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(ctx->live->definitions_count)); rb_tree_foreach (struct ra_interval, interval, &ctx->reg_ctx.intervals, interval.node) { record_pred_live_out(ctx, interval, pred); } } } static void handle_block(struct ra_ctx *ctx, struct ir3_block *block) { ra_ctx_reset_block(ctx); unsigned name; BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], ctx->live->definitions_count) { struct ir3_register *def = ctx->live->definitions[name]; struct ra_interval *interval = &ctx->intervals[name]; /* Non-shared definitions may still be definitions we spilled by demoting * them, so we still need to initialize the interval. But we shouldn't * make these intervals live. */ ra_interval_init(interval, def); if ((def->flags & IR3_REG_SHARED) && !interval->spill_def) { ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); } } if (RA_DEBUG) { d("after live-in block %u:\n", block->index); ra_ctx_dump(ctx); } if (block->predecessors_count > 1) record_pred_live_outs(ctx, block); foreach_instr_safe (instr, &block->instr_list) { di(instr, "processing"); handle_instr(ctx, instr); if (RA_DEBUG) ra_ctx_dump(ctx); } if (block->successors[0]) { struct ra_block_state *state = &ctx->blocks[block->successors[0]->index]; if (state->visited) { assert(!block->successors[1]); reload_live_outs(ctx, block); } } ctx->blocks[block->index].visited = true; } static void lower_pcopy(struct ir3 *ir, struct ra_ctx *ctx) { foreach_block (block, &ir->block_list) { foreach_instr_safe (instr, &block->instr_list) { /* At this point due to spilling there may be parallel copies from * shared to non-shared registers and vice versa. Lowering these after * RA may produce cycles involving shared and non-shared registers, * which would need to be resolved by swapping a shared and non-shared * register which is something we can't handle. However by lowering * these to moves now, we can make sure that cycles only involve * non-shared registers. To avoid illegally moving a shared register * read or write across the parallel copy, which may have other * conflicting reads/writes if there's a cycle, we need to move copies * from non-shared to shared below the shared copies, and we need to * move copies from shared to non-shared above them. So, we have the * following order: * * 1. shared->non-shared copies (spills) * 2. 
shared->shared copies (one parallel copy as there may be cycles) * 3. non-shared->shared copies (reloads) * 4. non-shared->non-shared copies * * We split out the non-shared->non-shared copies as a separate step. */ if (instr->opc == OPC_META_PARALLEL_COPY) { for (unsigned i = 0; i < instr->srcs_count; i++) { if ((instr->srcs[i]->flags & IR3_REG_SHARED) && !(instr->dsts[i]->flags & IR3_REG_SHARED)) { /* shared->non-shared. Create a spill move and rewrite the * source to be the destination of the move (so that the * original shared->non-shared copy becomes a * non-shared->non-shared copy). */ struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); mov->flags |= IR3_INSTR_SHARED_SPILL; struct ir3_register *dst = ir3_dst_create(mov, INVALID_REG, instr->dsts[i]->flags); dst->wrmask = instr->dsts[i]->wrmask; dst->instr = mov; mov->repeat = reg_elems(mov->dsts[0]) - 1; struct ir3_register *src = ir3_src_create(mov, instr->srcs[i]->num, instr->srcs[i]->flags | (mov->repeat ? IR3_REG_R : 0)); src->wrmask = instr->srcs[i]->wrmask; mov->cat1.dst_type = mov->cat1.src_type = (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; instr->srcs[i]->flags = mov->dsts[0]->flags; instr->srcs[i]->def = mov->dsts[0]; ir3_instr_move_before(mov, instr); } } for (unsigned i = 0; i < instr->dsts_count;) { if ((instr->dsts[i]->flags & IR3_REG_SHARED) && (instr->srcs[i]->flags & IR3_REG_SSA) && !(instr->srcs[i]->flags & IR3_REG_SHARED)) { /* non-shared->shared. Create a reload move. */ struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); mov->flags |= IR3_INSTR_SHARED_SPILL; struct ir3_register *dst = ir3_dst_create(mov, instr->dsts[i]->num, instr->dsts[i]->flags); dst->instr = mov; dst->wrmask = instr->dsts[i]->wrmask; mov->repeat = reg_elems(mov->dsts[0]) - 1; struct ir3_register *src = ir3_src_create(mov, INVALID_REG, instr->srcs[i]->flags | (mov->repeat ? IR3_REG_R : 0)); src->def = instr->srcs[i]->def; src->wrmask = instr->srcs[i]->wrmask; mov->cat1.dst_type = mov->cat1.src_type = (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; /* When we spill a parallel copy source, we lose the * information of where it originally points to since we make * it point to the spill def. If we later decide not to also * spill the phi associated with it, we have to restore it * here using the stashed original source so that RA * validation can check that we did the correct thing. * * Because SSA-ness goes away after validation, this is really * just about validation. */ struct ir3_block *succ = block->successors[0]; unsigned pred_idx = ir3_block_get_pred_index(succ, block); foreach_instr (phi, &succ->instr_list) { if (phi->opc != OPC_META_PHI) break; if (phi->srcs[pred_idx]->def == instr->dsts[i]) { struct ir3_register *def = _mesa_hash_table_search(ctx->pcopy_src_map, instr->srcs[i])->data; phi->srcs[pred_idx]->def = def; break; } } instr->srcs[i] = instr->srcs[instr->srcs_count - 1]; instr->dsts[i] = instr->dsts[instr->dsts_count - 1]; instr->srcs_count--; instr->dsts_count--; ir3_instr_move_after(mov, instr); continue; } i++; } /* Move any non-shared copies to a separate parallel copy * instruction right at the end of the block, after any reloads. At * this point all copies should be {shared,immediate}->shared or * {non-shared,immediate}->non-shared. 
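          * After this step each remaining parallel copy works on a single
          * register file, so any copy cycles it contains can be broken with
          * swaps within that file by the later parallel copy lowering.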
*/ unsigned non_shared_copies = 0; for (unsigned i = 0; i < instr->dsts_count; i++) { if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) non_shared_copies++; } if (non_shared_copies != 0) { struct ir3_instruction *pcopy = ir3_instr_create(block, OPC_META_PARALLEL_COPY, non_shared_copies, non_shared_copies); unsigned j = 0; for (unsigned i = 0; i < instr->dsts_count;) { if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) { pcopy->dsts[j] = instr->dsts[i]; pcopy->srcs[j] = instr->srcs[i]; pcopy->dsts[j]->instr = pcopy; instr->srcs[i] = instr->srcs[instr->srcs_count - 1]; instr->dsts[i] = instr->dsts[instr->dsts_count - 1]; instr->srcs_count--; instr->dsts_count--; j++; continue; } i++; } pcopy->srcs_count = pcopy->dsts_count = j; if (instr->dsts_count == 0) list_del(&instr->node); } } } } } static void finalize(struct ir3 *ir) { foreach_block (block, &ir->block_list) { foreach_instr (instr, &block->instr_list) { for (unsigned i = 0; i < instr->dsts_count; i++) { if (instr->dsts[i]->flags & IR3_REG_SHARED) { instr->dsts[i]->flags &= ~IR3_REG_SSA; } } for (unsigned i = 0; i < instr->srcs_count; i++) { if (instr->srcs[i]->flags & IR3_REG_SHARED) { instr->srcs[i]->flags &= ~IR3_REG_SSA; instr->srcs[i]->def = NULL; } } } } } void ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness **live_ptr) { struct ra_ctx ctx; struct ir3_liveness *live = *live_ptr; ra_ctx_init(&ctx); ctx.intervals = rzalloc_array(NULL, struct ra_interval, live->definitions_count); ctx.blocks = rzalloc_array(NULL, struct ra_block_state, live->block_count); ctx.start = 0; ctx.live = live; ctx.pcopy_src_map = _mesa_pointer_hash_table_create(NULL); foreach_block (block, &v->ir->block_list) { handle_block(&ctx, block); } lower_pcopy(v->ir, &ctx); for (unsigned i = 0; i < live->block_count; i++) { if (ctx.blocks[i].live_out) ralloc_free(ctx.blocks[i].live_out); } ralloc_free(ctx.intervals); ralloc_free(ctx.pcopy_src_map); ralloc_free(ctx.blocks); ir3_ra_validate(v, RA_FULL_SIZE, RA_HALF_SIZE, live->block_count, true); finalize(v->ir); /* Recalculate liveness and register pressure now that additional values have * been added. * TODO we should only do this if any values have been spilled/reloaded. * Note: since we don't have to recreate merge sets, we have to manually copy * interval_offset to the new liveness struct. */ unsigned interval_offset = live->interval_offset; void *live_mem_ctx = ralloc_parent(live); ralloc_free(live); *live_ptr = ir3_calc_liveness(live_mem_ctx, v->ir); (*live_ptr)->interval_offset = interval_offset; }