/*
 * Copyright © 2019 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#if MESA_DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif

#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         mesa_logi("PSCHED: " fmt, ##__VA_ARGS__);                             \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         struct log_stream *stream = mesa_log_streami();                       \
         mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__);   \
         ir3_print_instr_stream(stream, instr);                                \
         mesa_log_stream_destroy(stream);                                      \
      }                                                                        \
   } while (0)

#define SCHED_DEBUG_DUMP_DEPTH 1

/*
 * Post RA Instruction Scheduling
 */

struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   unsigned ip;

   int ss_delay;
   int sy_delay;
};

struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   unsigned earliest_ip;

   bool has_sy_src, has_ss_src;

   unsigned max_delay;
};

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

static bool
has_sy_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sy_src;
}

static bool
has_ss_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_ss_src;
}

#ifndef NDEBUG
static void
sched_dag_validate_cb(const struct dag_node *node, void *data)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;

   ir3_print_instr(n->instr);
}
#endif

static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   assert(ctx->block == instr->block);

   /* remove from unscheduled_list: */
   list_delinit(&instr->node);

   di(instr, "schedule");

   bool counts_for_delay = is_alu(instr) || is_flow(instr);
   unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;

   struct ir3_postsched_node *n = instr->data;

   /* We insert any nop's needed to get to earliest_ip, then advance
    * delay_cycles by scheduling the instruction.
    */
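   /* For example (hypothetical values): with ctx->ip == 10,
    * n->earliest_ip == 12, and a non-repeated ALU instruction
    * (delay_cycles == 1), we account for two nops to reach ip 12 and
    * land at ip 13; a consumer reached through an edge carrying delay 3
    * then has its earliest_ip raised to at least 16.
    */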
   ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      unsigned delay = (unsigned)(uintptr_t)edge->data;
      struct ir3_postsched_node *child =
         container_of(edge->child, struct ir3_postsched_node, dag);
      child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
   }

   list_addtail(&instr->node, &instr->block->instr_list);

   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_ss_producer(instr)) {
      ctx->ss_delay = soft_ss_delay(instr);
   } else if (has_ss_src(instr)) {
      ctx->ss_delay = 0;
   } else if (ctx->ss_delay > 0) {
      ctx->ss_delay--;
   }

   if (is_sy_producer(instr)) {
      ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
   } else if (has_sy_src(instr)) {
      ctx->sy_delay = 0;
   } else if (ctx->sy_delay > 0) {
      ctx->sy_delay--;
   }
}

static unsigned
node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
}

static unsigned
node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   unsigned delay = node_delay(ctx, n);

   /* This takes into account that when we schedule multiple tex or sfu
    * instructions, the first user has to wait for all of them to complete.
    */
   if (n->has_ss_src)
      delay = MAX2(delay, ctx->ss_delay);
   if (n->has_sy_src)
      delay = MAX2(delay, ctx->sy_delay);

   return delay;
}

static void
dump_node(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n,
          int level)
{
   if (level > SCHED_DEBUG_DUMP_DEPTH)
      return;

   di(n->instr, "%*s%smaxdel=%d, node_delay=%d, node_delay_soft=%d, %d parents ",
      level * 2, "", (level > 0 ? "-> " : ""), n->max_delay,
      node_delay(ctx, n), node_delay_soft(ctx, n), n->dag.parent_count);

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      dump_node(ctx, child, level + 1);
   }
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      dump_node(ctx, n, 0);
   }
}
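/* The chooser below tries candidate DAG heads in decreasing priority:
 * meta instructions, then inputs, then hard-ready kills, then soft-ready
 * (ss)/(sy) producers, then anything whose soft delay is within a few
 * nops (preferring lower delay, breaking ties on max_delay), and finally
 * the leader with the maximum max_delay.
 */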
/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay(ctx, n);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 0)
         continue;

      if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch, which can be synchronized w/ the sync
    * bit, but where we probably do want to schedule some other instructions
    * while we wait).  We also allow a small number of nops, to prefer
    * now-nops over future-nops up to a point, as that gives better results.
    */
   unsigned chosen_delay = 0;
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 3)
         continue;

      if (!chosen || d < chosen_delay) {
         chosen = n;
         chosen_delay = d;
         continue;
      }

      if (d > chosen_delay)
         continue;

      if (chosen->max_delay < n->max_delay) {
         chosen = n;
         chosen_delay = d;
      }
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost: */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   enum { F, R } direction;

   bool merged;

   /* Track the mapping between sched node (instruction) that last
    * wrote a given register (in whichever direction we are iterating
    * the block).
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether the
    * half and full precision register files are "merged" (conflicting,
    * ie. a6xx+), in which case we use "regs" for both full and half
    * precision dependencies and consider each full precision dep as two
    * half-precision deps, vs the older separate (non-conflicting) files,
    * in which case the separate "half_regs" table is used for
    * half-precision deps.  See ir3_reg_file_offset().
    */
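   /* E.g. on a merged (a6xx+) file, full r1.x occupies the same storage
    * as hr2.x/hr2.y, so a full-precision access is tracked as two
    * consecutive half-reg slots in "regs" (assumed mapping; the exact
    * layout comes from ir3_reg_file_offset()).
    */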
   struct ir3_postsched_node *regs[2 * GPR_REG_SIZE];
   unsigned dst_n[2 * GPR_REG_SIZE];
   struct ir3_postsched_node *half_regs[GPR_REG_SIZE];
   unsigned half_dst_n[GPR_REG_SIZE];
   struct ir3_postsched_node *shared_regs[2 * SHARED_REG_SIZE];
   unsigned shared_dst_n[2 * SHARED_REG_SIZE];
   struct ir3_postsched_node *nongpr_regs[2 * NONGPR_REG_SIZE];
   unsigned nongpr_dst_n[2 * NONGPR_REG_SIZE];
};

static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after,
        unsigned d)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge_max_data(&before->dag, &after->dag, (uintptr_t)d);
   } else {
      dag_add_edge_max_data(&after->dag, &before->dag, 0);
   }
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node,
                   struct ir3_postsched_node **dep_ptr, unsigned *dst_n_ptr,
                   unsigned num, int src_n, int dst_n)
{
   struct ir3_postsched_node *dep = *dep_ptr;

   unsigned d = 0;
   if (src_n >= 0 && dep && state->direction == F) {
      struct ir3_compiler *compiler = state->ctx->ir->compiler;
      /* get the dst_n this corresponds to */
      unsigned dst_n = *dst_n_ptr;
      d = ir3_delayslots_with_repeat(compiler, dep->instr, node->instr, dst_n,
                                     src_n);
      if (is_sy_producer(dep->instr))
         node->has_sy_src = true;
      if (needs_ss(compiler, dep->instr, node->instr))
         node->has_ss_src = true;
   }

   if (src_n >= 0 && dep && state->direction == R) {
      /* If node generates a WAR hazard (because it doesn't consume its
       * sources immediately), dep needs (ss) to sync its dest.  Even though
       * this isn't an (ss) source (but rather a dest), the effect is exactly
       * the same, so we model it as such.
       */
      if (is_war_hazard_producer(node->instr)) {
         dep->has_ss_src = true;
      }
   }

   add_dep(state, dep, node, d);
   if (src_n < 0) {
      *dep_ptr = node;
      *dst_n_ptr = dst_n;
   }
}

/* This is where we handle full vs half-precision, and the potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, then this adds a dependency on a source register,
 * and src_n is the index passed into ir3_delayslots() for calculating the
 * delay: it corresponds to node->instr->srcs[src_n].  If src_n is negative,
 * then this is for the destination register corresponding to dst_n.
 */
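/* For instance (hypothetical instruction), for "add.f r0.x, r1.y, r2.z",
 * calculate_deps() records the read of r1.y with src_n == 0 and the read
 * of r2.z with src_n == 1, while the write of r0.x is recorded with
 * src_n == -1 and dst_n == 0.
 */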
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n, int dst_n)
{
   struct ir3_postsched_node **regs;
   unsigned *dst_n_ptr;
   enum ir3_reg_file file;
   unsigned size = reg_elem_size(reg);
   unsigned offset = ir3_reg_file_offset(reg, num, state->merged, &file);
   switch (file) {
   case IR3_FILE_FULL:
      assert(offset + size <= ARRAY_SIZE(state->regs));
      regs = state->regs;
      dst_n_ptr = state->dst_n;
      break;
   case IR3_FILE_HALF:
      assert(offset + 1 <= ARRAY_SIZE(state->half_regs));
      regs = state->half_regs;
      dst_n_ptr = state->half_dst_n;
      break;
   case IR3_FILE_SHARED:
      assert(offset + size <= ARRAY_SIZE(state->shared_regs));
      regs = state->shared_regs;
      dst_n_ptr = state->shared_dst_n;
      break;
   case IR3_FILE_NONGPR:
      assert(offset + size <= ARRAY_SIZE(state->nongpr_regs));
      regs = state->nongpr_regs;
      dst_n_ptr = state->nongpr_dst_n;
      break;
   }

   for (unsigned i = 0; i < size; i++)
      add_single_reg_dep(state, node, &regs[offset + i],
                         &dst_n_ptr[offset + i], num, src_n, dst_n);
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i, -1);
         }
      }
   }

   /* And then update the state for what this instruction wrote: */
   foreach_dst_n (reg, i, node->instr) {
      if (reg->wrmask == 0)
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, -1, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1, i);
         }
      }
   }
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}
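/* max_delay approximates the remaining critical-path length from a node,
 * including the soft (ss)/(sy) wait cost its consumers would pay.  For
 * example (hypothetical values): a node with one child at max_delay 5
 * over an edge with delay 2, and another child at max_delay 8 over an
 * edge with delay 0, ends up with max_delay == MAX2(5 + 2, 8 + 0) == 8.
 */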
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   struct ir3_postsched_ctx *ctx = state;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      unsigned delay = edge->data;
      unsigned sy_delay = 0;
      unsigned ss_delay = 0;

      if (child->has_sy_src && is_sy_producer(n->instr)) {
         sy_delay = soft_sy_delay(n->instr, ctx->block->shader);
      }

      if (child->has_ss_src &&
          needs_ss(ctx->v->compiler, n->instr, child->instr)) {
         ss_delay = soft_ss_delay(n->instr);
      }

      delay = MAX3(delay, sy_delay, ss_delay);
      max_delay = MAX2(child->max_delay + delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay);
}

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To avoid expensive texture fetches, etc, from being moved ahead
    * of kills, track the kills we've seen so far, so we can add an
    * extra dependency on them for tex/mem instructions:
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry.  Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge_max_data(&sn->dag, &n->dag, 0);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge_max_data(&in->dag, &n->dag, 0);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge_max_data(&kn->dag, &n->dag, 0);
         }
      }
   }

#ifndef NDEBUG
   dag_validate(ctx->dag, sched_dag_validate_cb, NULL);
#endif

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, ctx);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}
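/* Schedule a single block: strip the terminator and pre-existing nops,
 * build the DAG, schedule register-loading instructions (meta:input,
 * tex prefetch, push-const loads) up front, then greedily drain the
 * ready list via choose_instr().
 */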
static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->sy_delay = 0;
   ctx->ss_delay = 0;

   /* The terminator has to stay at the end.  Instead of trying to set up
    * dependencies to achieve this, it's easier to just remove it now and add
    * it back after scheduling.
    */
   struct ir3_instruction *terminator = ir3_block_take_terminator(block);

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting):
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * an FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch).
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay = node_delay(ctx, instr->data);
      d("delay=%u", delay);

      assert(delay <= 6);

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);

   if (terminator)
      list_addtail(&terminator->node, &block->instr_list);
}

static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* Sometimes we end up with in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of places where, before RA, we are not sure that it is
 * safe to eliminate.  We could eliminate these earlier, but sometimes
 * they are tangled up in false-deps, etc, so it is easier just to
 * let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}