/* * Copyright 2014 Intel Corporation * Copyright 2023 Advanced Micro Devices, Inc. * * SPDX-License-Identifier: MIT */ /* This implements dominance and post-dominance of the SSA use graph where * instructions are vertices and SSA uses are edges (i.e. edges go from * each instruction to all its uses). CF nodes are ignored and irrelevant. * It's different from nir_dominance.c, but the algorithm is the same, which * is from "A Simple, Fast Dominance Algorithm" by Cooper, Harvey, and Kennedy. * * Definitions: * - Instruction A is post-dominated by instruction B if the result of * instruction A and following intermediate results using the result of * instruction A only affect the result of instruction B. Consequently, * if instruction B was removed, instruction A would become dead including * all instructions computing the intermediate results. * Example: A(load) -> ... -> B(ALU) * Note: This is the foundation of inter-shader code motion from later * shaders to earlier shaders. * - Instruction B is dominated by instruction A if all use paths from * all loads to instruction B must go through instruction A. * Note: Unlike post-dominance, dominance is unusable as-is because * the immediate dominator typically doesn't exist if there are non-unary * opcodes (i.e. branches of an expression tree following source operands * don't usually converge to a single instruction unless all instructions * are unary). The solution is to ignore loads like load_const to allow * non-unary opcodes, which would be the foundation of inter-shader code * motion from earlier shaders to later shaders, such as 2 output stores * having only 1 ALU instruction as their only source at the beginning, * ignoring constant and uniform operands along the way. * * Interesting cases implied by this (post-)dominator tree: * - load_const, loads without src operands, and undef are not dominated by * anything because they don't have any src operands. * - No instruction post-dominates store intrinsics (and all other intrinsics * without a destination) and nir_if nodes (they use a value but don't * produce any). * * Typical application: * - The immediate post-dominator query returns the solution to the problem of * how much code we can move into the previous shader or preamble without * increasing the number of inputs. Example of an SSA-use graph and * the possible result that a user of this utility can produce: * * input0 input1 input0 input1 * \ / \ | \ * constant alu ... ------> | ... * \ / * alu * (immediate post-dominator of input0) * * Examples of possible applications: * - Moving load_input+ALU to the previous shader: An immediate post-dominator * of load_input and all instructions between load_input and the immediate * post-dominator are a candidate for being moved into the previous shader * and we only need to check if the post-dominator is movable. Repeat * the immediate post-dominator query on the accepted post-dominator and see * if that is also movable. Repeat that until you find the farthest post- * dominator that is movable. * - Moving load_uniform+ALU to a preamble shader or the CPU: An immediate * post-dominator of load_uniform is a candidate for being moved into * the preamble shader or the CPU. Repeat the immediate post-dominator query * until you find the farthest post-dominator that is movable. * - Replacing a value used to compute 2 shader outputs by only 1 output, and * moving the computation into the next shader: * The Lowest Common Ancestor of 2 output stores within the dominator tree * is a candidate for the new replacement output. Any loads that are * trivially movable such as load_const should be ignored by this utility, * otherwise the Lowest Common Ancestor wouldn't exist. * * Queries: * - get the immediate dominator of an instruction * - get the Lowest Common Ancestor of 2 instructions * - whether one instruction dominates another * * Implemenation details: * - Since some instructions are not dominated by anything, a dummy root is * added into the graph that dominates such instructions, which is required * by the algorithm. * * TODO: only post-dominance implemented, not dominance */ #include "nir.h" struct nir_use_dom_node { nir_instr *instr; uint32_t index; /* The index of this node's immediate dominator in the dominator tree. * The dummy root points it to itself. -1 == unset. */ int32_t imm_dom; }; struct nir_use_dominance_state { nir_function_impl *impl; struct nir_use_dom_node *dom_nodes; unsigned num_dom_nodes; }; static struct nir_use_dom_node * get_node(struct nir_use_dominance_state *state, nir_instr *instr) { return &state->dom_nodes[instr->index]; } static struct nir_use_dom_node * get_imm_dom(struct nir_use_dominance_state *state, struct nir_use_dom_node *node) { assert(node->imm_dom != -1); return &state->dom_nodes[node->imm_dom]; } static bool init_instr(struct nir_use_dominance_state *state, nir_instr *instr, unsigned *index) { assert(*index < state->num_dom_nodes); struct nir_use_dom_node *node = &state->dom_nodes[*index]; if (*index == 0) { /* dummy root */ node->imm_dom = 0; } else { node->imm_dom = -1; node->instr = instr; instr->index = node->index = *index; } (*index)++; return true; } static struct nir_use_dom_node * intersect(struct nir_use_dominance_state *state, struct nir_use_dom_node *i1, struct nir_use_dom_node *i2) { while (i1 != i2) { /* Note, the comparisons here are the opposite of what the paper says * because we index instrs from beginning -> end (i.e. reverse * post-order) instead of post-order like they assume. */ while (i1->index > i2->index) i1 = get_imm_dom(state, i1); while (i2->index > i1->index) i2 = get_imm_dom(state, i2); } return i1; } static void update_imm_dom(struct nir_use_dominance_state *state, struct nir_use_dom_node *pred, struct nir_use_dom_node **new_idom) { if (pred->imm_dom != -1) { if (*new_idom) *new_idom = intersect(state, pred, *new_idom); else *new_idom = pred; } } static bool calc_dominance(struct nir_use_dominance_state *state, struct nir_use_dom_node *node, bool post_dominance) { struct nir_use_dom_node *new_idom = NULL; if (post_dominance) { nir_def *def = nir_instr_def(node->instr); bool has_use = false; /* Intrinsics that can't be reordered will get the root node as * the post-dominator. */ if (def && (node->instr->type != nir_instr_type_intrinsic || nir_intrinsic_can_reorder(nir_instr_as_intrinsic(node->instr)))) { nir_foreach_use_including_if(src, def) { has_use = true; /* Ifs are treated like stores because they don't produce * a value. dom_nodes[0] is the dummy root. */ if (nir_src_is_if(src)) { update_imm_dom(state, &state->dom_nodes[0], &new_idom); /* Short-cut because we can't come back from the root node. */ break; } else { update_imm_dom(state, get_node(state, nir_src_parent_instr(src)), &new_idom); } } } /* No destination (e.g. stores, atomics with an unused result, discard, * dead instructions). dom_nodes[0] is the dummy root. */ if (!has_use) update_imm_dom(state, &state->dom_nodes[0], &new_idom); } else { unreachable("TODO: only post-dominance implemented, not dominance"); } if (new_idom && node->imm_dom != new_idom->index) { node->imm_dom = new_idom->index; return true; } return false; } /** * Calculate dominance or post-dominance of the SSA use graph. * The returned state must not be freed while dominance queries are being used. * ralloc_free() frees the state. * * It clobbers nir_instr::index, which can't be changed while dominance queries * are being used. * * \param impl NIR function * \param post_dominance Whether to compute post-dominance or dominance. */ struct nir_use_dominance_state * nir_calc_use_dominance_impl(nir_function_impl *impl, bool post_dominance) { struct nir_use_dominance_state *state = rzalloc(NULL, struct nir_use_dominance_state); if (!state) return NULL; unsigned num_dom_nodes = 1; /* including the dummy root */ nir_foreach_block(block, impl) { num_dom_nodes += exec_list_length(&block->instr_list); } state->impl = impl; state->num_dom_nodes = num_dom_nodes; state->dom_nodes = rzalloc_array(state, struct nir_use_dom_node, num_dom_nodes); if (!state->dom_nodes) { ralloc_free(state); return NULL; } unsigned index = 0; /* We need a dummy root node because there are instructions such as * load_const that aren't dominated by anything. If we are calculating * post-dominance, intrinsics without a destination aren't post-dominated * by anything. However, the algorithm requires a common (post-)dominator. */ init_instr(state, NULL, &index); /* Post-dominance is identical to dominance, but instructions are added * in the opposite order. */ if (post_dominance) { nir_foreach_block_reverse(block, impl) { nir_foreach_instr_reverse(instr, block) { init_instr(state, instr, &index); } } } else { nir_foreach_block(block, impl) { nir_foreach_instr(instr, block) { init_instr(state, instr, &index); } } } bool progress = true; while (progress) { progress = false; /* Skip the dummy root (iterate from 1). */ for (unsigned i = 1; i < num_dom_nodes; i++) { progress |= calc_dominance(state, &state->dom_nodes[i], post_dominance); } } return state; } nir_instr * nir_get_immediate_use_dominator(struct nir_use_dominance_state *state, nir_instr *instr) { struct nir_use_dom_node *node = get_node(state, instr); return get_imm_dom(state, node)->instr; } /** * Computes the least common ancestor of two instructions. */ nir_instr * nir_use_dominance_lca(struct nir_use_dominance_state *state, nir_instr *i1, nir_instr *i2) { assert(i1 && i2); struct nir_use_dom_node *lca = intersect(state, get_node(state, i1), get_node(state, i2)); assert(lca); /* Might be NULL in case of the dummy root. */ return lca->instr; } /** * Returns true if the parent dominates the child in the SSA use graph * described at the beginning. */ bool nir_instr_dominates_use(struct nir_use_dominance_state *state, nir_instr *parent_instr, nir_instr *child_instr) { struct nir_use_dom_node *parent = get_node(state, parent_instr); struct nir_use_dom_node *child = get_node(state, child_instr); while (parent->index < child->index) child = get_imm_dom(state, child); return parent == child; } static void print_instr(struct nir_use_dom_node *node) { if (!node) printf("NULL - bug"); else if (node->index == 0) printf("dummy_root"); else nir_print_instr(node->instr, stdout); } void nir_print_use_dominators(struct nir_use_dominance_state *state, nir_instr **instructions, unsigned num_instructions) { for (unsigned i = 0; i < num_instructions; i++) { printf("Input idom(\""); nir_print_instr(instructions[i], stdout); printf("\") = \""); print_instr(get_imm_dom(state, get_node(state, instructions[i]))); printf("\"\n"); } puts(""); nir_foreach_block(block, state->impl) { nir_foreach_instr(instr, block) { printf("idom(\""); nir_print_instr(instr, stdout); printf("\") = \""); print_instr(get_imm_dom(state, get_node(state, instr))); printf("\"\n"); } } puts(""); for (unsigned i = 0; i < num_instructions; i++) { for (unsigned j = i + 1; j < num_instructions; j++) { printf("LCA input 1: "); nir_print_instr(instructions[i], stdout); printf("\nLCA input 2: "); nir_print_instr(instructions[j], stdout); puts(""); nir_instr *lca = nir_use_dominance_lca(state, instructions[i], instructions[j]); if (lca) { printf("2 inputs have a common post-dominator: "); nir_print_instr(lca, stdout); printf("\n"); } puts(""); } } }