/*
 * Copyright © 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* This is a block-level load instruction scheduler that groups loads
 * according to their indirection level within a basic block. An indirection
 * is when the result of one load is used as a source of another load. The
 * result is that disjoint groups of ALU instructions and groups of load
 * (texture) instructions are created, where each successive load group is the
 * next level of indirection. This is done by finding the first and last load
 * with the same indirection level, and moving all unrelated instructions
 * between them after the last load, except for load sources, which are moved
 * before the first load. It naturally suits hardware that has limits on
 * texture indirections, but other hardware can benefit too. Only texture,
 * image, and SSBO load and atomic instructions are grouped.
 *
 * There is an option to group only those loads that use the same resource
 * variable. This increases the chance of getting more cache hits than if the
 * loads were spread out.
 *
 * The increased register usage is offset by the increase in observed memory
 * bandwidth due to more cache hits (depending on hw behavior), which
 * decreases the subgroup lifetime and allows registers to be deallocated and
 * reused sooner. In some bandwidth-bound cases, low register usage doesn't
 * help performance at all. Doubling the register usage and using those
 * registers to amplify observed bandwidth can improve performance a lot.
 *
 * It's recommended to run a hw-specific instruction scheduler after this to
 * prevent spilling.
 */
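/* Illustrative usage sketch (not part of the pass): a driver would typically
 * run this pass late in its NIR optimization pipeline, before its backend
 * instruction scheduler. The grouping mode and max_distance below are made-up
 * example values; suitable values are hardware-specific.
 *
 *    nir_group_loads(nir, nir_group_same_resource_only, 32);
 */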
#include "nir.h"

static bool
is_memory_load(nir_instr *instr)
{
   /* Count texture_size too because it has the same latency as cache hits. */
   if (instr->type == nir_instr_type_tex)
      return true;

   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      const char *name = nir_intrinsic_infos[intr->intrinsic].name;

      /* TODO: nir_intrinsics.py could do this */
      /* load_ubo is ignored because it's usually cheap. */
      if (!nir_intrinsic_writes_external_memory(intr) &&
          !strstr(name, "shared") &&
          (strstr(name, "ssbo") || strstr(name, "image")))
         return true;
   }

   return false;
}

static nir_instr *
get_intrinsic_resource(nir_intrinsic_instr *intr)
{
   /* This is also the list of intrinsics that are grouped.
    * load_ubo is ignored because it's usually cheap.
    */
   switch (intr->intrinsic) {
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_deref_load:
   case nir_intrinsic_image_sparse_load:
   case nir_intrinsic_image_deref_sparse_load:
   /* Group image_size too because it has the same latency as cache hits. */
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_image_deref_samples_identical:
   case nir_intrinsic_bindless_image_samples_identical:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_bindless_image_sparse_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_image_fragment_mask_load_amd:
   case nir_intrinsic_image_deref_fragment_mask_load_amd:
   case nir_intrinsic_bindless_image_fragment_mask_load_amd:
      return intr->src[0].ssa->parent_instr;
   default:
      return NULL;
   }
}

/* Track only those that we want to group. */
static bool
is_grouped_load(nir_instr *instr)
{
   /* Count texture_size too because it has the same latency as cache hits. */
   if (instr->type == nir_instr_type_tex)
      return true;

   if (instr->type == nir_instr_type_intrinsic)
      return get_intrinsic_resource(nir_instr_as_intrinsic(instr)) != NULL;

   return false;
}

static bool
can_move(nir_instr *instr, uint8_t current_indirection_level)
{
   /* Grouping is done by moving everything else out of the first/last
    * instruction range of the indirection level.
    */
   if (is_grouped_load(instr) && instr->pass_flags == current_indirection_level)
      return false;

   if (instr->type == nir_instr_type_alu ||
       instr->type == nir_instr_type_deref ||
       instr->type == nir_instr_type_tex ||
       instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_undef)
      return true;

   if (instr->type == nir_instr_type_intrinsic &&
       nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
      return true;

   return false;
}

static nir_instr *
get_uniform_inst_resource(nir_instr *instr)
{
   if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);

      if (tex->texture_non_uniform)
         return NULL;

      for (unsigned i = 0; i < tex->num_srcs; i++) {
         switch (tex->src[i].src_type) {
         case nir_tex_src_texture_deref:
         case nir_tex_src_texture_handle:
            return tex->src[i].src.ssa->parent_instr;
         default:
            break;
         }
      }
      return NULL;
   }

   if (instr->type == nir_instr_type_intrinsic)
      return get_intrinsic_resource(nir_instr_as_intrinsic(instr));

   return NULL;
}

struct check_sources_state {
   nir_block *block;
   uint32_t first_index;
};

static bool
has_only_sources_less_than(nir_src *src, void *data)
{
   struct check_sources_state *state = (struct check_sources_state *)data;

   /* true if nir_foreach_src should keep going */
   return state->block != src->ssa->parent_instr->block ||
          src->ssa->parent_instr->index < state->first_index;
}

static void
group_loads(nir_instr *first, nir_instr *last)
{
   /* Walk the instruction range between the first and last backward, and
    * move those that have no uses within the range after the last one.
    */
   for (nir_instr *instr = exec_node_data_backward(nir_instr, last->node.prev, node);
        instr != first;
        instr = exec_node_data_backward(nir_instr, instr->node.prev, node)) {
      /* Only move instructions without side effects. */
      if (!can_move(instr, first->pass_flags))
         continue;

      nir_def *def = nir_instr_def(instr);
      if (def) {
         bool all_uses_after_last = true;

         nir_foreach_use(use, def) {
            if (nir_src_parent_instr(use)->block == instr->block &&
                nir_src_parent_instr(use)->index <= last->index) {
               all_uses_after_last = false;
               break;
            }
         }

         if (all_uses_after_last) {
            nir_instr *move_instr = instr;
            /* Step to the adjacent instruction first because the current one
             * is about to be moved.
             */
            instr = exec_node_data_forward(nir_instr, instr->node.next, node);

            /* Move the instruction after the last and update its index
             * to indicate that it's after it.
             */
            nir_instr_move(nir_after_instr(last), move_instr);
            move_instr->index = last->index + 1;
         }
      }
   }

   struct check_sources_state state;
   state.block = first->block;
   state.first_index = first->index;

   /* Walk the instruction range between the first and last forward, and move
    * those that have no sources within the range before the first one.
    */
   for (nir_instr *instr = exec_node_data_forward(nir_instr, first->node.next, node);
        instr != last;
        instr = exec_node_data_forward(nir_instr, instr->node.next, node)) {
      /* Only move instructions without side effects. */
      if (!can_move(instr, first->pass_flags))
         continue;

      if (nir_foreach_src(instr, has_only_sources_less_than, &state)) {
         nir_instr *move_instr = instr;
         /* Step to the adjacent instruction first because the current one
          * is about to be moved.
          */
         instr = exec_node_data_backward(nir_instr, instr->node.prev, node);

         /* Move the instruction before the first and update its index
          * to indicate that it's before it.
          */
         nir_instr_move(nir_before_instr(first), move_instr);
         move_instr->index = first->index - 1;
      }
   }
}
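/* For illustration only (hypothetical NIR-like snippet, not produced by this
 * file): given loads A and B at the same indirection level with an unrelated
 * ALU instruction between them,
 *
 *    A = tex ...
 *    x = fadd y, z        ; unrelated to A and B
 *    B = tex ...
 *    use x
 *
 * group_loads(A, B) moves the unrelated fadd after B, because its only use is
 * after B; instructions that only feed the loads' sources would instead be
 * moved before A. Either way, A and B end up back to back.
 */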
static bool
is_pseudo_inst(nir_instr *instr)
{
   /* Other instructions do not usually contribute to the shader binary size. */
   return instr->type != nir_instr_type_alu &&
          instr->type != nir_instr_type_call &&
          instr->type != nir_instr_type_tex &&
          instr->type != nir_instr_type_intrinsic;
}

static void
set_instr_indices(nir_block *block)
{
   /* Start with 1 because we'll move instructions before the first one
    * and will want to label them 0.
    */
   unsigned counter = 1;
   nir_instr *last = NULL;

   nir_foreach_instr(instr, block) {
      /* Make sure grouped instructions don't have the same index as pseudo
       * instructions.
       */
      if (last && is_pseudo_inst(last) && is_grouped_load(instr))
         counter++;

      /* Set each instruction's index within the block. */
      instr->index = counter;

      /* Only count non-pseudo instructions. */
      if (!is_pseudo_inst(instr))
         counter++;

      last = instr;
   }
}

/* Group the current range [*first, *last] and reset it, either when the flush
 * is forced (current == NULL) or when "current" is farther than max_distance
 * from the first load.
 */
static void
handle_load_range(nir_instr **first, nir_instr **last,
                  nir_instr *current, unsigned max_distance)
{
   if (*first && *last &&
       (!current || current->index > (*first)->index + max_distance)) {
      assert(*first != *last);
      group_loads(*first, *last);
      set_instr_indices((*first)->block);
      *first = NULL;
      *last = NULL;
   }
}

static bool
is_barrier(nir_instr *instr)
{
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      const char *name = nir_intrinsic_infos[intr->intrinsic].name;

      if (intr->intrinsic == nir_intrinsic_terminate ||
          intr->intrinsic == nir_intrinsic_terminate_if ||
          /* TODO: nir_intrinsics.py could do this */
          strstr(name, "barrier"))
         return true;
   }

   return false;
}

struct indirection_state {
   nir_block *block;
   unsigned indirections;
};

static unsigned get_num_indirections(nir_instr *instr);

static bool
gather_indirections(nir_src *src, void *data)
{
   struct indirection_state *state = (struct indirection_state *)data;
   nir_instr *instr = src->ssa->parent_instr;

   /* We only count indirections within the same block. */
   if (instr->block == state->block) {
      unsigned indirections = get_num_indirections(src->ssa->parent_instr);

      if (instr->type == nir_instr_type_tex || is_memory_load(instr))
         indirections++;

      state->indirections = MAX2(state->indirections, indirections);
   }

   return true; /* whether nir_foreach_src should keep going */
}

/* Return the number of load indirections within the block. */
static unsigned
get_num_indirections(nir_instr *instr)
{
   /* Don't traverse phis because we could end up in an infinite recursion
    * if the phi points to the current block (such as a loop body).
    */
   if (instr->type == nir_instr_type_phi)
      return 0;

   if (instr->index != UINT32_MAX)
      return instr->index; /* we've visited this instruction before */

   struct indirection_state state;
   state.block = instr->block;
   state.indirections = 0;

   nir_foreach_src(instr, gather_indirections, &state);

   instr->index = state.indirections;
   return state.indirections;
}
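/* Hypothetical example of the resulting indirection levels (for illustration
 * only; the operand layout of the loads is made up):
 *
 *    a = load_ssbo ...          ; level 0
 *    b = iadd a, 16             ; ALU, does not add a level by itself
 *    c = load_ssbo ..., b       ; level 1 (consumes the result of load a)
 *    d = load_ssbo ..., c       ; level 2
 */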
static void
process_block(nir_block *block, nir_load_grouping grouping,
              unsigned max_distance)
{
   int max_indirection = -1;
   unsigned num_inst_per_level[256] = { 0 };

   /* UINT32_MAX means the instruction has not been visited. Once
    * an instruction has been visited and its indirection level has been
    * determined, we'll store the indirection level in the index. The next
    * instruction that visits it will use the index instead of recomputing
    * the indirection level, which would result in an exponential time
    * complexity.
    */
   nir_foreach_instr(instr, block) {
      instr->index = UINT32_MAX; /* unknown */
   }

   /* Count the number of load indirections for each load instruction
    * within this block. Store it in pass_flags.
    */
   nir_foreach_instr(instr, block) {
      if (is_grouped_load(instr)) {
         unsigned indirections = get_num_indirections(instr);

         /* pass_flags has only 8 bits */
         indirections = MIN2(indirections, 255);
         num_inst_per_level[indirections]++;
         instr->pass_flags = indirections;

         max_indirection = MAX2(max_indirection, (int)indirections);
      }
   }

   /* 255 contains all indirection levels >= 255, so ignore them. */
   max_indirection = MIN2(max_indirection, 254);

   /* Each indirection level is grouped. */
   for (int level = 0; level <= max_indirection; level++) {
      if (num_inst_per_level[level] <= 1)
         continue;

      set_instr_indices(block);

      nir_instr *resource = NULL;
      nir_instr *first_load = NULL, *last_load = NULL;

      /* Find the first and last instruction that use the same
       * resource and are within a certain distance of each other.
       * If found, group them by moving all movable instructions
       * between them out.
       */
      nir_foreach_instr(current, block) {
         /* Don't group across barriers. */
         if (is_barrier(current)) {
            /* Group unconditionally. */
            handle_load_range(&first_load, &last_load, NULL, 0);
            first_load = NULL;
            last_load = NULL;
            continue;
         }

         /* Only group load instructions with the same indirection level. */
         if (is_grouped_load(current) && current->pass_flags == level) {
            nir_instr *current_resource;

            switch (grouping) {
            case nir_group_all:
               if (!first_load)
                  first_load = current;
               else
                  last_load = current;
               break;

            case nir_group_same_resource_only:
               current_resource = get_uniform_inst_resource(current);

               if (current_resource) {
                  if (!first_load) {
                     first_load = current;
                     resource = current_resource;
                  } else if (current_resource == resource) {
                     last_load = current;
                  }
               }
               break;
            }
         }

         /* Group only if we exceeded the maximum distance. */
         handle_load_range(&first_load, &last_load, current, max_distance);
      }

      /* Group unconditionally. */
      handle_load_range(&first_load, &last_load, NULL, 0);
   }
}

/* max_distance is the maximum distance between the first and last instruction
 * in a group.
 */
void
nir_group_loads(nir_shader *shader, nir_load_grouping grouping,
                unsigned max_distance)
{
   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         process_block(block, grouping, max_distance);
      }

      nir_metadata_preserve(impl, nir_metadata_control_flow |
                                  nir_metadata_loop_analysis);
   }
}