/*
 * Copyright 2023 Asahi Lina
 * SPDX-License-Identifier: MIT
 */
#include "agx_scratch.h"
#include "asahi/compiler/agx_compile.h"
#include "shaders/helper.h"
#include "util/u_hexdump.h"
#include "agx_bo.h"
#include "libagx_shaders.h"
#include "nir.h"
#include "nir_builder_opcodes.h"

#define AGX_ADDR_SHIFT        8
#define AGX_THREADS_PER_GROUP 32
#define AGX_SPILL_UNIT_DWORDS 8

// FIXME: What is the actual value here? Seems to be 96 + 8 or so?
#define AGX_MAX_SUBGROUPS_PER_CORE 128

// Unknown if this goes higher.
#define AGX_MAX_SCRATCH_BLOCK_LOG4 6
#define AGX_MAX_SCRATCH_DWORDS                                                 \
   ((AGX_SPILL_UNIT_DWORDS << (2 * AGX_MAX_SCRATCH_BLOCK_LOG4)) * 4)

/*
 * Scratch size expressed as `count` blocks, each of
 * (AGX_SPILL_UNIT_DWORDS << (2 * log4_bsize)) dwords.
 */
struct spill_size {
   uint32_t log4_bsize;
   uint32_t count;
};

/*
 * Allocate a buffer object and copy the prebuilt helper program
 * (libagx_g13_helper) into it.
 *
 * Returns the new BO. With AGX_DBG_SCRATCH set, its GPU address is logged to
 * stderr.
 */
struct agx_bo *
agx_build_helper(struct agx_device *dev)
{
   struct agx_bo *bo = agx_bo_create(
      dev, sizeof(libagx_g13_helper), 0,
      AGX_BO_READONLY | AGX_BO_EXEC | AGX_BO_LOW_VA, "Helper shader");
   assert(bo);
   memcpy(bo->map, libagx_g13_helper, sizeof(libagx_g13_helper));

   if (dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Helper: 0x%" PRIx64 "\n", bo->va->addr);

   return bo;
}

/*
 * Convert a scratch size in dwords into a (block size, block count) pair.
 *
 * The block size is AGX_SPILL_UNIT_DWORDS << (2 * log4_bsize) dwords and the
 * count is in 1..4 (or 0:0 when dwords is zero). The returned log4_bsize
 * never exceeds AGX_MAX_SCRATCH_BLOCK_LOG4.
 */
static struct spill_size
agx_scratch_get_spill_size(unsigned dwords)
{
   if (!dwords) {
      return (struct spill_size){0, 0};
   }
   assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");

   unsigned log4 =
      util_logbase2(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)) / 2;
   unsigned blocks = DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS << (2 * log4));
   if (log4 > AGX_MAX_SCRATCH_BLOCK_LOG4) {
      // Max size case (4 blocks)
      assert(log4 == (AGX_MAX_SCRATCH_BLOCK_LOG4 + 1));
      log4--;
      blocks = 4;
   } else if (blocks == 4 && log4 < AGX_MAX_SCRATCH_BLOCK_LOG4) {
      // Non max size 4 block case, shift to next log4 unit for consistency.
      // Only done below the maximum block size: at the maximum we keep four
      // blocks, exactly like the max size case above, instead of producing a
      // block size larger than AGX_MAX_SCRATCH_BLOCK_LOG4.
      log4++;
      blocks = 1;
   }

   return (struct spill_size){log4, blocks};
}

/*
 * Map a scratch size in dwords to a logging bucket index in
 * [0, AGX_SPILL_SIZE_BUCKETS - 1]. Zero dwords maps to bucket 0.
 */
unsigned
agx_scratch_get_bucket(uint32_t dwords)
{
   /* For debugging/analysis purposes, scratch allocation sizes are
    * divided into buckets. Since we only allocate a single global
    * worst-case scratch buffer, these buckets do not have any meaning
    * for the actual allocation mechanism. They are only used to log
    * allocation sizes. We just use a simple log2 of the size here.
    */

   if (!dwords)
      return 0;
   assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");

   return MIN2(
      AGX_SPILL_SIZE_BUCKETS - 1,
      1 + util_logbase2_ceil(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)));
}

/*
 * (Re)create the scratch buffer for the current scratch->size_dwords and
 * scratch->subgroups, dropping the reference to any previous buffer.
 *
 * Buffer layout, in order:
 *   - struct agx_helper_header
 *   - one struct agx_helper_block per subgroup, for each of num_cores cores
 *   - block storage, starting at an offset aligned to the block size
 *
 * scratch->size_dwords is rounded up to the size actually allocated per
 * thread (block size times block count).
 */
static void
agx_scratch_realloc(struct agx_scratch *scratch)
{
   if (scratch->buf)
      agx_bo_unreference(scratch->dev, scratch->buf);

   struct spill_size size = agx_scratch_get_spill_size(scratch->size_dwords);

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Scratch realloc: %d (%d:%d) x %d\n",
              scratch->size_dwords, size.log4_bsize, size.count,
              scratch->subgroups);

   unsigned block_dwords = AGX_SPILL_UNIT_DWORDS << (2 * size.log4_bsize);
   size_t block_size_bytes = (AGX_THREADS_PER_GROUP * 4) * block_dwords;
   scratch->size_dwords = block_dwords * size.count;

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Block size: 0x%zx bytes (%d)\n", block_size_bytes,
              size.log4_bsize);

   unsigned block_count = size.count;

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Block count: %d\n", block_count);

   size_t core_alloc = block_size_bytes * block_count * scratch->subgroups;

   size_t header_size = sizeof(struct agx_helper_header);

   size_t blocklist_off = header_size;
   size_t blocklist_core_size =
      scratch->subgroups * sizeof(struct agx_helper_block);
   size_t blocklist_size = blocklist_core_size * scratch->num_cores;

   size_t blocks_off = align(header_size + blocklist_size, block_size_bytes);
   size_t total_alloc = blocks_off + core_alloc * scratch->num_cores;

   unsigned flags = 0;
#ifdef SCRATCH_DEBUG
   flags = AGX_BO_WRITEBACK;
#endif

   scratch->buf = agx_bo_create(scratch->dev, total_alloc, block_size_bytes,
                                flags, "Scratch");
   /* Same allocation check as agx_build_helper; the map is written below. */
   assert(scratch->buf);

   /* Only the header and block lists are cleared, not the block storage. */
   memset(scratch->buf->map, 0, blocks_off);

   struct agx_helper_header *hdr = scratch->buf->map;
   scratch->header = hdr;

   uint64_t blocklist_gpu = scratch->buf->va->addr + blocklist_off;
   struct agx_helper_block *blocklist_cpu = scratch->buf->map + blocklist_off;

#ifdef SCRATCH_DEBUG
   scratch->blocklist = blocklist_cpu;
   scratch->data = scratch->buf->map + blocks_off;
   scratch->core_size = block_size_bytes * block_count * scratch->subgroups;
#endif

   uint64_t blocks_gpu = scratch->buf->va->addr + blocks_off;

   hdr->subgroups = scratch->subgroups;

   unsigned num_cores = 0;
   unsigned core_id;
   for (core_id = 0; core_id < AGX_MAX_CORE_ID; core_id++) {
#ifndef SCRATCH_DEBUG_CORES
      /* Core IDs are split into cluster/core using the per-cluster core count
       * rounded up to a power of two; cores that are out of range or not set
       * in the cluster's core mask get no block list.
       */
      unsigned cores_per_cluster =
         util_next_power_of_two(scratch->dev->params.num_cores_per_cluster);
      unsigned cluster = core_id / cores_per_cluster;
      unsigned core = core_id % cores_per_cluster;
      if (cluster >= scratch->dev->params.num_clusters_total)
         break;
      if (core >= scratch->dev->params.num_cores_per_cluster ||
          !(scratch->dev->params.core_masks[cluster] & BITFIELD_BIT(core)))
         continue;
#endif
      num_cores++;
#ifdef SCRATCH_DEBUG
      scratch->core_present[core_id] = true;
#endif

      hdr->cores[core_id].blocklist = blocklist_gpu;

      for (unsigned sg = 0; sg < scratch->subgroups; sg++) {
         /* Entry 0 is the shifted block address OR'd with a mask of
          * (log4_bsize + 1) low bits; entries 1..3 are the shifted address of
          * each further block OR'd with 1, or 0 when that block is unused.
          */
         uint32_t mask = BITFIELD_MASK(size.log4_bsize + 1);
         assert(!(blocks_gpu & (block_size_bytes - 1)));

         uint32_t base = blocks_gpu >> AGX_ADDR_SHIFT;
         uint32_t stride = block_size_bytes >> AGX_ADDR_SHIFT;
         blocklist_cpu[sg].blocks[0] = mask | base;
         for (int block = 1; block <= 3; block++) {
            if (block_count >= (block + 1))
               blocklist_cpu[sg].blocks[block] = 1 | (base + block * stride);
            else
               blocklist_cpu[sg].blocks[block] = 0;
         }

         blocks_gpu += block_size_bytes * block_count;
      }

      blocklist_gpu += sizeof(struct agx_helper_block) * scratch->subgroups;
      blocklist_cpu += scratch->subgroups;
   }
   scratch->max_core_id = core_id;
   assert(num_cores == scratch->num_cores);

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "New Scratch @ 0x%" PRIx64 " (size: 0x%zx)\n",
              scratch->buf->va->addr, scratch->buf->size);
}

/*
 * Ensure the scratch buffer provides at least `dwords` of scratch for at
 * least `subgroups` subgroups, reallocating only when a requirement grows.
 *
 * dwords == 0 is a no-op. subgroups == 0 selects
 * AGX_MAX_SUBGROUPS_PER_CORE, and larger values are clamped to it.
 */
void
agx_scratch_alloc(struct agx_scratch *scratch, unsigned dwords,
                  size_t subgroups)
{
   bool realloc = false;

   if (!dwords)
      return;

   assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");

   if (!subgroups)
      subgroups = AGX_MAX_SUBGROUPS_PER_CORE;

   subgroups = MIN2(AGX_MAX_SUBGROUPS_PER_CORE, subgroups);

   if (dwords > scratch->size_dwords) {
      scratch->size_dwords = dwords;
      realloc = true;
   }

   if (subgroups > scratch->subgroups) {
      scratch->subgroups = subgroups;
      realloc = true;
   }

   if (realloc) {
      agx_scratch_realloc(scratch);
   }
}

/*
 * Reset the per-core allocation statistics in the scratch header. Asserts
 * that each core's alloc_cur is zero. Does nothing if no scratch buffer
 * exists.
 */
void
agx_scratch_debug_pre(struct agx_scratch *scratch)
{
   if (!scratch->buf)
      return;

   for (int core = 0; core < scratch->max_core_id; core++) {
      assert(!scratch->header->cores[core].alloc_cur);
      scratch->header->cores[core].alloc_max = 0;
      scratch->header->cores[core].alloc_failed = 0;
      memset(scratch->header->cores[core].alloc_count, 0,
             sizeof(scratch->header->cores[core].alloc_count));
   }
}

/*
 * Print the per-core allocation statistics from the scratch header to stderr
 * and assert that each core's alloc_cur and alloc_failed are zero. With
 * SCRATCH_DEBUG defined, also hexdump each present core's scratch data. Does
 * nothing if no scratch buffer exists.
 */
void
agx_scratch_debug_post(struct agx_scratch *scratch)
{
   if (!scratch->buf)
      return;

   fprintf(stderr, "Scratch @ 0x%" PRIx64 "\n", scratch->buf->va->addr);

   for (int core = 0; core < scratch->max_core_id; core++) {
      fprintf(stderr, "Core %3d: max %d, failed %d, counts:", core,
              scratch->header->cores[core].alloc_max,
              scratch->header->cores[core].alloc_failed);

      for (unsigned bucket = 0; bucket < AGX_SPILL_SIZE_BUCKETS; bucket++) {
         /* Label each bucket with a dword size: 0 for bucket 0, otherwise
          * AGX_SPILL_UNIT_DWORDS << (bucket - 1).
          */
         fprintf(stderr, " %d:%-3d",
                 bucket ? (AGX_SPILL_UNIT_DWORDS << (bucket - 1)) : 0,
                 scratch->header->cores[core].alloc_count[bucket]);
      }
      fprintf(stderr, "\n");
      assert(!scratch->header->cores[core].alloc_cur);
      assert(!scratch->header->cores[core].alloc_failed);
   }

#ifdef SCRATCH_DEBUG
   /* Per-core data is packed densely over present cores only, hence the
    * separate running index.
    */
   unsigned core_index = 0;
   for (int core = 0; core < scratch->max_core_id; core++) {
      if (!scratch->core_present[core])
         continue;
      void *p = scratch->data + scratch->core_size * core_index++;
      /* NOTE(review): %lx assumes core_size is unsigned long; if the field is
       * size_t this should be %zx - confirm against agx_scratch.h.
       */
      fprintf(stderr, "\nCORE %d (0x%lx)\n", core, scratch->core_size);
      u_hexdump(stderr, p, scratch->core_size, true);
   }
#endif
}

/*
 * Zero-initialize `scratch`, bind it to `dev` and compute num_cores: either
 * SCRATCH_DEBUG_CORES when defined, or the number of bits set across the
 * device's per-cluster core masks. No buffer is allocated here.
 */
void
agx_scratch_init(struct agx_device *dev, struct agx_scratch *scratch)
{
   memset(scratch, 0, sizeof(*scratch));

   scratch->dev = dev;
#ifdef SCRATCH_DEBUG_CORES
   scratch->num_cores = SCRATCH_DEBUG_CORES;
#else
   scratch->num_cores = 0;
   for (unsigned cl = 0; cl < dev->params.num_clusters_total; cl++) {
      scratch->num_cores += util_bitcount(dev->params.core_masks[cl]);
   }
#endif
}

/*
 * Drop the reference to the scratch buffer, if any, and clear the pointer.
 */
void
agx_scratch_fini(struct agx_scratch *scratch)
{
   if (scratch->buf)
      agx_bo_unreference(scratch->dev, scratch->buf);
   scratch->buf = NULL;
}