/*
 * Copyright 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */
#include "agx_pack.h"
#include "compression.h"
#include "libagx.h"

/*
 * Decompress in place. The metadata is updated, so other processes can read the
 * image with a compressed texture descriptor.
 *
 * Each workgroup processes one 16x16 tile, avoiding races. We use 32x1
 * workgroups, matching the warp size, meaning each work-item must process
 * (16*16)/(32*1) = 8 samples. Matching the warp size eliminates cross-warp
 * barriers. It also minimizes launched threads, accelerating the early exit.
 */

/* Our compiler represents a bindless handle as a uint2 of a uniform base and an
 * offset in bytes. Since the descriptors are all in the u0_u1 push, the former
 * is hardcoded and the latter is an offsetof.
*/
#define HANDLE(field) (uint2)(0, offsetof(struct libagx_decompress_push, field))

/*
 * Compute the index of a tile's entry in the twiddled metadata buffer.
 *
 * c holds the tile coordinate (x, y) and the layer (z); width and height are
 * the metadata dimensions in tiles, and layer_stride is the distance between
 * consecutive layers, in metadata entries.
 *
 * The X/Y coordinate bits are interleaved. Dimensions are padded to
 * powers-of-two but not to a square, so when one axis is more than 2x the
 * other, the extra high bits of the longer axis are linear: we interleave as
 * many bits as the shorter axis provides and add what remains on top. Layers
 * are strided linearly and added last.
 */
uint
index_metadata(uint3 c, uint width, uint height, uint layer_stride)
{
   /* The shorter axis bounds how many bits of each coordinate interleave. */
   uint shared_bits = libagx_logbase2_ceil(min(width, height));
   uint shared_mask = (1 << shared_bits) - 1;

   /* Low bits of X and Y are twiddled together... */
   uint twiddled = nir_interleave_agx(c.x & shared_mask, c.y & shared_mask);

   /* ...while the leftover high bits of the longer axis are placed above the
    * interleaved bits, untouched.
    */
   uint long_axis = (width > height) ? c.x : c.y;
   uint linear = (long_axis & ~shared_mask) << shared_bits;

   return twiddled + linear + (layer_stride * c.z);
}

/*
 * For multisampled images, a 2x2 or 1x2 group of samples form a single pixel.
 * The following two helpers convert a coordinate in samples into a coordinate
 * in pixels and a sample ID, respectively. They each assume that samples > 1.
*/
int4
decompose_px(int4 c, uint samples)
{
   /* Anything other than 4 samples is the 1x2 grouping: only Y is halved. */
   if (samples != 4) {
      c.y >>= 1;
      return c;
   }

   /* 4 samples form a 2x2 group, so both axes are halved. */
   c.xy >>= 1;
   return c;
}

uint
sample_id(int4 c, uint samples)
{
   /* The sample ID is built from the low coordinate bit(s) that decompose_px
    * shifts out: Y alone for the 1x2 grouping, X and Y for the 2x2 grouping.
    */
   uint y_bit = c.y & 1;

   return (samples == 4) ? ((c.x & 1) | (y_bit << 1)) : y_bit;
}

/*
 * Decompress one 16x16 tile of the image in place.
 *
 * push:     descriptors and metadata parameters for the image
 * coord_tl: tile coordinate in (x, y) and layer in z
 * local_id: index of this work-item within its 32x1 workgroup
 * samples:  sample count of the image (1 for non-multisampled)
 */
void
libagx_decompress(constant struct libagx_decompress_push *push, uint3 coord_tl,
                  uint local_id, uint samples)
{
   /* Locate this tile's entry in the metadata buffer. */
   uint meta_idx =
      index_metadata(coord_tl, push->metadata_width_tl,
                     push->metadata_height_tl, push->metadata_layer_stride_tl);

   /* Early exit: a tile already marked uncompressed needs no work. */
   if (push->metadata[meta_idx] == push->tile_uncompressed)
      return;

   /* Tiles are 16x16 samples. With a 32x1 workgroup, every work-item owns half
    * of a row: the row is local_id / 2, and odd IDs take the right half.
    */
   int2 tile_origin_sa = convert_int2(coord_tl.xy * 16);
   int2 item_offs_sa = (int2)((local_id & 1) ? 8 : 0, local_id >> 1);
   int2 start_sa = tile_origin_sa + item_offs_sa;
   int4 first_sa = (int4)(start_sa.x, start_sa.y, coord_tl.z, 0);

   /* Pull our half-row into registers through the compressed descriptor. */
   uint4 half_row[8];
   for (uint s = 0; s < 8; ++s) {
      int4 pos_sa = first_sa + (int4)(s, 0, 0, 0);

      if (samples != 1) {
         int4 pos_px = decompose_px(pos_sa, samples);
         half_row[s] = nir_bindless_image_load_ms_array(
            HANDLE(compressed), pos_px, sample_id(pos_sa, samples));
      } else {
         half_row[s] = nir_bindless_image_load_array(HANDLE(compressed), pos_sa);
      }
   }

   sub_group_barrier(CLK_LOCAL_MEM_FENCE);

   /* The whole tile has been read by now, so writing it back through the
    * uncompressed descriptor cannot race with the reads above.
    */
   for (uint s = 0; s < 8; ++s) {
      int4 pos_sa = first_sa + (int4)(s, 0, 0, 0);

      if (samples != 1) {
         int4 pos_px = decompose_px(pos_sa, samples);
         nir_bindless_image_store_ms_array(HANDLE(uncompressed), pos_px,
                                           sample_id(pos_sa, samples),
                                           half_row[s]);
      } else {
         nir_bindless_image_store_array(HANDLE(uncompressed), pos_sa,
                                        half_row[s]);
      }
   }

   /* The body buffer now holds uncompressed data; a single work-item records
    * that in the metadata.
    */
   if (local_id == 0) {
      push->metadata[meta_idx] = push->tile_uncompressed;
   }
}