1/* 2 * Copyright 2024 Valve Corporation 3 * SPDX-License-Identifier: MIT 4 */ 5#include "agx_pack.h" 6#include "compression.h" 7#include "libagx.h" 8 9/* 10 * Decompress in place. The metadata is updated, so other processes can read the 11 * image with a compressed texture descriptor. 12 * 13 * Each workgroup processes one 16x16 tile, avoiding races. We use 32x1 14 * workgroups, matching the warp size, meaning each work-item must process 15 * (16*16)/(32*1) = 8 sampels. Matching the warp size eliminates cross-warp 16 * barriers. It also minimizes launched threads, accelerating the early exit. 17 */ 18 19/* Our compiler represents a bindless handle as a uint2 of a uniform base and an 20 * offset in bytes. Since the descriptors are all in the u0_u1 push, the former 21 * is hardcoded and the latter is an offsetof. 22 */ 23#define HANDLE(field) (uint2)(0, offsetof(struct libagx_decompress_push, field)) 24 25/* 26 * The metadata buffer is fully twiddled, so interleave the X/Y coordinate bits. 27 * While dimensions are padded to powers-of-two, they are not padded to a 28 * square. If the width is more than 2x the height or vice versa, the additional 29 * bits are linear. So we interleave as much as possible, and then add what's 30 * remaining. Finally, layers are strided linear and added at the end. 31 */ 32uint 33index_metadata(uint3 c, uint width, uint height, uint layer_stride) 34{ 35 uint major_coord = width > height ? c.x : c.y; 36 uint minor_dim = min(width, height); 37 38 uint intl_bits = libagx_logbase2_ceil(minor_dim); 39 uint intl_mask = (1 << intl_bits) - 1; 40 uint2 intl_coords = c.xy & intl_mask; 41 42 return nir_interleave_agx(intl_coords.x, intl_coords.y) + 43 ((major_coord & ~intl_mask) << intl_bits) + (layer_stride * c.z); 44} 45 46/* 47 * For multisampled images, a 2x2 or 1x2 group of samples form a single pixel. 48 * The following two helpers convert a coordinate in samples into a coordinate 49 * in pixels and a sample ID, respectively. They each assume that samples > 1. 50 */ 51int4 52decompose_px(int4 c, uint samples) 53{ 54 if (samples == 4) 55 c.xy >>= 1; 56 else 57 c.y >>= 1; 58 59 return c; 60} 61 62uint 63sample_id(int4 c, uint samples) 64{ 65 if (samples == 4) 66 return (c.x & 1) | ((c.y & 1) << 1); 67 else 68 return c.y & 1; 69} 70 71void 72libagx_decompress(constant struct libagx_decompress_push *push, uint3 coord_tl, 73 uint local_id, uint samples) 74{ 75 /* Index into the metadata buffer */ 76 uint index_tl = 77 index_metadata(coord_tl, push->metadata_width_tl, 78 push->metadata_height_tl, push->metadata_layer_stride_tl); 79 80 /* If the tile is already uncompressed, there's nothing to do. */ 81 if (push->metadata[index_tl] == push->tile_uncompressed) 82 return; 83 84 /* Tiles are 16x16 */ 85 uint2 coord_sa = (coord_tl.xy * 16); 86 uint layer = coord_tl.z; 87 88 /* Since we use a 32x1 workgroup, each work-item handles half of a row. */ 89 uint offs_y_sa = local_id >> 1; 90 uint offs_x_sa = (local_id & 1) ? 8 : 0; 91 92 int2 img_coord_sa_2d = convert_int2(coord_sa) + (int2)(offs_x_sa, offs_y_sa); 93 int4 img_coord_sa = (int4)(img_coord_sa_2d.x, img_coord_sa_2d.y, layer, 0); 94 95 /* Read our half-row into registers. */ 96 uint4 texels[8]; 97 for (uint i = 0; i < 8; ++i) { 98 int4 c_sa = img_coord_sa + (int4)(i, 0, 0, 0); 99 if (samples == 1) { 100 texels[i] = nir_bindless_image_load_array(HANDLE(compressed), c_sa); 101 } else { 102 int4 dec_px = decompose_px(c_sa, samples); 103 texels[i] = nir_bindless_image_load_ms_array( 104 HANDLE(compressed), dec_px, sample_id(c_sa, samples)); 105 } 106 } 107 108 sub_group_barrier(CLK_LOCAL_MEM_FENCE); 109 110 /* Now that the whole tile is read, we write without racing. */ 111 for (uint i = 0; i < 8; ++i) { 112 int4 c_sa = img_coord_sa + (int4)(i, 0, 0, 0); 113 if (samples == 1) { 114 nir_bindless_image_store_array(HANDLE(uncompressed), c_sa, texels[i]); 115 } else { 116 int4 dec_px = decompose_px(c_sa, samples); 117 nir_bindless_image_store_ms_array(HANDLE(uncompressed), dec_px, 118 sample_id(c_sa, samples), texels[i]); 119 } 120 } 121 122 /* We've replaced the body buffer. Mark the tile as uncompressed. */ 123 if (local_id == 0) { 124 push->metadata[index_tl] = push->tile_uncompressed; 125 } 126} 127