xref: /aosp_15_r20/external/mesa3d/src/asahi/lib/shaders/compression.cl (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1/*
2 * Copyright 2024 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 */
5#include "agx_pack.h"
6#include "compression.h"
7#include "libagx.h"
8
/*
 * Decompress in place. The metadata is updated, so other processes can read the
 * image with a compressed texture descriptor.
 *
 * Each workgroup processes one 16x16 tile, avoiding races. We use 32x1
 * workgroups, matching the warp size, meaning each work-item must process
 * (16*16)/(32*1) = 8 samples. Matching the warp size eliminates cross-warp
 * barriers. It also minimizes launched threads, accelerating the early exit.
 */
18
/*
 * Our compiler represents a bindless handle as a uint2 of a uniform base and an
 * offset in bytes. Since the descriptors are all in the u0_u1 push, the former
 * is hardcoded to 0 and the latter is the offsetof the named descriptor field
 * within struct libagx_decompress_push.
 *
 * The expansion is wrapped in parentheses so the vector literal stays a single
 * operand wherever the macro is used.
 */
#define HANDLE(field)                                                          \
   ((uint2)(0, offsetof(struct libagx_decompress_push, field)))
24
25/*
26 * The metadata buffer is fully twiddled, so interleave the X/Y coordinate bits.
27 * While dimensions are padded to powers-of-two, they are not padded to a
28 * square. If the width is more than 2x the height or vice versa, the additional
29 * bits are linear. So we interleave as much as possible, and then add what's
30 * remaining. Finally, layers are strided linear and added at the end.
31 */
32uint
33index_metadata(uint3 c, uint width, uint height, uint layer_stride)
34{
35   uint major_coord = width > height ? c.x : c.y;
36   uint minor_dim = min(width, height);
37
38   uint intl_bits = libagx_logbase2_ceil(minor_dim);
39   uint intl_mask = (1 << intl_bits) - 1;
40   uint2 intl_coords = c.xy & intl_mask;
41
42   return nir_interleave_agx(intl_coords.x, intl_coords.y) +
43          ((major_coord & ~intl_mask) << intl_bits) + (layer_stride * c.z);
44}
45
46/*
47 * For multisampled images, a 2x2 or 1x2 group of samples form a single pixel.
48 * The following two helpers convert a coordinate in samples into a coordinate
49 * in pixels and a sample ID, respectively. They each assume that samples > 1.
50 */
51int4
52decompose_px(int4 c, uint samples)
53{
54   if (samples == 4)
55      c.xy >>= 1;
56   else
57      c.y >>= 1;
58
59   return c;
60}
61
62uint
63sample_id(int4 c, uint samples)
64{
65   if (samples == 4)
66      return (c.x & 1) | ((c.y & 1) << 1);
67   else
68      return c.y & 1;
69}
70
71void
72libagx_decompress(constant struct libagx_decompress_push *push, uint3 coord_tl,
73                  uint local_id, uint samples)
74{
75   /* Index into the metadata buffer */
76   uint index_tl =
77      index_metadata(coord_tl, push->metadata_width_tl,
78                     push->metadata_height_tl, push->metadata_layer_stride_tl);
79
80   /* If the tile is already uncompressed, there's nothing to do. */
81   if (push->metadata[index_tl] == push->tile_uncompressed)
82      return;
83
84   /* Tiles are 16x16 */
85   uint2 coord_sa = (coord_tl.xy * 16);
86   uint layer = coord_tl.z;
87
88   /* Since we use a 32x1 workgroup, each work-item handles half of a row. */
89   uint offs_y_sa = local_id >> 1;
90   uint offs_x_sa = (local_id & 1) ? 8 : 0;
91
92   int2 img_coord_sa_2d = convert_int2(coord_sa) + (int2)(offs_x_sa, offs_y_sa);
93   int4 img_coord_sa = (int4)(img_coord_sa_2d.x, img_coord_sa_2d.y, layer, 0);
94
95   /* Read our half-row into registers. */
96   uint4 texels[8];
97   for (uint i = 0; i < 8; ++i) {
98      int4 c_sa = img_coord_sa + (int4)(i, 0, 0, 0);
99      if (samples == 1) {
100         texels[i] = nir_bindless_image_load_array(HANDLE(compressed), c_sa);
101      } else {
102         int4 dec_px = decompose_px(c_sa, samples);
103         texels[i] = nir_bindless_image_load_ms_array(
104            HANDLE(compressed), dec_px, sample_id(c_sa, samples));
105      }
106   }
107
108   sub_group_barrier(CLK_LOCAL_MEM_FENCE);
109
110   /* Now that the whole tile is read, we write without racing. */
111   for (uint i = 0; i < 8; ++i) {
112      int4 c_sa = img_coord_sa + (int4)(i, 0, 0, 0);
113      if (samples == 1) {
114         nir_bindless_image_store_array(HANDLE(uncompressed), c_sa, texels[i]);
115      } else {
116         int4 dec_px = decompose_px(c_sa, samples);
117         nir_bindless_image_store_ms_array(HANDLE(uncompressed), dec_px,
118                                           sample_id(c_sa, samples), texels[i]);
119      }
120   }
121
122   /* We've replaced the body buffer. Mark the tile as uncompressed. */
123   if (local_id == 0) {
124      push->metadata[index_tl] = push->tile_uncompressed;
125   }
126}
127