xref: /aosp_15_r20/external/mesa3d/src/asahi/lib/shaders/compression.cl (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1*61046927SAndroid Build Coastguard Worker/*
2*61046927SAndroid Build Coastguard Worker * Copyright 2024 Valve Corporation
3*61046927SAndroid Build Coastguard Worker * SPDX-License-Identifier: MIT
4*61046927SAndroid Build Coastguard Worker */
5*61046927SAndroid Build Coastguard Worker#include "agx_pack.h"
6*61046927SAndroid Build Coastguard Worker#include "compression.h"
7*61046927SAndroid Build Coastguard Worker#include "libagx.h"
8*61046927SAndroid Build Coastguard Worker
9*61046927SAndroid Build Coastguard Worker/*
10*61046927SAndroid Build Coastguard Worker * Decompress in place. The metadata is updated, so other processes can read the
11*61046927SAndroid Build Coastguard Worker * image with a compressed texture descriptor.
12*61046927SAndroid Build Coastguard Worker *
13*61046927SAndroid Build Coastguard Worker * Each workgroup processes one 16x16 tile, avoiding races. We use 32x1
14*61046927SAndroid Build Coastguard Worker * workgroups, matching the warp size, meaning each work-item must process
15*61046927SAndroid Build Coastguard Worker * (16*16)/(32*1) = 8 samples. Matching the warp size eliminates cross-warp
16*61046927SAndroid Build Coastguard Worker * barriers. It also minimizes launched threads, accelerating the early exit.
17*61046927SAndroid Build Coastguard Worker */
18*61046927SAndroid Build Coastguard Worker
/* A bindless handle is represented by our compiler as a uint2: a uniform base
 * followed by an offset in bytes. All the descriptors live in the u0_u1 push,
 * so the base is hardcoded (0) and the offset is simply the offsetof the
 * requested member within struct libagx_decompress_push.
 */
#define HANDLE(member)                                                         \
   ((uint2)(0, offsetof(struct libagx_decompress_push, member)))
24*61046927SAndroid Build Coastguard Worker
/*
 * Compute the index of a tile's entry in the metadata buffer.
 *
 * The metadata buffer is fully twiddled, so the bits of the X/Y coordinates
 * are interleaved. Dimensions are padded to powers-of-two but not to a square:
 * if the width is more than 2x the height or vice versa, the additional bits
 * are linear. We therefore interleave as many low bits as the smaller
 * dimension provides, then add the remaining high bits of the coordinate along
 * the larger dimension. Layers are strided linear and added at the end.
 */
uint
index_metadata(uint3 c, uint width, uint height, uint layer_stride)
{
   /* How many low bits of each coordinate take part in the interleave */
   uint twiddle_bits = libagx_logbase2_ceil(min(width, height));
   uint twiddle_mask = (1 << twiddle_bits) - 1;

   /* Interleaved part, built from the low bits of X and Y */
   uint twiddled = nir_interleave_agx(c.x & twiddle_mask, c.y & twiddle_mask);

   /* Linear part, built from the leftover high bits of the coordinate along
    * the larger dimension, placed above the interleaved bits.
    */
   uint major = (width > height) ? c.x : c.y;
   uint linear = (major & ~twiddle_mask) << twiddle_bits;

   return twiddled + linear + (layer_stride * c.z);
}
45*61046927SAndroid Build Coastguard Worker
46*61046927SAndroid Build Coastguard Worker/*
47*61046927SAndroid Build Coastguard Worker * For multisampled images, a 2x2 or 1x2 group of samples form a single pixel.
48*61046927SAndroid Build Coastguard Worker * The following two helpers convert a coordinate in samples into a coordinate
49*61046927SAndroid Build Coastguard Worker * in pixels and a sample ID, respectively. They each assume that samples > 1.
50*61046927SAndroid Build Coastguard Worker */
/* Convert a coordinate in samples into a coordinate in pixels. Y is always
 * halved; X is halved as well when there are 4 samples. Assumes samples > 1.
 */
int4
decompose_px(int4 c, uint samples)
{
   int4 px = c;

   px.y >>= 1;

   if (samples == 4)
      px.x >>= 1;

   return px;
}
61*61046927SAndroid Build Coastguard Worker
/* Extract the sample ID from a coordinate in samples. The low bit of Y always
 * contributes; with 4 samples the low bit of X becomes bit 0 and the Y bit
 * moves up to bit 1. Assumes samples > 1.
 */
uint
sample_id(int4 c, uint samples)
{
   int id = c.y & 1;

   if (samples == 4)
      id = (c.x & 1) | (id << 1);

   return id;
}
70*61046927SAndroid Build Coastguard Worker
/*
 * Decompress a single 16x16 tile in place and flag it as uncompressed in the
 * metadata buffer.
 *
 * push:     push data holding the metadata pointer/dimensions, the
 *           "uncompressed" metadata value and the image descriptors
 * coord_tl: tile coordinate (x, y in tiles, z = layer)
 * local_id: index of this work-item within the 32x1 workgroup
 * samples:  sample count; 1 selects the non-multisampled image path
 */
void
libagx_decompress(constant struct libagx_decompress_push *push, uint3 coord_tl,
                  uint local_id, uint samples)
{
   /* Locate this tile's entry in the metadata buffer */
   uint meta_idx =
      index_metadata(coord_tl, push->metadata_width_tl,
                     push->metadata_height_tl, push->metadata_layer_stride_tl);

   /* Early exit: a tile already marked uncompressed needs no work. */
   if (push->metadata[meta_idx] == push->tile_uncompressed)
      return;

   /* With a 32x1 workgroup over a 16x16 tile, each work-item owns half of a
    * row: the upper bits of the local ID pick the row, bit 0 picks the half.
    */
   uint row_sa = local_id >> 1;
   uint col_sa = (local_id & 1) * 8;

   /* First sample of our half-row. Tiles are 16x16. */
   int2 start_sa = convert_int2(coord_tl.xy * 16) + (int2)(col_sa, row_sa);
   uint layer = coord_tl.z;
   int4 base_sa = (int4)(start_sa.x, start_sa.y, layer, 0);

   /* Pull the whole half-row into registers before anything is written. */
   uint4 texels[8];
   for (uint i = 0; i < 8; ++i) {
      int4 pos_sa = base_sa + (int4)(i, 0, 0, 0);

      if (samples != 1) {
         texels[i] = nir_bindless_image_load_ms_array(
            HANDLE(compressed), decompose_px(pos_sa, samples),
            sample_id(pos_sa, samples));
      } else {
         texels[i] = nir_bindless_image_load_array(HANDLE(compressed), pos_sa);
      }
   }

   sub_group_barrier(CLK_LOCAL_MEM_FENCE);

   /* The whole tile has been read at this point, so writing it back cannot
    * race with the reads.
    */
   for (uint i = 0; i < 8; ++i) {
      int4 pos_sa = base_sa + (int4)(i, 0, 0, 0);

      if (samples != 1) {
         nir_bindless_image_store_ms_array(
            HANDLE(uncompressed), decompose_px(pos_sa, samples),
            sample_id(pos_sa, samples), texels[i]);
      } else {
         nir_bindless_image_store_array(HANDLE(uncompressed), pos_sa,
                                        texels[i]);
      }
   }

   /* The body buffer has been replaced; a single work-item records the tile
    * as uncompressed.
    */
   if (local_id == 0)
      push->metadata[meta_idx] = push->tile_uncompressed;
}
127