xref: /aosp_15_r20/external/mesa3d/src/compiler/glsl/bc1.glsl (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1/*
2 * Copyright 2020-2022 Matias N. Goldberg
3 * Copyright 2022 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24#version 310 es
25
26#if defined(GL_ES) && GL_ES == 1
27	// Desktop GLSL allows the const keyword for either compile-time or
28	// run-time constants. GLSL ES only allows the keyword for compile-time
29	// constants. Since we use const on run-time constants, define it to
30	// nothing.
31	#define const
32#endif
33
34%s // include "CrossPlatformSettings_piece_all.glsl"
35
/// Largest finite 32-bit float. OptimizeColorsBlock uses it (and its negation)
/// to seed the search for the smallest / largest projection.
#define FLT_MAX 340282346638528859811704183484516925440.0f

/// Upper bound on the number of endpoint refinement passes main() runs per block.
/// The refinement loop may stop earlier when the index mask stops changing.
layout( location = 0 ) uniform uint p_numRefinements;

/// Source image. Each invocation of main() fetches one 4x4 group of texels from it
/// through OGRE_Load2D.
uniform sampler2D srcTex;

/// Destination image, one texel per encoded 4x4 block. main() stores
/// ( max endpoint, min endpoint, mask & 0xFFFF, mask >> 16 ) into it.
layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

/// Single-colour match tables, indexed with an 8-bit channel value.
/// c_oMatch5 is looked up for red and blue, c_oMatch6 for green; component [0]
/// feeds the max endpoint and component [1] the min endpoint.
layout( std430, binding = 1 ) readonly restrict buffer globalBuffer
{
	float2 c_oMatch5[256];
	float2 c_oMatch6[256];
};

// 8x8x1 invocations per work group; main() maps every invocation to one 4x4 block
// located at gl_GlobalInvocationID.xy << 2.
layout( local_size_x = 8,  //
		local_size_y = 8,  //
		local_size_z = 1 ) in;
53
/// Expands a packed RGB565 colour into three channels in 8-bit range.
/// The packed colour is carried in a float holding the integer value
/// r * 2048 + g * 32 + b.
float3 rgb565to888( float rgb565 )
{
	// Pull the 5-6-5 fields apart with float arithmetic
	// (division / mod stand in for shifts and masks).
	const float3 fields = float3( floor( rgb565 / 2048.0f ),                //
								  floor( mod( rgb565, 2048.0f ) / 32.0f ),  //
								  floor( mod( rgb565, 32.0f ) ) );

	// The mathematically exact expansion would be:
	//		rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f )
	//
	// stb_dxt instead uses:
	//		rb = floor( rb * ( 256 / 32 + 8 / 32 ) );
	//		g  = floor( g  * ( 256 / 64 + 4 / 64 ) );
	//
	// The reason is not certain. It may be how S3TC specifies decoding, see:
	//		http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/
	// or it may simply be that this form is cheap with integer shifts.
	//
	// We stick to stb_dxt's variant just in case. The two agree almost everywhere,
	// differing by 1 or -1 for a very few values, and for an 888 -> 565 -> 888
	// round trip the choice possibly makes no difference at all.
	const float3 expandScale = float3( 8.25f, 4.0625f, 8.25f );

	return floor( fields * expandScale );
}
81
/// Packs a colour given in 8-bit range into RGB565, rounding each channel to
/// the nearest representable level. The result is returned as a float holding
/// the integer value r * 2048 + g * 32 + b.
float rgb888to565( float3 rgbValue )
{
	const float red5 = floor( rgbValue.r * 31.0f / 255.0f + 0.5f );
	const float green6 = floor( rgbValue.g * 63.0f / 255.0f + 0.5f );
	const float blue5 = floor( rgbValue.b * 31.0f / 255.0f + 0.5f );

	// Float equivalent of ( r << 11 ) | ( g << 5 ) | b
	return red5 * 2048.0f + green6 * 32.0f + blue5;
}
89
// Returns the colour one third of the way from a to b. The rounding behaviour is
// selected at compile time through STB_DXT_USE_ROUNDING_BIAS.
float3 lerp13( float3 a, float3 b )
{
#ifdef STB_DXT_USE_ROUNDING_BIAS
	// Biased variant: round the one-third step to nearest, then offset a by it
	const float3 oneThirdStep = floor( ( b - a ) * ( 1.0f / 3.0f ) + 0.5f );
	return a + oneThirdStep;
#else
	// Unbiased variant: weighted sum of both colours, rounded down
	const float3 weightedSum = 2.0f * a + b;
	return floor( weightedSum / 3.0f );
#endif
}
101
/// Builds the 4-entry palette of a block out of its two RGB565 endpoints.
/// Entries 0 and 1 are the expanded endpoints c0 and c1; entry 2 lies one third
/// of the way from c0 to c1 and entry 3 one third of the way from c1 to c0.
void EvalColors( out float3 colours[4], float c0, float c1 )
{
	const float3 endpoint0 = rgb565to888( c0 );
	const float3 endpoint1 = rgb565to888( c1 );

	colours[0] = endpoint0;
	colours[1] = endpoint1;
	colours[2] = lerp13( endpoint0, endpoint1 );
	colours[3] = lerp13( endpoint1, endpoint0 );
}
110
/** The color optimization function. (Clever code, part 1)
	Estimates the principal axis of the block's colours with a few power
	iterations over the covariance matrix, then returns the two pixels that
	project furthest apart along that axis.
@param srcPixelsBlock
	The 16 pixels of the block, each packed as by packUnorm4x8
@param outMinEndp16 [out]
	Minimum endpoint, in RGB565
@param outMaxEndp16 [out]
	Maximum endpoint, in RGB565
*/
void OptimizeColorsBlock( const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16 )
{
	// Gather sum and per-channel bounds, still in unorm [0; 1] range
	const float3 firstPixel = unpackUnorm4x8( srcPixelsBlock[0] ).xyz;
	float3 colourSum = firstPixel;
	float3 lowest = firstPixel;
	float3 highest = firstPixel;
	for( int i = 1; i < 16; ++i )
	{
		const float3 pixelUnorm = unpackUnorm4x8( srcPixelsBlock[i] ).xyz;
		colourSum += pixelUnorm;
		lowest = min( lowest, pixelUnorm );
		highest = max( highest, pixelUnorm );
	}

	// Switch to 8-bit range; the mean is rounded to a whole value
	const float3 mean = round( colourSum * 255.0f / 16.0f );
	highest *= 255.0f;
	lowest *= 255.0f;

	// Accumulate the upper triangle of the (symmetric) covariance matrix
	float covRR = 0.0f;
	float covRG = 0.0f;
	float covRB = 0.0f;
	float covGG = 0.0f;
	float covGB = 0.0f;
	float covBB = 0.0f;

	for( int i = 0; i < 16; ++i )
	{
		const float3 pixel = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
		const float3 delta = pixel - mean;

		covRR += delta.r * delta.r;
		covRG += delta.r * delta.g;
		covRB += delta.r * delta.b;
		covGG += delta.g * delta.g;
		covGB += delta.g * delta.b;
		covBB += delta.b * delta.b;
	}

	// Scale the matrix down before running the power iteration
	covRR /= 255.0f;
	covRG /= 255.0f;
	covRB /= 255.0f;
	covGG /= 255.0f;
	covGB /= 255.0f;
	covBB /= 255.0f;

	// Power iteration, seeded with the diagonal of the colour bounding box
	float3 axis = highest - lowest;

	const int numPowerIterations = 4;
	for( int pass = 0; pass < numPowerIterations; ++pass )
	{
		// All three components are computed from the previous axis before it is replaced
		axis = float3( axis.r * covRR + axis.g * covRG + axis.b * covRB,  //
					   axis.r * covRG + axis.g * covGG + axis.b * covGB,  //
					   axis.r * covRB + axis.g * covGB + axis.b * covBB );
	}

	const float magnitude = max3( abs( axis.r ), abs( axis.g ), abs( axis.b ) );

	float3 projAxis;
	if( magnitude < 4.0f )
	{
		// Axis too small to be meaningful: fall back to luminance
		// (JPEG YCbCr luma coefs, scaled by 1000).
		projAxis = float3( 299.0f, 587.0f, 114.0f );
	}
	else
	{
		// Normalise so the largest component becomes +/- 512, dropping the fraction
		projAxis = trunc( axis * ( 512.0f / magnitude ) );
	}

	// Pick the pixels with the smallest / largest projection onto the axis
	float3 minEndpoint, maxEndpoint;
	float lowestDot = FLT_MAX;
	float highestDot = -FLT_MAX;
	for( int i = 0; i < 16; ++i )
	{
		const float3 pixel = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
		const float projection = dot( pixel, projAxis );

		if( projection < lowestDot )
		{
			lowestDot = projection;
			minEndpoint = pixel;
		}

		if( projection > highestDot )
		{
			highestDot = projection;
			maxEndpoint = pixel;
		}
	}

	outMinEndp16 = rgb888to565( minEndpoint );
	outMaxEndp16 = rgb888to565( maxEndpoint );
}
212
/// Chooses the 2-bit palette index for a value projected onto the endpoint axis.
/// The three thresholds split the axis into four intervals; from the lowest to the
/// highest projection these map to palette indices 1, 3, 2 and 0.
uint SelectPaletteIndex( float projection, float c0Point, float halfPoint, float c3Point )
{
	if( projection < halfPoint )
		return ( projection < c0Point ) ? 1u : 3u;
	else
		return ( projection < c3Point ) ? 2u : 0u;
}

// The color matching function
//
// srcPixelsBlock	The 16 pixels of the block (row-major 4x4), packed as by packUnorm4x8
// colour			The 4-entry palette, as filled by EvalColors
//
// Returns the index mask: 2 bits per pixel, pixel 0 in the lowest bits.
uint MatchColorsBlock( const uint srcPixelsBlock[16], float3 colour[4] )
{
	uint mask = 0u;
	float3 dir = colour[0] - colour[1];
	float stops[4];

	for( int i = 0; i < 4; ++i )
		stops[i] = dot( colour[i], dir );

	// think of the colors as arranged on a line; project point onto that line, then choose
	// next color out of available ones. we compute the crossover points for "best color in top
	// half"/"best in bottom half" and then the same inside that subinterval.
	//
	// relying on this 1d approximation isn't always optimal in terms of euclidean distance,
	// but it's very close and a lot faster.
	// http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html

	float c0Point = trunc( ( stops[1] + stops[3] ) * 0.5f );
	float halfPoint = trunc( ( stops[3] + stops[2] ) * 0.5f );
	float c3Point = trunc( ( stops[2] + stops[0] ) * 0.5f );

#ifndef BC1_DITHER
	// the version without dithering is straightforward.
	// Walk the pixels backwards so that pixel 0 ends up in the lowest two bits.
	for( uint i = 16u; i-- > 0u; )
	{
		const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
		const float dotValue = dot( currColour, dir );

		mask <<= 2u;
		mask |= SelectPaletteIndex( dotValue, c0Point, halfPoint, c3Point );
	}
#else
	// with floyd-steinberg dithering
	// ep1 collects the errors of the row being processed, ep2 holds those of the row above
	float4 ep1 = float4( 0, 0, 0, 0 );
	float4 ep2 = float4( 0, 0, 0, 0 );

	// Errors are diffused in 1/16th units, so thresholds and projections are scaled by 16
	c0Point *= 16.0f;
	halfPoint *= 16.0f;
	c3Point *= 16.0f;

	for( uint y = 0u; y < 4u; ++y )
	{
		const uint rowBase = y * 4u;

		float3 currColour;
		float dotValue;
		float ditherDot;
		uint idx;
		uint lmask;

		// column 0: no left neighbour
		currColour = unpackUnorm4x8( srcPixelsBlock[rowBase + 0u] ).xyz * 255.0f;
		dotValue = dot( currColour, dir );
		ditherDot = ( dotValue * 16.0f ) + ( 3.0f * ep2[1] + 5.0f * ep2[0] );
		idx = SelectPaletteIndex( ditherDot, c0Point, halfPoint, c3Point );
		ep1[0] = dotValue - stops[idx];
		lmask = idx;

		// column 1
		currColour = unpackUnorm4x8( srcPixelsBlock[rowBase + 1u] ).xyz * 255.0f;
		dotValue = dot( currColour, dir );
		ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] );
		idx = SelectPaletteIndex( ditherDot, c0Point, halfPoint, c3Point );
		ep1[1] = dotValue - stops[idx];
		lmask |= idx << 2u;

		// column 2
		currColour = unpackUnorm4x8( srcPixelsBlock[rowBase + 2u] ).xyz * 255.0f;
		dotValue = dot( currColour, dir );
		ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] );
		idx = SelectPaletteIndex( ditherDot, c0Point, halfPoint, c3Point );
		ep1[2] = dotValue - stops[idx];
		lmask |= idx << 4u;

		// column 3: no upper-right neighbour.
		// This must sample pixel rowBase + 3; reading rowBase + 2 again would leave the
		// last column of every row out of the match entirely.
		currColour = unpackUnorm4x8( srcPixelsBlock[rowBase + 3u] ).xyz * 255.0f;
		dotValue = dot( currColour, dir );
		ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] );
		idx = SelectPaletteIndex( ditherDot, c0Point, halfPoint, c3Point );
		ep1[3] = dotValue - stops[idx];
		lmask |= idx << 6u;

		// each row contributes 8 bits to the mask
		mask |= lmask << ( y * 8u );

		// swap: the row just processed becomes the "row above" for the next one
		{
			float4 tmp = ep1;
			ep1 = ep2;
			ep2 = tmp;
		}
	}
#endif

	return mask;
}
321
// The refinement function. (Clever code, part 2)
// Tries to optimize colors to suit block contents better.
// (By solving a least squares system via normal equations+Cramer's rule)
//
// srcPixelsBlock	The 16 pixels of the block, packed as by packUnorm4x8
// mask				Current index mask, 2 bits per pixel, pixel 0 in the lowest bits
// inOutMinEndp16	RGB565 min endpoint; overwritten with the refined value
// inOutMaxEndp16	RGB565 max endpoint; overwritten with the refined value
//
// Returns true when at least one of the two endpoints changed.
bool RefineBlock( const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
				  inout float inOutMaxEndp16 )
{
	float newMin16, newMax16;
	const float oldMin = inOutMinEndp16;
	const float oldMax = inOutMaxEndp16;

	if( ( mask ^ ( mask << 2u ) ) < 4u )  // all pixels have the same index?
	{
		// yes, linear system would be singular; solve using optimal
		// single-color match on average color

		// Starting at 8 / 255 per channel adds the rounding bias of the
		// "( sum + 8 ) / 16" average computed below.
		float3 rgbVal = float3( 8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f );
		for( int i = 0; i < 16; ++i )
			rgbVal += unpackUnorm4x8( srcPixelsBlock[i] ).xyz;

		rgbVal = floor( rgbVal * ( 255.0f / 16.0f ) );

		newMax16 = c_oMatch5[uint( rgbVal.r )][0] * 2048.0f +  //
				   c_oMatch6[uint( rgbVal.g )][0] * 32.0f +    //
				   c_oMatch5[uint( rgbVal.b )][0];
		newMin16 = c_oMatch5[uint( rgbVal.r )][1] * 2048.0f +  //
				   c_oMatch6[uint( rgbVal.g )][1] * 32.0f +    //
				   c_oMatch5[uint( rgbVal.b )][1];
	}
	else
	{
		// Weight of the max endpoint (in thirds) for each palette index
		const float w1Tab[4] = float[4]( 3.0f, 0.0f, 2.0f, 1.0f );
		const float prods[4] = float[4]( 589824.0f, 2304.0f, 262402.0f, 66562.0f );
		// ^some magic to save a lot of multiplies in the accumulating loop...
		// (precomputed products of weights for least squares system, accumulated inside one 32-bit
		// register)
		// Each entry is ( w1 * w1 ) * 65536 + ( w2 * w2 ) * 256 + ( w1 * w2 ), with w2 = 3 - w1.

		float akku = 0.0f;
		uint cm = mask;
		float3 at1 = float3( 0, 0, 0 );
		float3 at2 = float3( 0, 0, 0 );
		for( int i = 0; i < 16; ++i, cm >>= 2u )
		{
			const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

			const uint idx = cm & 3u;
			const float w1 = w1Tab[idx];
			akku += prods[idx];
			at1 += currColour * w1;
			at2 += currColour;
		}

		// at2 now becomes the sum of colours weighted by w2 = 3 - w1
		at2 = 3.0f * at2 - at1;

		// extract solutions and decide solvability
		// akku carries three packed sums: xx from bit 16 up, yy in bits 8..15 and xy in
		// bits 0..7, hence the split at 65536 (= 1 << 16) and 256 (= 1 << 8). Over 16
		// pixels the lower fields reach at most 144 and 32, so they never carry over.
		const float xx = floor( akku / 65536.0f );
		const float yy = floor( mod( akku, 65536.0f ) / 256.0f );
		const float xy = mod( akku, 256.0f );

		// The determinant is only zero when every pixel shares one index,
		// which the branch above has already handled.
		float2 f_rb_g;
		f_rb_g.x = 3.0f * 31.0f / 255.0f / ( xx * yy - xy * xy );
		f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;

		// solve.
		const float3 newMaxVal = clamp( floor( ( at1 * yy - at2 * xy ) * f_rb_g.xyx + 0.5f ),
										float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
		newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;

		const float3 newMinVal = clamp( floor( ( at2 * xx - at1 * xy ) * f_rb_g.xyx + 0.5f ),
										float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
		newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
	}

	inOutMinEndp16 = newMin16;
	inOutMaxEndp16 = newMax16;

	return oldMin != newMin16 || oldMax != newMax16;
}
398
399#ifdef BC1_DITHER
/// Snaps a colour in 888 (full range) onto the levels representable in 565:
/// the input is clamped to [0; 255], rounded to 565 and expanded back to 888.
float3 quant( float3 srcValue )
{
	const float3 scaleTo565 = float3( 31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f );
	const float3 scaleTo888 = float3( 8.25f, 4.0625f, 8.25f );

	const float3 inRange = clamp( srcValue, 0.0f, 255.0f );
	// 888 -> 565, rounding to nearest
	const float3 as565 = floor( inRange * scaleTo565 + 0.5f );
	// 565 -> 888, same expansion as used by rgb565to888
	return floor( as565 * scaleTo888 );
}
412
/// Writes a dithered, 565-quantised copy of a 4x4 block into dthPixBlck.
/// Pixels are visited row by row. Each one receives a share of the quantisation
/// error of its left neighbour (7/16) and of the three neighbours in the row above
/// (1/16, 5/16, 3/16) before being quantised. Alpha of the output is set to 1.
void DitherBlock( const uint srcPixBlck[16], out uint dthPixBlck[16] )
{
	const float3 noError = float3( 0, 0, 0 );
	const float oneSixteenth = 1.0f / 16.0f;
	const float toUnorm = 1.0f / 255.0f;

	// errRow: errors of the row being processed; errAbove: errors of the previous row
	float3 errRow[4] = float3[4]( noError, noError, noError, noError );
	float3 errAbove[4] = float3[4]( noError, noError, noError, noError );

	for( uint row = 0u; row < 4u; ++row )
	{
		const uint base = row * 4u;
		float3 original, dithered;

		// column 0: no left neighbour
		original = unpackUnorm4x8( srcPixBlck[base + 0u] ).xyz * 255.0f;
		dithered = quant( original + trunc( ( 3.0f * errAbove[1] + 5.0f * errAbove[0] ) * oneSixteenth ) );
		errRow[0] = original - dithered;
		dthPixBlck[base + 0u] = packUnorm4x8( float4( dithered * toUnorm, 1.0f ) );

		// column 1
		original = unpackUnorm4x8( srcPixBlck[base + 1u] ).xyz * 255.0f;
		dithered = quant( original + trunc( ( 7.0f * errRow[0] + 3.0f * errAbove[2] +
											  5.0f * errAbove[1] + errAbove[0] ) *
											oneSixteenth ) );
		errRow[1] = original - dithered;
		dthPixBlck[base + 1u] = packUnorm4x8( float4( dithered * toUnorm, 1.0f ) );

		// column 2
		original = unpackUnorm4x8( srcPixBlck[base + 2u] ).xyz * 255.0f;
		dithered = quant( original + trunc( ( 7.0f * errRow[1] + 3.0f * errAbove[3] +
											  5.0f * errAbove[2] + errAbove[1] ) *
											oneSixteenth ) );
		errRow[2] = original - dithered;
		dthPixBlck[base + 2u] = packUnorm4x8( float4( dithered * toUnorm, 1.0f ) );

		// column 3: no upper-right neighbour
		original = unpackUnorm4x8( srcPixBlck[base + 3u] ).xyz * 255.0f;
		dithered = quant( original +
						  trunc( ( 7.0f * errRow[2] + 5.0f * errAbove[3] + errAbove[2] ) * oneSixteenth ) );
		errRow[3] = original - dithered;
		dthPixBlck[base + 3u] = packUnorm4x8( float4( dithered * toUnorm, 1.0f ) );

		// swap( errRow, errAbove ): this row becomes the "row above" of the next one
		float3 swapTmp[4] = errRow;
		errRow = errAbove;
		errAbove = swapTmp;
	}
}
453#endif
454
/// Encodes one 4x4 block of srcTex per invocation and stores the result in dstTexture.
/// Steps: load the block, pick endpoints (single-colour tables, or principal-axis fit
/// followed by up to p_numRefinements refinement passes), order the endpoints so
/// that max >= min, then write endpoints and index mask.
void main()
{
	uint srcPixelsBlock[16];

	bool bAllColoursEqual = true;

	// Load the whole 4x4 block
	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;
	for( uint i = 0u; i < 16u; ++i )
	{
		const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i & 0x03u, i >> 2u );
		const float3 srcPixels0 = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyz;
		srcPixelsBlock[i] = packUnorm4x8( float4( srcPixels0, 1.0f ) );
		bAllColoursEqual = bAllColoursEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
	}

	float maxEndp16, minEndp16;
	uint mask = 0u;

	if( bAllColoursEqual )
	{
		// Single-colour block: take the endpoints from the lookup tables.
		// Round to nearest before converting to uint: the unpack / multiply round trip
		// may land slightly below the whole value, and plain truncation would then
		// index the tables one entry too low.
		const uint3 rgbVal = uint3( unpackUnorm4x8( srcPixelsBlock[0] ).xyz * 255.0f + 0.5f );
		mask = 0xAAAAAAAAu;  // every pixel uses palette index 2
		maxEndp16 =
			c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
		minEndp16 =
			c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
	}
	else
	{
#ifdef BC1_DITHER
		uint ditherPixelsBlock[16];
		// first step: compute dithered version for PCA if desired
		DitherBlock( srcPixelsBlock, ditherPixelsBlock );
#else
		// Without dithering the endpoint search works directly on the source pixels
#	define ditherPixelsBlock srcPixelsBlock
#endif

		// second step: pca+map along principal axis
		OptimizeColorsBlock( ditherPixelsBlock, minEndp16, maxEndp16 );
		if( minEndp16 != maxEndp16 )
		{
			float3 colours[4];
			EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
			mask = MatchColorsBlock( srcPixelsBlock, colours );
		}

		// third step: refine (multiple times if requested)
		bool bStopRefinement = false;
		for( uint i = 0u; i < p_numRefinements && !bStopRefinement; ++i )
		{
			const uint lastMask = mask;

			if( RefineBlock( ditherPixelsBlock, mask, minEndp16, maxEndp16 ) )
			{
				if( minEndp16 != maxEndp16 )
				{
					float3 colours[4];
					EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
					mask = MatchColorsBlock( srcPixelsBlock, colours );
				}
				else
				{
					// Endpoints collapsed into one colour: every index is 0, nothing left to refine
					mask = 0u;
					bStopRefinement = true;
				}
			}

			// Stop as soon as a pass leaves the indices untouched
			bStopRefinement = mask == lastMask || bStopRefinement;
		}
	}

	// write the color block
	if( maxEndp16 < minEndp16 )
	{
		// Keep the larger endpoint first. Swapping the endpoints swaps palette
		// entries 0 <-> 1 and 2 <-> 3, i.e. flips the low bit of every index.
		const float tmpValue = minEndp16;
		minEndp16 = maxEndp16;
		maxEndp16 = tmpValue;
		mask ^= 0x55555555u;
	}

	uint4 outputBytes;
	outputBytes.x = uint( maxEndp16 );
	outputBytes.y = uint( minEndp16 );
	outputBytes.z = mask & 0xFFFFu;
	outputBytes.w = mask >> 16u;

	uint2 dstUV = gl_GlobalInvocationID.xy;
	imageStore( dstTexture, int2( dstUV ), outputBytes );
}
545