/*
 * Copyright 2020-2022 Matias N. Goldberg
 * Copyright 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#version 310 es

#if defined(GL_ES) && GL_ES == 1
    // Desktop GLSL allows the const keyword for either compile-time or
    // run-time constants. GLSL ES only allows the keyword for compile-time
    // constants. Since we use const on run-time constants, define it to
    // nothing.
    #define const
#endif

%s // include "CrossPlatformSettings_piece_all.glsl"

#define FLT_MAX 340282346638528859811704183484516925440.0f

layout( location = 0 ) uniform uint p_numRefinements;

uniform sampler2D srcTex;

layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

layout( std430, binding = 1 ) readonly restrict buffer globalBuffer
{
    float2 c_oMatch5[256];
    float2 c_oMatch6[256];
};

layout( local_size_x = 8,  //
        local_size_y = 8,  //
        local_size_z = 1 ) in;

/// Expands an RGB565 value (stored in a float) to three 8-bit channels.
/// Each returned channel is an integer-valued float.
float3 rgb565to888( float rgb565 )
{
    float3 retVal;
    retVal.x = floor( rgb565 / 2048.0f );
    retVal.y = floor( mod( rgb565, 2048.0f ) / 32.0f );
    retVal.z = floor( mod( rgb565, 32.0f ) );

    // This is the correct 565 to 888 conversion:
    //      rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f )
    //
    // However stb_dxt follows a different one:
    //      rb = floor( rb * ( 256 / 32 + 8 / 32 ) );
    //      g  = floor( g  * ( 256 / 64 + 4 / 64 ) );
    //
    // I'm not sure exactly why but it's possible this is how the S3TC specifies it should be decoded
    // It's quite possible this is the reason:
    //      http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/
    //
    // Or maybe it's just because it's cheap to do with integer shifts.
    // Anyway, we follow stb_dxt's conversion just in case
    // (gives almost the same result, with 1 or -1 of difference for a very few values)
    //
    // Perhaps when we make 888 -> 565 -> 888 it doesn't matter
    // because they end up mapping to the original number

    return floor( retVal * float3( 8.25f, 4.0625f, 8.25f ) );
}

/// Quantizes an 8-bit-per-channel colour (range [0; 255]) to RGB565, rounding to
/// nearest, and returns it packed as r * 2048 + g * 32 + b in a float.
float rgb888to565( float3 rgbValue )
{
    rgbValue.rb = floor( rgbValue.rb * 31.0f / 255.0f + 0.5f );
    rgbValue.g = floor( rgbValue.g * 63.0f / 255.0f + 0.5f );

    return rgbValue.r * 2048.0f + rgbValue.g * 32.0f + rgbValue.b;
}

// linear interpolation at 1/3 point between a and b, using desired rounding type
float3 lerp13( float3 a, float3 b )
{
#ifdef STB_DXT_USE_ROUNDING_BIAS
    // with rounding bias
    return a + floor( ( b - a ) * ( 1.0f / 3.0f ) + 0.5f );
#else
    // without rounding bias
    return floor( ( 2.0f * a + b ) / 3.0f );
#endif
}

/// Unpacks a block of 4 colours from two 16-bit endpoints
///
/// colours[0] and colours[1] are the decoded endpoints c0 and c1;
/// colours[2] lies 1/3 of the way from c0 to c1 and colours[3] 1/3 of the way
/// from c1 to c0.
void EvalColors( out float3 colours[4], float c0, float c1 )
{
    colours[0] = rgb565to888( c0 );
    colours[1] = rgb565to888( c1 );
    colours[2] = lerp13( colours[0], colours[1] );
    colours[3] = lerp13( colours[1], colours[0] );
}

/** The color optimization function. (Clever code, part 1)

    Estimates the principal axis of the block's colours with a few power
    iterations over the covariance matrix, then picks the two pixels that
    project furthest apart along that axis as endpoints.
@param srcPixelsBlock
    The 16 pixels of the block, each packed with packUnorm4x8
@param outMinEndp16 [out]
    Minimum endpoint, in RGB565
@param outMaxEndp16 [out]
    Maximum endpoint, in RGB565
*/
void OptimizeColorsBlock( const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16 )
{
    // determine color distribution
    float3 avgColour;
    float3 minColour;
    float3 maxColour;

    avgColour = minColour = maxColour = unpackUnorm4x8( srcPixelsBlock[0] ).xyz;
    for( int i = 1; i < 16; ++i )
    {
        const float3 currColourUnorm = unpackUnorm4x8( srcPixelsBlock[i] ).xyz;
        avgColour += currColourUnorm;
        minColour = min( minColour, currColourUnorm );
        maxColour = max( maxColour, currColourUnorm );
    }

    // Move from the [0; 1] range to [0; 255]
    avgColour = round( avgColour * 255.0f / 16.0f );
    maxColour *= 255.0f;
    minColour *= 255.0f;

    // determine covariance matrix (symmetric, so only the upper triangle is stored:
    // rr, rg, rb, gg, gb, bb)
    float cov[6];
    for( int i = 0; i < 6; ++i )
        cov[i] = 0.0f;

    for( int i = 0; i < 16; ++i )
    {
        const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
        float3 rgbDiff = currColour - avgColour;

        cov[0] += rgbDiff.r * rgbDiff.r;
        cov[1] += rgbDiff.r * rgbDiff.g;
        cov[2] += rgbDiff.r * rgbDiff.b;
        cov[3] += rgbDiff.g * rgbDiff.g;
        cov[4] += rgbDiff.g * rgbDiff.b;
        cov[5] += rgbDiff.b * rgbDiff.b;
    }

    // scale the covariance matrix down, find principal axis via power iter
    for( int i = 0; i < 6; ++i )
        cov[i] /= 255.0f;

    // Initial guess for the axis: the diagonal of the colour bounding box
    float3 vF = maxColour - minColour;

    const int nIterPower = 4;
    for( int iter = 0; iter < nIterPower; ++iter )
    {
        const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2];
        const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4];
        const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5];

        vF.r = r;
        vF.g = g;
        vF.b = b;
    }

    float magn = max3( abs( vF.r ), abs( vF.g ), abs( vF.b ) );
    float3 v;

    if( magn < 4.0f )
    {                  // too small, default to luminance
        v.r = 299.0f;  // JPEG YCbCr luma coefs, scaled by 1000.
        v.g = 587.0f;
        v.b = 114.0f;
    }
    else
    {
        v = trunc( vF * ( 512.0f / magn ) );
    }

    // Pick colors at extreme points
    // (the first iteration always passes both comparisons, so both endpoints get initialized)
    float3 minEndpoint, maxEndpoint;
    float minDot = FLT_MAX;
    float maxDot = -FLT_MAX;
    for( int i = 0; i < 16; ++i )
    {
        const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
        const float dotValue = dot( currColour, v );

        if( dotValue < minDot )
        {
            minDot = dotValue;
            minEndpoint = currColour;
        }

        if( dotValue > maxDot )
        {
            maxDot = dotValue;
            maxEndpoint = currColour;
        }
    }

    outMinEndp16 = rgb888to565( minEndpoint );
    outMaxEndp16 = rgb888to565( maxEndpoint );
}

/// Selects the 2-bit palette index for a value projected onto the endpoint axis.
///
/// The three thresholds are the crossover points between neighbouring palette
/// entries along that axis. All four arguments must use the same scale.
uint SelectPaletteIndex( float projected, float c0Point, float halfPoint, float c3Point )
{
    if( projected < halfPoint )
        return ( projected < c0Point ) ? 1u : 3u;

    return ( projected < c3Point ) ? 2u : 0u;
}

/// The color matching function
///
/// Assigns each of the 16 pixels in srcPixelsBlock to one of the 4 palette
/// entries in colour (as produced by EvalColors).
///
/// Returns the 32-bit index mask: 2 bits per pixel, pixel 0 in the lowest bits.
uint MatchColorsBlock( const uint srcPixelsBlock[16], float3 colour[4] )
{
    uint mask = 0u;
    float3 dir = colour[0] - colour[1];
    float stops[4];

    for( int i = 0; i < 4; ++i )
        stops[i] = dot( colour[i], dir );

    // think of the colors as arranged on a line; project point onto that line, then choose
    // next color out of available ones. we compute the crossover points for "best color in top
    // half"/"best in bottom half" and then the same inside that subinterval.
    //
    // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
    // but it's very close and a lot faster.
    // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html

    float c0Point = trunc( ( stops[1] + stops[3] ) * 0.5f );
    float halfPoint = trunc( ( stops[3] + stops[2] ) * 0.5f );
    float c3Point = trunc( ( stops[2] + stops[0] ) * 0.5f );

#ifndef BC1_DITHER
    // the version without dithering is straightforward
    // (iterates backwards so that pixel 0 ends up in the lowest bits)
    for( uint i = 16u; i-- > 0u; )
    {
        const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

        const float dotValue = dot( currColour, dir );
        mask <<= 2u;
        mask |= SelectPaletteIndex( dotValue, c0Point, halfPoint, c3Point );
    }
#else
    // with floyd-steinberg dithering
    // ep1 holds the errors of the row being processed, ep2 those of the previous row
    float4 ep1 = float4( 0, 0, 0, 0 );
    float4 ep2 = float4( 0, 0, 0, 0 );

    // The diffused error weights sum up to 16, so work in 1/16th units
    c0Point *= 16.0f;
    halfPoint *= 16.0f;
    c3Point *= 16.0f;

    for( uint y = 0u; y < 4u; ++y )
    {
        float ditherDot;
        uint lmask, idx;

        float3 currColour;
        float dotValue;

        // Column 0 (no left neighbour)
        currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 0u] ).xyz * 255.0f;
        dotValue = dot( currColour, dir );

        ditherDot = ( dotValue * 16.0f ) + ( 3.0f * ep2[1] + 5.0f * ep2[0] );
        idx = SelectPaletteIndex( ditherDot, c0Point, halfPoint, c3Point );
        ep1[0] = dotValue - stops[idx];
        lmask = idx;

        // Column 1
        currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 1u] ).xyz * 255.0f;
        dotValue = dot( currColour, dir );

        ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] );
        idx = SelectPaletteIndex( ditherDot, c0Point, halfPoint, c3Point );
        ep1[1] = dotValue - stops[idx];
        lmask |= idx << 2u;

        // Column 2
        currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 2u] ).xyz * 255.0f;
        dotValue = dot( currColour, dir );

        ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] );
        idx = SelectPaletteIndex( ditherDot, c0Point, halfPoint, c3Point );
        ep1[2] = dotValue - stops[idx];
        lmask |= idx << 4u;

        // Column 3 (no right neighbour). Must read pixel y * 4 + 3, the last one of the row.
        currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 3u] ).xyz * 255.0f;
        dotValue = dot( currColour, dir );

        ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] );
        idx = SelectPaletteIndex( ditherDot, c0Point, halfPoint, c3Point );
        ep1[3] = dotValue - stops[idx];
        lmask |= idx << 6u;

        // Each row contributes 8 bits (4 pixels x 2 bits)
        mask |= lmask << ( y * 8u );
        {
            float4 tmp = ep1;
            ep1 = ep2;
            ep2 = tmp;
        }  // swap
    }
#endif

    return mask;
}

// The refinement function. (Clever code, part 2)
// Tries to optimize colors to suit block contents better.
// (By solving a least squares system via normal equations+Cramer's rule)
//
// srcPixelsBlock   The 16 pixels of the block, each packed with packUnorm4x8
// mask             Current palette indices, 2 bits per pixel, pixel 0 in the lowest bits
// inOutMinEndp16   [in/out] Minimum endpoint in RGB565, replaced by the refined one
// inOutMaxEndp16   [in/out] Maximum endpoint in RGB565, replaced by the refined one
//
// Returns true if at least one of the endpoints changed
bool RefineBlock( const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
                  inout float inOutMaxEndp16 )
{
    float newMin16, newMax16;
    const float oldMin = inOutMinEndp16;
    const float oldMax = inOutMaxEndp16;

    if( ( mask ^ ( mask << 2u ) ) < 4u )  // all pixels have the same index?
    {
        // yes, linear system would be singular; solve using optimal
        // single-color match on average color
        // (the initial 8 / 255 is the rounding bias for the division by 16 below)
        float3 rgbVal = float3( 8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f );
        for( int i = 0; i < 16; ++i )
            rgbVal += unpackUnorm4x8( srcPixelsBlock[i] ).xyz;

        rgbVal = floor( rgbVal * ( 255.0f / 16.0f ) );

        newMax16 = c_oMatch5[uint( rgbVal.r )][0] * 2048.0f +  //
                   c_oMatch6[uint( rgbVal.g )][0] * 32.0f +    //
                   c_oMatch5[uint( rgbVal.b )][0];
        newMin16 = c_oMatch5[uint( rgbVal.r )][1] * 2048.0f +  //
                   c_oMatch6[uint( rgbVal.g )][1] * 32.0f +    //
                   c_oMatch5[uint( rgbVal.b )][1];
    }
    else
    {
        // Weight (in thirds) of the max endpoint for each palette index
        const float w1Tab[4] = float[4]( 3.0f, 0.0f, 2.0f, 1.0f );
        const float prods[4] = float[4]( 589824.0f, 2304.0f, 262402.0f, 66562.0f );
        // ^some magic to save a lot of multiplies in the accumulating loop...
        // (precomputed products of weights for least squares system, accumulated inside one 32-bit
        // register)
        //
        // With w2 = 3 - w1, each entry is: w1 * w1 * 65536 + w2 * w2 * 256 + w1 * w2
        // The sum over 16 pixels is at most 16 * 589824, which a 32-bit float holds exactly.

        float akku = 0.0f;
        uint cm = mask;
        float3 at1 = float3( 0, 0, 0 );
        float3 at2 = float3( 0, 0, 0 );
        for( int i = 0; i < 16; ++i, cm >>= 2u )
        {
            const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

            const uint idx = cm & 3u;
            const float w1 = w1Tab[idx];
            akku += prods[idx];
            at1 += currColour * w1;
            at2 += currColour;
        }

        // at2 accumulated sum( colour ); turn it into sum( colour * w2 ) = sum( colour * ( 3 - w1 ) )
        at2 = 3.0f * at2 - at1;

        // extract solutions and decide solvability
        // (undo the packing of prods: bits [16; 24) are xx, bits [8; 16) are yy, bits [0; 8) are xy)
        const float xx = floor( akku / 65536.0f );
        const float yy = floor( mod( akku, 65536.0f ) / 256.0f );
        const float xy = mod( akku, 256.0f );

        // Scale factors that bring the solution straight into the 5-bit (r, b)
        // and 6-bit (g) ranges
        float2 f_rb_g;
        f_rb_g.x = 3.0f * 31.0f / 255.0f / ( xx * yy - xy * xy );
        f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;

        // solve.
        const float3 newMaxVal = clamp( floor( ( at1 * yy - at2 * xy ) * f_rb_g.xyx + 0.5f ),
                                        float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
        newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;

        const float3 newMinVal = clamp( floor( ( at2 * xx - at1 * xy ) * f_rb_g.xyx + 0.5f ),
                                        float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
        newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
    }

    inOutMinEndp16 = newMin16;
    inOutMaxEndp16 = newMax16;

    return oldMin != newMin16 || oldMax != newMax16;
}

#ifdef BC1_DITHER
/// Quantizes 'srcValue' which is originally in 888 (full range),
/// converting it to 565 and then back to 888 (quantized)
float3 quant( float3 srcValue )
{
    srcValue = clamp( srcValue, 0.0f, 255.0f );
    // Convert 888 -> 565
    srcValue = floor( srcValue * float3( 31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f ) + 0.5f );
    // Convert 565 -> 888 back
    srcValue = floor( srcValue * float3( 8.25f, 4.0625f, 8.25f ) );

    return srcValue;
}

/// Produces a copy of the block quantized to 565 precision, diffusing the
/// quantization error of each pixel to its neighbours inside the block.
///
/// srcPixBlck   The 16 source pixels, each packed with packUnorm4x8
/// dthPixBlck   [out] The 16 dithered pixels, packed the same way with alpha set to 1
void DitherBlock( const uint srcPixBlck[16], out uint dthPixBlck[16] )
{
    // ep1 holds the errors of the row being processed, ep2 those of the previous row
    float3 ep1[4] = float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) );
    float3 ep2[4] = float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) );

    // y is the index of the first pixel of each row
    for( uint y = 0u; y < 16u; y += 4u )
    {
        float3 srcPixel, dithPixel;

        srcPixel = unpackUnorm4x8( srcPixBlck[y + 0u] ).xyz * 255.0f;
        dithPixel = quant( srcPixel + trunc( ( 3.0f * ep2[1] + 5.0f * ep2[0] ) * ( 1.0f / 16.0f ) ) );
        ep1[0] = srcPixel - dithPixel;
        dthPixBlck[y + 0u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) );

        srcPixel = unpackUnorm4x8( srcPixBlck[y + 1u] ).xyz * 255.0f;
        dithPixel = quant(
            srcPixel + trunc( ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] ) * ( 1.0f / 16.0f ) ) );
        ep1[1] = srcPixel - dithPixel;
        dthPixBlck[y + 1u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) );

        srcPixel = unpackUnorm4x8( srcPixBlck[y + 2u] ).xyz * 255.0f;
        dithPixel = quant(
            srcPixel + trunc( ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] ) * ( 1.0f / 16.0f ) ) );
        ep1[2] = srcPixel - dithPixel;
        dthPixBlck[y + 2u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) );

        srcPixel = unpackUnorm4x8( srcPixBlck[y + 3u] ).xyz * 255.0f;
        dithPixel = quant( srcPixel + trunc( ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] ) * ( 1.0f / 16.0f ) ) );
        ep1[3] = srcPixel - dithPixel;
        dthPixBlck[y + 3u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) );

        // swap( ep1, ep2 )
        for( uint i = 0u; i < 4u; ++i )
        {
            float3 tmp = ep1[i];
            ep1[i] = ep2[i];
            ep2[i] = tmp;
        }
    }
}
#endif

/// Each invocation reads one 4x4 block of pixels from srcTex and writes one
/// texel to dstTexture with: x = max endpoint, y = min endpoint,
/// z = lower 16 bits of the index mask, w = upper 16 bits of the index mask.
void main()
{
    uint srcPixelsBlock[16];

    bool bAllColoursEqual = true;

    // Load the whole 4x4 block
    const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;
    for( uint i = 0u; i < 16u; ++i )
    {
        const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i & 0x03u, i >> 2u );
        const float3 srcPixels0 = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyz;
        srcPixelsBlock[i] = packUnorm4x8( float4( srcPixels0, 1.0f ) );
        bAllColoursEqual = bAllColoursEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
    }

    float maxEndp16, minEndp16;
    uint mask = 0u;

    if( bAllColoursEqual )
    {
        // Constant colour: use the precomputed single-colour match tables.
        // The + 0.5f makes the conversion round to nearest, so that a slightly inexact
        // unorm -> float -> [0; 255] round trip cannot select the neighbouring table entry.
        const uint3 rgbVal = uint3( unpackUnorm4x8( srcPixelsBlock[0] ).xyz * 255.0f + 0.5f );
        mask = 0xAAAAAAAAu;
        maxEndp16 =
            c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
        minEndp16 =
            c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
    }
    else
    {
#ifdef BC1_DITHER
        uint ditherPixelsBlock[16];
        // first step: compute dithered version for PCA if desired
        DitherBlock( srcPixelsBlock, ditherPixelsBlock );
#else
#    define ditherPixelsBlock srcPixelsBlock
#endif

        // second step: pca+map along principal axis
        OptimizeColorsBlock( ditherPixelsBlock, minEndp16, maxEndp16 );
        if( minEndp16 != maxEndp16 )
        {
            float3 colours[4];
            EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
            mask = MatchColorsBlock( srcPixelsBlock, colours );
        }

        // third step: refine (multiple times if requested)
        bool bStopRefinement = false;
        for( uint i = 0u; i < p_numRefinements && !bStopRefinement; ++i )
        {
            const uint lastMask = mask;

            if( RefineBlock( ditherPixelsBlock, mask, minEndp16, maxEndp16 ) )
            {
                if( minEndp16 != maxEndp16 )
                {
                    float3 colours[4];
                    EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
                    mask = MatchColorsBlock( srcPixelsBlock, colours );
                }
                else
                {
                    mask = 0u;
                    bStopRefinement = true;
                }
            }

            // Stop as soon as another pass no longer changes the indices
            bStopRefinement = mask == lastMask || bStopRefinement;
        }
    }

    // write the color block
    if( maxEndp16 < minEndp16 )
    {
        const float tmpValue = minEndp16;
        minEndp16 = maxEndp16;
        maxEndp16 = tmpValue;
        // Swapping the endpoints swaps palette entries 0 <-> 1 and 2 <-> 3,
        // i.e. it flips the lowest bit of every 2-bit index
        mask ^= 0x55555555u;
    }

    uint4 outputBytes;
    outputBytes.x = uint( maxEndp16 );
    outputBytes.y = uint( minEndp16 );
    outputBytes.z = mask & 0xFFFFu;
    outputBytes.w = mask >> 16u;

    uint2 dstUV = gl_GlobalInvocationID.xy;
    imageStore( dstTexture, int2( dstUV ), outputBytes );
}