/*
 * Copyright 2020-2022 Matias N. Goldberg
 * Copyright 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#version 310 es

// Compute shader that encodes one channel of srcTex into 64-bit blocks, one
// block per 4x4 group of source pixels: two 8-bit endpoints followed by
// sixteen 3-bit indices (the DXT5-alpha style layout described by the article
// linked in main(); presumably used for BC4 -- confirm against the host code).

#if defined(GL_ES) && GL_ES == 1
    // Desktop GLSL allows the const keyword for either compile-time or
    // run-time constants. GLSL ES only allows the keyword for compile-time
    // constants. Since we use const on run-time constants, define it to
    // nothing.
    #define const
#endif

// Makes prior shared-memory writes visible, then synchronizes the workgroup.
#define __sharedOnlyBarrier memoryBarrierShared();barrier();

%s // include "CrossPlatformSettings_piece_all.glsl"

// One (min, max) pair per invocation of the 4x4x4 workgroup.
shared float2 g_minMaxValues[4u * 4u * 4u];
// One pair of 32-bit index masks per block, i.e. per local (y, z) pair.
shared uint2 g_mask[4u * 4u];

layout( location = 0 ) uniform uint2 params;

// Source channel selector: 0 -> x, 1 -> y, any other value -> w.
#define p_channelIdx params.x
// Non-zero: endpoints are packed with packSnorm4x8 instead of packUnorm4x8.
#define p_useSNorm params.y

uniform sampler2D srcTex;

// Each texel receives one encoded block as four 16-bit words.
layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

layout( local_size_x = 4,  //
        local_size_y = 4,  //
        local_size_z = 4 ) in;

/// Each block is 16 pixels
/// Each thread works on 4 pixels
/// Therefore each block needs 4 threads, generating 8 masks
/// At the end these 8 masks get merged into 2 and results written to output
///
/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?**
///
/// A: It's a sweetspot.
///  - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
///  - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks)
///    overhead, and also more LDS usage which reduces occupancy.
///  - Long threads (e.g. 1 thread per block) misses parallelism opportunities
///
/// Invocation layout: gl_GlobalInvocationID.yz selects the block (and the
/// destination texel); gl_LocalInvocationID.x (0..3) selects which row of the
/// 4x4 source block this invocation loads and encodes.
void main()
{
    float minVal, maxVal;
    float4 srcPixel;

    const uint blockThreadId = gl_LocalInvocationID.x;

    // Top-left source pixel of this block (block coordinate * 4).
    const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u;

    // Load the four pixels of row `blockThreadId`; `i` walks the columns.
    for( uint i = 0u; i < 4u; ++i )
    {
        const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i, blockThreadId );

        const float4 value = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyzw;
        srcPixel[i] = p_channelIdx == 0u ? value.x : ( p_channelIdx == 1u ? value.y : value.w );
        // Work in 8-bit units from here on (assumes the fetched channel is
        // normalized to [0, 1] -- TODO confirm against the bound texture format).
        srcPixel[i] *= 255.0f;
    }

    // Min/max over this invocation's own four pixels.
    minVal = min3( srcPixel.x, srcPixel.y, srcPixel.z );
    maxVal = max3( srcPixel.x, srcPixel.y, srcPixel.z );
    minVal = min( minVal, srcPixel.w );
    maxVal = max( maxVal, srcPixel.w );

    // Shared-memory slots: 4 consecutive min/max entries and 1 mask entry per block.
    const uint minMaxIdxBase = ( gl_LocalInvocationID.z << 4u ) + ( gl_LocalInvocationID.y << 2u );
    const uint maskIdxBase = ( gl_LocalInvocationID.z << 2u ) + gl_LocalInvocationID.y;

    g_minMaxValues[minMaxIdxBase + blockThreadId] = float2( minVal, maxVal );
    // All four invocations of a block store the same zero value here.
    g_mask[maskIdxBase] = uint2( 0u, 0u );

    __sharedOnlyBarrier;

    // Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded
    for( uint i = 0u; i < 4u; ++i )
    {
        minVal = min( g_minMaxValues[minMaxIdxBase + i].x, minVal );
        maxVal = max( g_minMaxValues[minMaxIdxBase + i].y, maxVal );
    }

    // determine bias and emit color indices
    // given the choice of maxVal/minVal, these indices are optimal:
    // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
    float dist = maxVal - minVal;
    float dist4 = dist * 4.0f;
    float dist2 = dist * 2.0f;
    float bias = ( dist < 8.0f ) ? ( dist - 1.0f ) : ( trunc( dist * 0.5f ) + 2.0f );
    bias -= minVal * 7.0f;

    // Index bits contributed by this invocation: mask0 holds bits 0..31 and
    // mask1 bits 32..63 of the block.
    uint mask0 = 0u, mask1 = 0u;

    for( uint i = 0u; i < 4u; ++i )
    {
        float a = srcPixel[i] * 7.0f + bias;

        int ind = 0;

        // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
        if( a >= dist4 )
        {
            ind = 4;
            a -= dist4;
        }

        if( a >= dist2 )
        {
            ind += 2;
            a -= dist2;
        }

        if( a >= dist )
            ind += 1;

        // turn linear scale into DXT index (0/1 are extremal pts)
        // Linear 7 (max) becomes 0, linear 0 (min) becomes 1, and linear
        // 6..1 become 2..7.
        ind = -ind & 7;
        ind ^= ( 2 > ind ) ? 1 : 0;

        // write index
        // Pixel number within the block is row * 4 + column; each index is
        // 3 bits and the first 16 bits of the block are left for the endpoints.
        const uint bits = 16u + ( ( blockThreadId << 2u ) + i ) * 3u;
        if( bits < 32u )
        {
            mask0 |= uint( ind ) << bits;
            // Only bits == 31 satisfies this: the index straddles both words,
            // so its upper bits spill into the bottom of mask1.
            if( bits + 3u > 32u )
            {
                mask1 |= uint( ind ) >> ( 32u - bits );
            }
        }
        else
        {
            mask1 |= uint( ind ) << ( bits - 32u );
        }
    }

    // Merge this invocation's masks into the block's masks; skip the atomic
    // when there is nothing to OR in.
    if( mask0 != 0u )
        atomicOr( g_mask[maskIdxBase].x, mask0 );
    if( mask1 != 0u )
        atomicOr( g_mask[maskIdxBase].y, mask1 );

    __sharedOnlyBarrier;

    // A single invocation per block writes the result.
    if( blockThreadId == 0u )
    {
        // Save data
        uint4 outputBytes;

        // First word: maxVal goes in the first packed component and minVal in
        // the second; the remaining two components are zero.
        if( p_useSNorm != 0u )
        {
            // Remap the 0..255 endpoints to [-1, 1] before packing.
            outputBytes.x =
                packSnorm4x8( float4( maxVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f,
                                      minVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, 0.0f, 0.0f ) );
        }
        else
        {
            outputBytes.x = packUnorm4x8(
                float4( maxVal * ( 1.0f / 255.0f ), minVal * ( 1.0f / 255.0f ), 0.0f, 0.0f ) );
        }
        // Remaining three words: the index bits, 16 at a time (index bits in
        // mask0 start at bit 16, so its low half is never set).
        outputBytes.y = g_mask[maskIdxBase].x >> 16u;
        outputBytes.z = g_mask[maskIdxBase].y & 0xFFFFu;
        outputBytes.w = g_mask[maskIdxBase].y >> 16u;

        uint2 dstUV = gl_GlobalInvocationID.yz;
        imageStore( dstTexture, int2( dstUV ), outputBytes );
    }
}