xref: /aosp_15_r20/external/swiftshader/src/Pipeline/SamplerCore.cpp (revision 03ce13f70fcc45d86ee91b7ee4cab1936a95046e)
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "SamplerCore.hpp"
16 
17 #include "Constants.hpp"
18 #include "PixelRoutine.hpp"
19 #include "System/Debug.hpp"
20 #include "Vulkan/VkSampler.hpp"
21 
22 namespace sw {
23 
SamplerCore(Pointer<Byte> & constants,const Sampler & state,SamplerFunction function)24 SamplerCore::SamplerCore(Pointer<Byte> &constants, const Sampler &state, SamplerFunction function)
25     : constants(constants)
26     , state(state)
27     , function(function)
28 {
29 }
30 
sampleTexture(Pointer<Byte> & texture,SIMD::Float uvwa[4],const SIMD::Float & dRef,const Float & lodOrBias,const SIMD::Float & dsx,const SIMD::Float & dsy,SIMD::Int offset[4],const SIMD::Int & sample)31 SIMD::Float4 SamplerCore::sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], const SIMD::Float &dRef, const Float &lodOrBias, const SIMD::Float &dsx, const SIMD::Float &dsy, SIMD::Int offset[4], const SIMD::Int &sample)
32 {
33 	SIMD::Float4 c;
34 
35 	for(int i = 0; i < SIMD::Width / 4; i++)
36 	{
37 		Float4 uvwa128[4];
38 		uvwa128[0] = Extract128(uvwa[0], i);
39 		uvwa128[1] = Extract128(uvwa[1], i);
40 		uvwa128[2] = Extract128(uvwa[2], i);
41 		uvwa128[3] = Extract128(uvwa[3], i);
42 
43 		Vector4i offset128;
44 		offset128[0] = Extract128(offset[0], i);
45 		offset128[1] = Extract128(offset[1], i);
46 		offset128[2] = Extract128(offset[2], i);
47 		offset128[3] = Extract128(offset[3], i);
48 
49 		Vector4f c128 = sampleTexture128(texture, uvwa128, Extract128(dRef, i), lodOrBias, Extract128(dsx, i), Extract128(dsy, i), offset128, Extract128(sample, i));
50 		c.x = Insert128(c.x, c128.x, i);
51 		c.y = Insert128(c.y, c128.y, i);
52 		c.z = Insert128(c.z, c128.z, i);
53 		c.w = Insert128(c.w, c128.w, i);
54 	}
55 
56 	return c;
57 }
58 
sampleTexture128(Pointer<Byte> & texture,Float4 uvwa[4],const Float4 & dRef,const Float & lodOrBias,const Float4 & dsx,const Float4 & dsy,Vector4i & offset,const Int4 & sample)59 Vector4f SamplerCore::sampleTexture128(Pointer<Byte> &texture, Float4 uvwa[4], const Float4 &dRef, const Float &lodOrBias, const Float4 &dsx, const Float4 &dsy, Vector4i &offset, const Int4 &sample)
60 {
61 	Vector4f c;
62 
63 	Float4 u = uvwa[0];
64 	Float4 v = uvwa[1];
65 	Float4 w = uvwa[2];
66 	Float4 a;  // Array layer coordinate
67 	switch(state.textureType)
68 	{
69 	case VK_IMAGE_VIEW_TYPE_1D_ARRAY: a = uvwa[1]; break;
70 	case VK_IMAGE_VIEW_TYPE_2D_ARRAY: a = uvwa[2]; break;
71 	case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: a = uvwa[3]; break;
72 	default: break;
73 	}
74 
75 	Float lod;
76 	Float anisotropy;
77 	Float4 uDelta;
78 	Float4 vDelta;
79 	Float4 M;  // Major axis
80 
81 	if(state.isCube())
82 	{
83 		Int4 face = cubeFace(u, v, uvwa[0], uvwa[1], uvwa[2], M);
84 		w = As<Float4>(face);
85 	}
86 
87 	// Determine if we can skip the LOD computation. This is the case when the mipmap has only one level, except for LOD query,
88 	// where we have to return the computed value. Anisotropic filtering requires computing the anisotropy factor even for a single mipmap level.
89 	bool singleMipLevel = (state.minLod == state.maxLod);
90 	bool requiresLodComputation = (function == Query) || (state.textureFilter == FILTER_ANISOTROPIC);
91 	bool skipLodComputation = singleMipLevel && !requiresLodComputation;
92 
93 	if(skipLodComputation)
94 	{
95 		lod = state.minLod;
96 	}
97 	else if(function == Implicit || function == Bias || function == Grad || function == Query)
98 	{
99 		if(state.is1D())
100 		{
101 			computeLod1D(texture, lod, u, dsx, dsy);
102 		}
103 		else if(state.is2D())
104 		{
105 			computeLod2D(texture, lod, anisotropy, uDelta, vDelta, u, v, dsx, dsy);
106 		}
107 		else if(state.isCube())
108 		{
109 			computeLodCube(texture, lod, uvwa[0], uvwa[1], uvwa[2], dsx, dsy, M);
110 		}
111 		else
112 		{
113 			computeLod3D(texture, lod, u, v, w, dsx, dsy);
114 		}
115 
116 		Float bias = state.mipLodBias;
117 
118 		if(function == Bias)
119 		{
120 			// Add SPIR-V Bias operand to the sampler provided bias and clamp to maxSamplerLodBias limit.
121 			bias = Min(Max(bias + lodOrBias, -vk::MAX_SAMPLER_LOD_BIAS), vk::MAX_SAMPLER_LOD_BIAS);
122 		}
123 
124 		lod += bias;
125 	}
126 	else if(function == Lod)
127 	{
128 		// Vulkan 1.1: "The absolute value of mipLodBias must be less than or equal to VkPhysicalDeviceLimits::maxSamplerLodBias"
129 		// Hence no explicit clamping to maxSamplerLodBias is required in this case.
130 		lod = lodOrBias + state.mipLodBias;
131 	}
132 	else if(function == Fetch)
133 	{
134 		// TODO: Eliminate int-float-int conversion.
135 		lod = Float(As<Int>(lodOrBias));
136 		lod = Max(lod, state.minLod);
137 		lod = Min(lod, state.maxLod);
138 	}
139 	else if(function == Base || function == Gather)
140 	{
141 		lod = Float(0);
142 	}
143 	else
144 		UNREACHABLE("Sampler function %d", int(function));
145 
146 	if(function != Base && function != Fetch && function != Gather)
147 	{
148 		if(function == Query)
149 		{
150 			c.y = Float4(lod);  // Unclamped LOD.
151 		}
152 
153 		if(!skipLodComputation)
154 		{
155 			lod = Max(lod, state.minLod);
156 			lod = Min(lod, state.maxLod);
157 		}
158 
159 		if(function == Query)
160 		{
161 			if(state.mipmapFilter == MIPMAP_POINT)
162 			{
163 				lod = Round(lod);  // TODO: Preferred formula is ceil(lod + 0.5) - 1
164 			}
165 
166 			c.x = lod;
167 			//	c.y contains unclamped LOD.
168 
169 			return c;
170 		}
171 	}
172 
173 	bool force32BitFiltering = state.highPrecisionFiltering && !isYcbcrFormat() && (state.textureFilter != FILTER_POINT);
174 	bool use32BitFiltering = hasFloatTexture() || hasUnnormalizedIntegerTexture() || force32BitFiltering ||
175 	                         state.isCube() || state.unnormalizedCoordinates || state.compareEnable ||
176 	                         borderModeActive() || (function == Gather) || (function == Fetch);
177 	int numComponents = (function == Gather) ? 4 : textureComponentCount();
178 
179 	if(use32BitFiltering)
180 	{
181 		c = sampleFloatFilter(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta);
182 	}
183 	else  // 16-bit filtering.
184 	{
185 		Vector4s cs = sampleFilter(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta);
186 
187 		for(int component = 0; component < numComponents; component++)
188 		{
189 			if(hasUnsignedTextureComponent(component))
190 			{
191 				c[component] = Float4(As<UShort4>(cs[component]));
192 			}
193 			else
194 			{
195 				c[component] = Float4(cs[component]);
196 			}
197 		}
198 	}
199 
200 	if(hasNormalizedFormat() && !state.compareEnable)
201 	{
202 		sw::float4 scale = getComponentScale();
203 
204 		for(int component = 0; component < numComponents; component++)
205 		{
206 			int texelComponent = (function == Gather) ? getGatherComponent() : component;
207 			c[component] *= Float4(1.0f / scale[texelComponent]);
208 		}
209 	}
210 
211 	if(state.textureFormat.isSignedNormalized())
212 	{
213 		for(int component = 0; component < numComponents; component++)
214 		{
215 			c[component] = Max(c[component], Float4(-1.0f));
216 		}
217 	}
218 
219 	if(state.textureFilter != FILTER_GATHER)
220 	{
221 		if((state.swizzle.r != VK_COMPONENT_SWIZZLE_R) ||
222 		   (state.swizzle.g != VK_COMPONENT_SWIZZLE_G) ||
223 		   (state.swizzle.b != VK_COMPONENT_SWIZZLE_B) ||
224 		   (state.swizzle.a != VK_COMPONENT_SWIZZLE_A))
225 		{
226 			const Vector4f col = c;
227 			bool integer = hasUnnormalizedIntegerTexture();
228 			c.x = applySwizzle(col, state.swizzle.r, integer);
229 			c.y = applySwizzle(col, state.swizzle.g, integer);
230 			c.z = applySwizzle(col, state.swizzle.b, integer);
231 			c.w = applySwizzle(col, state.swizzle.a, integer);
232 		}
233 	}
234 	else  // Gather
235 	{
236 		VkComponentSwizzle swizzle = gatherSwizzle();
237 
238 		// R/G/B/A swizzles affect the component collected from each texel earlier.
239 		// Handle the ZERO and ONE cases here because we don't need to know the format.
240 
241 		if(swizzle == VK_COMPONENT_SWIZZLE_ZERO)
242 		{
243 			c.x = c.y = c.z = c.w = Float4(0);
244 		}
245 		else if(swizzle == VK_COMPONENT_SWIZZLE_ONE)
246 		{
247 			bool integer = hasUnnormalizedIntegerTexture();
248 			c.x = c.y = c.z = c.w = integer ? As<Float4>(Int4(1)) : RValue<Float4>(Float4(1.0f));
249 		}
250 	}
251 
252 	return c;
253 }
254 
applySwizzle(const Vector4f & c,VkComponentSwizzle swizzle,bool integer)255 Float4 SamplerCore::applySwizzle(const Vector4f &c, VkComponentSwizzle swizzle, bool integer)
256 {
257 	switch(swizzle)
258 	{
259 	default: UNSUPPORTED("VkComponentSwizzle %d", (int)swizzle);
260 	case VK_COMPONENT_SWIZZLE_R: return c.x;
261 	case VK_COMPONENT_SWIZZLE_G: return c.y;
262 	case VK_COMPONENT_SWIZZLE_B: return c.z;
263 	case VK_COMPONENT_SWIZZLE_A: return c.w;
264 	case VK_COMPONENT_SWIZZLE_ZERO: return Float4(0.0f, 0.0f, 0.0f, 0.0f);
265 	case VK_COMPONENT_SWIZZLE_ONE:
266 		if(integer)
267 		{
268 			return Float4(As<Float4>(sw::Int4(1, 1, 1, 1)));
269 		}
270 		else
271 		{
272 			return Float4(1.0f, 1.0f, 1.0f, 1.0f);
273 		}
274 		break;
275 	}
276 };
277 
offsetSample(Short4 & uvw,Pointer<Byte> & mipmap,int halfOffset,bool wrap,int count,Float & lod)278 Short4 SamplerCore::offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod)
279 {
280 	Short4 offset = *Pointer<UShort4>(mipmap + halfOffset);
281 
282 	if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
283 	{
284 		offset &= Short4(CmpNLE(Float4(lod), Float4(0.0f)));
285 	}
286 	else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
287 	{
288 		offset &= Short4(CmpLE(Float4(lod), Float4(0.0f)));
289 	}
290 
291 	if(wrap)
292 	{
293 		switch(count)
294 		{
295 		case -1: return uvw - offset;
296 		case 0: return uvw;
297 		case +1: return uvw + offset;
298 		case 2: return uvw + offset + offset;
299 		}
300 	}
301 	else  // Clamp or mirror
302 	{
303 		switch(count)
304 		{
305 		case -1: return SubSat(As<UShort4>(uvw), As<UShort4>(offset));
306 		case 0: return uvw;
307 		case +1: return AddSat(As<UShort4>(uvw), As<UShort4>(offset));
308 		case 2: return AddSat(AddSat(As<UShort4>(uvw), As<UShort4>(offset)), As<UShort4>(offset));
309 		}
310 	}
311 
312 	return uvw;
313 }
314 
sampleFilter(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,Float & anisotropy,Float4 & uDelta,Float4 & vDelta)315 Vector4s SamplerCore::sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta)
316 {
317 	Vector4s c = sampleAniso(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta, false);
318 
319 	if(function == Fetch)
320 	{
321 		return c;
322 	}
323 
324 	if(state.mipmapFilter == MIPMAP_LINEAR)
325 	{
326 		Vector4s cc = sampleAniso(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta, true);
327 
328 		lod *= Float(1 << 16);
329 
330 		UShort4 utri = UShort4(Float4(lod));  // TODO: Optimize
331 		Short4 stri = utri >> 1;              // TODO: Optimize
332 
333 		if(hasUnsignedTextureComponent(0))
334 			cc.x = MulHigh(As<UShort4>(cc.x), utri);
335 		else
336 			cc.x = MulHigh(cc.x, stri);
337 		if(hasUnsignedTextureComponent(1))
338 			cc.y = MulHigh(As<UShort4>(cc.y), utri);
339 		else
340 			cc.y = MulHigh(cc.y, stri);
341 		if(hasUnsignedTextureComponent(2))
342 			cc.z = MulHigh(As<UShort4>(cc.z), utri);
343 		else
344 			cc.z = MulHigh(cc.z, stri);
345 		if(hasUnsignedTextureComponent(3))
346 			cc.w = MulHigh(As<UShort4>(cc.w), utri);
347 		else
348 			cc.w = MulHigh(cc.w, stri);
349 
350 		utri = ~utri;
351 		stri = Short4(0x7FFF) - stri;
352 
353 		if(hasUnsignedTextureComponent(0))
354 			c.x = MulHigh(As<UShort4>(c.x), utri);
355 		else
356 			c.x = MulHigh(c.x, stri);
357 		if(hasUnsignedTextureComponent(1))
358 			c.y = MulHigh(As<UShort4>(c.y), utri);
359 		else
360 			c.y = MulHigh(c.y, stri);
361 		if(hasUnsignedTextureComponent(2))
362 			c.z = MulHigh(As<UShort4>(c.z), utri);
363 		else
364 			c.z = MulHigh(c.z, stri);
365 		if(hasUnsignedTextureComponent(3))
366 			c.w = MulHigh(As<UShort4>(c.w), utri);
367 		else
368 			c.w = MulHigh(c.w, stri);
369 
370 		c.x += cc.x;
371 		c.y += cc.y;
372 		c.z += cc.z;
373 		c.w += cc.w;
374 
375 		if(!hasUnsignedTextureComponent(0)) c.x += c.x;
376 		if(!hasUnsignedTextureComponent(1)) c.y += c.y;
377 		if(!hasUnsignedTextureComponent(2)) c.z += c.z;
378 		if(!hasUnsignedTextureComponent(3)) c.w += c.w;
379 	}
380 
381 	return c;
382 }
383 
sampleAniso(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,Float & anisotropy,Float4 & uDelta,Float4 & vDelta,bool secondLOD)384 Vector4s SamplerCore::sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD)
385 {
386 	Vector4s c;
387 
388 	if(state.textureFilter != FILTER_ANISOTROPIC)
389 	{
390 		c = sampleQuad(texture, u, v, w, a, offset, sample, lod, secondLOD);
391 	}
392 	else
393 	{
394 		Int N = RoundInt(anisotropy);
395 
396 		Vector4s cSum;
397 
398 		cSum.x = Short4(0);
399 		cSum.y = Short4(0);
400 		cSum.z = Short4(0);
401 		cSum.w = Short4(0);
402 
403 		Float4 A = *Pointer<Float4>(constants + OFFSET(Constants, uvWeight) + 16 * N);
404 		Float4 B = *Pointer<Float4>(constants + OFFSET(Constants, uvStart) + 16 * N);
405 		UShort4 cw = *Pointer<UShort4>(constants + OFFSET(Constants, cWeight) + 8 * N);
406 		Short4 sw = Short4(cw >> 1);
407 
408 		Float4 du = uDelta;
409 		Float4 dv = vDelta;
410 
411 		Float4 u0 = u + B * du;
412 		Float4 v0 = v + B * dv;
413 
414 		du *= A;
415 		dv *= A;
416 
417 		Int i = 0;
418 
419 		Do
420 		{
421 			c = sampleQuad(texture, u0, v0, w, a, offset, sample, lod, secondLOD);
422 
423 			u0 += du;
424 			v0 += dv;
425 
426 			if(hasUnsignedTextureComponent(0))
427 				cSum.x += As<Short4>(MulHigh(As<UShort4>(c.x), cw));
428 			else
429 				cSum.x += MulHigh(c.x, sw);
430 			if(hasUnsignedTextureComponent(1))
431 				cSum.y += As<Short4>(MulHigh(As<UShort4>(c.y), cw));
432 			else
433 				cSum.y += MulHigh(c.y, sw);
434 			if(hasUnsignedTextureComponent(2))
435 				cSum.z += As<Short4>(MulHigh(As<UShort4>(c.z), cw));
436 			else
437 				cSum.z += MulHigh(c.z, sw);
438 			if(hasUnsignedTextureComponent(3))
439 				cSum.w += As<Short4>(MulHigh(As<UShort4>(c.w), cw));
440 			else
441 				cSum.w += MulHigh(c.w, sw);
442 
443 			i++;
444 		}
445 		Until(i >= N);
446 
447 		if(hasUnsignedTextureComponent(0))
448 			c.x = cSum.x;
449 		else
450 			c.x = AddSat(cSum.x, cSum.x);
451 		if(hasUnsignedTextureComponent(1))
452 			c.y = cSum.y;
453 		else
454 			c.y = AddSat(cSum.y, cSum.y);
455 		if(hasUnsignedTextureComponent(2))
456 			c.z = cSum.z;
457 		else
458 			c.z = AddSat(cSum.z, cSum.z);
459 		if(hasUnsignedTextureComponent(3))
460 			c.w = cSum.w;
461 		else
462 			c.w = AddSat(cSum.w, cSum.w);
463 	}
464 
465 	return c;
466 }
467 
sampleQuad(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD)468 Vector4s SamplerCore::sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
469 {
470 	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
471 	{
472 		return sampleQuad2D(texture, u, v, w, a, offset, sample, lod, secondLOD);
473 	}
474 	else
475 	{
476 		return sample3D(texture, u, v, w, offset, sample, lod, secondLOD);
477 	}
478 }
479 
bilinearInterpolateFloat(Vector4f & output,const Short4 & uuuu0,const Short4 & vvvv0,Vector4f & c00,Vector4f & c01,Vector4f & c10,Vector4f & c11,const Pointer<Byte> & mipmap,bool interpolateComponent0,bool interpolateComponent1,bool interpolateComponent2,bool interpolateComponent3)480 void SamplerCore::bilinearInterpolateFloat(Vector4f &output, const Short4 &uuuu0, const Short4 &vvvv0, Vector4f &c00, Vector4f &c01, Vector4f &c10, Vector4f &c11, const Pointer<Byte> &mipmap, bool interpolateComponent0, bool interpolateComponent1, bool interpolateComponent2, bool interpolateComponent3)
481 {
482 	int componentCount = textureComponentCount();
483 
484 	Float4 unnormalizedUUUU0 = (Float4(uuuu0) / Float4(1 << 16)) * Float4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width)));
485 	Float4 unnormalizedVVVV0 = (Float4(vvvv0) / Float4(1 << 16)) * Float4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height)));
486 
487 	Float4 frac0u = Frac(unnormalizedUUUU0);
488 	Float4 frac0v = Frac(unnormalizedVVVV0);
489 
490 	if(interpolateComponent0 && componentCount >= 1)
491 	{
492 		c00.x = Mix(c00.x, c10.x, frac0u);
493 		c01.x = Mix(c01.x, c11.x, frac0u);
494 		output.x = Mix(c00.x, c01.x, frac0v);
495 	}
496 	if(interpolateComponent1 && componentCount >= 2)
497 	{
498 		c00.y = Mix(c00.y, c10.y, frac0u);
499 		c01.y = Mix(c01.y, c11.y, frac0u);
500 		output.y = Mix(c00.y, c01.y, frac0v);
501 	}
502 	if(interpolateComponent2 && componentCount >= 3)
503 	{
504 		c00.z = Mix(c00.z, c10.z, frac0u);
505 		c01.z = Mix(c01.z, c11.z, frac0u);
506 		output.z = Mix(c00.z, c01.z, frac0v);
507 	}
508 	if(interpolateComponent3 && componentCount >= 4)
509 	{
510 		c00.w = Mix(c00.w, c10.w, frac0u);
511 		c01.w = Mix(c01.w, c11.w, frac0u);
512 		output.w = Mix(c00.w, c01.w, frac0v);
513 	}
514 }
515 
bilinearInterpolate(Vector4s & output,const Short4 & uuuu0,const Short4 & vvvv0,Vector4s & c00,Vector4s & c01,Vector4s & c10,Vector4s & c11,const Pointer<Byte> & mipmap)516 void SamplerCore::bilinearInterpolate(Vector4s &output, const Short4 &uuuu0, const Short4 &vvvv0, Vector4s &c00, Vector4s &c01, Vector4s &c10, Vector4s &c11, const Pointer<Byte> &mipmap)
517 {
518 	int componentCount = textureComponentCount();
519 
520 	// Fractions
521 	UShort4 f0u = As<UShort4>(uuuu0) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width)));
522 	UShort4 f0v = As<UShort4>(vvvv0) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height)));
523 
524 	UShort4 f1u = ~f0u;
525 	UShort4 f1v = ~f0v;
526 
527 	UShort4 f0u0v = MulHigh(f0u, f0v);
528 	UShort4 f1u0v = MulHigh(f1u, f0v);
529 	UShort4 f0u1v = MulHigh(f0u, f1v);
530 	UShort4 f1u1v = MulHigh(f1u, f1v);
531 
532 	// Signed fractions
533 	Short4 f1u1vs;
534 	Short4 f0u1vs;
535 	Short4 f1u0vs;
536 	Short4 f0u0vs;
537 
538 	if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
539 	{
540 		f1u1vs = f1u1v >> 1;
541 		f0u1vs = f0u1v >> 1;
542 		f1u0vs = f1u0v >> 1;
543 		f0u0vs = f0u0v >> 1;
544 	}
545 
546 	// Bilinear interpolation
547 	if(componentCount >= 1)
548 	{
549 		if(has16bitTextureComponents() && hasUnsignedTextureComponent(0))
550 		{
551 			c00.x = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0u) + MulHigh(As<UShort4>(c10.x), f0u);
552 			c01.x = As<UShort4>(c01.x) - MulHigh(As<UShort4>(c01.x), f0u) + MulHigh(As<UShort4>(c11.x), f0u);
553 			output.x = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0v) + MulHigh(As<UShort4>(c01.x), f0v);
554 		}
555 		else
556 		{
557 			if(hasUnsignedTextureComponent(0))
558 			{
559 				c00.x = MulHigh(As<UShort4>(c00.x), f1u1v);
560 				c10.x = MulHigh(As<UShort4>(c10.x), f0u1v);
561 				c01.x = MulHigh(As<UShort4>(c01.x), f1u0v);
562 				c11.x = MulHigh(As<UShort4>(c11.x), f0u0v);
563 			}
564 			else
565 			{
566 				c00.x = MulHigh(c00.x, f1u1vs);
567 				c10.x = MulHigh(c10.x, f0u1vs);
568 				c01.x = MulHigh(c01.x, f1u0vs);
569 				c11.x = MulHigh(c11.x, f0u0vs);
570 			}
571 
572 			output.x = (c00.x + c10.x) + (c01.x + c11.x);
573 			if(!hasUnsignedTextureComponent(0)) output.x = AddSat(output.x, output.x);  // Correct for signed fractions
574 		}
575 	}
576 
577 	if(componentCount >= 2)
578 	{
579 		if(has16bitTextureComponents() && hasUnsignedTextureComponent(1))
580 		{
581 			c00.y = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0u) + MulHigh(As<UShort4>(c10.y), f0u);
582 			c01.y = As<UShort4>(c01.y) - MulHigh(As<UShort4>(c01.y), f0u) + MulHigh(As<UShort4>(c11.y), f0u);
583 			output.y = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0v) + MulHigh(As<UShort4>(c01.y), f0v);
584 		}
585 		else
586 		{
587 			if(hasUnsignedTextureComponent(1))
588 			{
589 				c00.y = MulHigh(As<UShort4>(c00.y), f1u1v);
590 				c10.y = MulHigh(As<UShort4>(c10.y), f0u1v);
591 				c01.y = MulHigh(As<UShort4>(c01.y), f1u0v);
592 				c11.y = MulHigh(As<UShort4>(c11.y), f0u0v);
593 			}
594 			else
595 			{
596 				c00.y = MulHigh(c00.y, f1u1vs);
597 				c10.y = MulHigh(c10.y, f0u1vs);
598 				c01.y = MulHigh(c01.y, f1u0vs);
599 				c11.y = MulHigh(c11.y, f0u0vs);
600 			}
601 
602 			output.y = (c00.y + c10.y) + (c01.y + c11.y);
603 			if(!hasUnsignedTextureComponent(1)) output.y = AddSat(output.y, output.y);  // Correct for signed fractions
604 		}
605 	}
606 
607 	if(componentCount >= 3)
608 	{
609 		if(has16bitTextureComponents() && hasUnsignedTextureComponent(2))
610 		{
611 			c00.z = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0u) + MulHigh(As<UShort4>(c10.z), f0u);
612 			c01.z = As<UShort4>(c01.z) - MulHigh(As<UShort4>(c01.z), f0u) + MulHigh(As<UShort4>(c11.z), f0u);
613 			output.z = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0v) + MulHigh(As<UShort4>(c01.z), f0v);
614 		}
615 		else
616 		{
617 			if(hasUnsignedTextureComponent(2))
618 			{
619 				c00.z = MulHigh(As<UShort4>(c00.z), f1u1v);
620 				c10.z = MulHigh(As<UShort4>(c10.z), f0u1v);
621 				c01.z = MulHigh(As<UShort4>(c01.z), f1u0v);
622 				c11.z = MulHigh(As<UShort4>(c11.z), f0u0v);
623 			}
624 			else
625 			{
626 				c00.z = MulHigh(c00.z, f1u1vs);
627 				c10.z = MulHigh(c10.z, f0u1vs);
628 				c01.z = MulHigh(c01.z, f1u0vs);
629 				c11.z = MulHigh(c11.z, f0u0vs);
630 			}
631 
632 			output.z = (c00.z + c10.z) + (c01.z + c11.z);
633 			if(!hasUnsignedTextureComponent(2)) output.z = AddSat(output.z, output.z);  // Correct for signed fractions
634 		}
635 	}
636 
637 	if(componentCount >= 4)
638 	{
639 		if(has16bitTextureComponents() && hasUnsignedTextureComponent(3))
640 		{
641 			c00.w = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0u) + MulHigh(As<UShort4>(c10.w), f0u);
642 			c01.w = As<UShort4>(c01.w) - MulHigh(As<UShort4>(c01.w), f0u) + MulHigh(As<UShort4>(c11.w), f0u);
643 			output.w = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0v) + MulHigh(As<UShort4>(c01.w), f0v);
644 		}
645 		else
646 		{
647 			if(hasUnsignedTextureComponent(3))
648 			{
649 				c00.w = MulHigh(As<UShort4>(c00.w), f1u1v);
650 				c10.w = MulHigh(As<UShort4>(c10.w), f0u1v);
651 				c01.w = MulHigh(As<UShort4>(c01.w), f1u0v);
652 				c11.w = MulHigh(As<UShort4>(c11.w), f0u0v);
653 			}
654 			else
655 			{
656 				c00.w = MulHigh(c00.w, f1u1vs);
657 				c10.w = MulHigh(c10.w, f0u1vs);
658 				c01.w = MulHigh(c01.w, f1u0vs);
659 				c11.w = MulHigh(c11.w, f0u0vs);
660 			}
661 
662 			output.w = (c00.w + c10.w) + (c01.w + c11.w);
663 			if(!hasUnsignedTextureComponent(3)) output.w = AddSat(output.w, output.w);  // Correct for signed fractions
664 		}
665 	}
666 }
667 
sampleQuad2D(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD)668 Vector4s SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
669 {
670 	Vector4s c;
671 
672 	bool gather = (state.textureFilter == FILTER_GATHER);
673 
674 	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
675 	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));
676 
677 	applyOffset(u, v, w, offset, mipmap);
678 
679 	Short4 uuuu = address(u, state.addressingModeU);
680 	Short4 vvvv = address(v, state.addressingModeV);
681 	Short4 wwww = address(w, state.addressingModeW);
682 	Short4 layerIndex = computeLayerIndex16(a, mipmap);
683 
684 	if(isYcbcrFormat())
685 	{
686 		uint8_t lumaBits = 8;
687 		uint8_t chromaBits = 8;
688 		switch(state.textureFormat)
689 		{
690 		case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
691 		case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
692 			lumaBits = 8;
693 			chromaBits = 8;
694 			break;
695 		case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
696 			lumaBits = 10;
697 			chromaBits = 10;
698 			break;
699 		default:
700 			UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
701 			break;
702 		}
703 
704 		// TODO: investigate apparent precision losses in dEQP-VK.ycbcr when sampling and interpolating with Short4.
705 
706 		// Unnnormalized YUV values in [0, 255] for 8-bit formats, [0, 1023] for 10-bit formats.
707 		Vector4f yuv;
708 		Vector4f yuv00;
709 		Vector4f yuv10;
710 		Vector4f yuv01;
711 		Vector4f yuv11;
712 
713 		if(state.textureFilter == FILTER_POINT)
714 		{
715 			sampleLumaTexel(yuv, uuuu, vvvv, wwww, layerIndex, sample, mipmap, buffer);
716 		}
717 		else
718 		{
719 			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
720 			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
721 			Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
722 			Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);
723 
724 			sampleLumaTexel(yuv00, uuuu0, vvvv0, wwww, layerIndex, sample, mipmap, buffer);
725 			sampleLumaTexel(yuv01, uuuu0, vvvv1, wwww, layerIndex, sample, mipmap, buffer);
726 			sampleLumaTexel(yuv10, uuuu1, vvvv0, wwww, layerIndex, sample, mipmap, buffer);
727 			sampleLumaTexel(yuv11, uuuu1, vvvv1, wwww, layerIndex, sample, mipmap, buffer);
728 
729 			bilinearInterpolateFloat(yuv, uuuu0, vvvv0, yuv00, yuv01, yuv10, yuv11, mipmap, false, true, false, false);
730 		}
731 
732 		// Pointers to the planes of YCbCr images are stored in consecutive mipmap levels.
733 		Pointer<Byte> mipmapU = Pointer<Byte>(mipmap + 1 * sizeof(Mipmap));
734 		Pointer<Byte> mipmapV = Pointer<Byte>(mipmap + 2 * sizeof(Mipmap));
735 		Pointer<Byte> bufferU = *Pointer<Pointer<Byte>>(mipmapU + OFFSET(Mipmap, buffer));  // U/V for 2-plane interleaved formats.
736 		Pointer<Byte> bufferV = *Pointer<Pointer<Byte>>(mipmapV + OFFSET(Mipmap, buffer));
737 
738 		// https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#textures-implict-reconstruction
739 		// but using normalized coordinates.
740 		Float4 chromaU = u;
741 		Float4 chromaV = v;
742 		if(state.chromaXOffset == VK_CHROMA_LOCATION_COSITED_EVEN)
743 		{
744 			chromaU += (Float4(0.25f) / Float4(*Pointer<UInt4>(mipmapU + OFFSET(Mipmap, width))));
745 		}
746 		if(state.chromaYOffset == VK_CHROMA_LOCATION_COSITED_EVEN)
747 		{
748 			chromaV += (Float4(0.25f) / Float4(*Pointer<UInt4>(mipmapU + OFFSET(Mipmap, height))));
749 		}
750 
751 		Short4 chromaUUUU = address(chromaU, state.addressingModeU);
752 		Short4 chromaVVVV = address(chromaV, state.addressingModeV);
753 
754 		if(state.chromaFilter == FILTER_POINT)
755 		{
756 			sampleChromaTexel(yuv, chromaUUUU, chromaVVVV, wwww, layerIndex, sample, mipmapU, bufferU, mipmapV, bufferV);
757 		}
758 		else
759 		{
760 			Short4 chromaUUUU0 = offsetSample(chromaUUUU, mipmapU, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
761 			Short4 chromaVVVV0 = offsetSample(chromaVVVV, mipmapU, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
762 			Short4 chromaUUUU1 = offsetSample(chromaUUUU, mipmapU, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
763 			Short4 chromaVVVV1 = offsetSample(chromaVVVV, mipmapU, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);
764 
765 			sampleChromaTexel(yuv00, chromaUUUU0, chromaVVVV0, wwww, layerIndex, sample, mipmapU, bufferU, mipmapV, bufferV);
766 			sampleChromaTexel(yuv01, chromaUUUU0, chromaVVVV1, wwww, layerIndex, sample, mipmapU, bufferU, mipmapV, bufferV);
767 			sampleChromaTexel(yuv10, chromaUUUU1, chromaVVVV0, wwww, layerIndex, sample, mipmapU, bufferU, mipmapV, bufferV);
768 			sampleChromaTexel(yuv11, chromaUUUU1, chromaVVVV1, wwww, layerIndex, sample, mipmapU, bufferU, mipmapV, bufferV);
769 
770 			bilinearInterpolateFloat(yuv, chromaUUUU0, chromaVVVV0, yuv00, yuv01, yuv10, yuv11, mipmapU, true, false, true, false);
771 		}
772 
773 		if(state.swappedChroma)
774 		{
775 			std::swap(yuv.x, yuv.z);
776 		}
777 
778 		if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
779 		{
780 			// Scale to the output 15-bit.
781 			c.x = UShort4(yuv.x) << (15 - chromaBits);
782 			c.y = UShort4(yuv.y) << (15 - lumaBits);
783 			c.z = UShort4(yuv.z) << (15 - chromaBits);
784 		}
785 		else
786 		{
787 			const float twoPowLumaBits = static_cast<float>(0x1u << lumaBits);
788 			const float twoPowLumaBitsMinus8 = static_cast<float>(0x1u << (lumaBits - 8));
789 			const float twoPowChromaBits = static_cast<float>(0x1u << chromaBits);
790 			const float twoPowChromaBitsMinus1 = static_cast<float>(0x1u << (chromaBits - 1));
791 			const float twoPowChromaBitsMinus8 = static_cast<float>(0x1u << (chromaBits - 8));
792 
793 			Float4 y = Float4(yuv.y);
794 			Float4 u = Float4(yuv.z);
795 			Float4 v = Float4(yuv.x);
796 
797 			if(state.studioSwing)
798 			{
799 				// See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#QUANTIZATION_NARROW
800 				y = ((y / Float4(twoPowLumaBitsMinus8)) - Float4(16.0f)) / Float4(219.0f);
801 				u = ((u / Float4(twoPowChromaBitsMinus8)) - Float4(128.0f)) / Float4(224.0f);
802 				v = ((v / Float4(twoPowChromaBitsMinus8)) - Float4(128.0f)) / Float4(224.0f);
803 			}
804 			else
805 			{
806 				// See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#QUANTIZATION_FULL
807 				y = y / Float4(twoPowLumaBits - 1.0f);
808 				u = (u - Float4(twoPowChromaBitsMinus1)) / Float4(twoPowChromaBits - 1.0f);
809 				v = (v - Float4(twoPowChromaBitsMinus1)) / Float4(twoPowChromaBits - 1.0f);
810 			}
811 
812 			// Now, `y` is in [0, 1] and `u` and `v` are in [-0.5, 0.5].
813 
814 			if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY)
815 			{
816 				c.x = Short4(v * static_cast<float>(0x7FFF));
817 				c.y = Short4(y * static_cast<float>(0x7FFF));
818 				c.z = Short4(u * static_cast<float>(0x7FFF));
819 			}
820 			else
821 			{
822 				// Generic YCbCr to RGB transformation:
823 				// R = Y                               +           2 * (1 - Kr) * Cr
824 				// G = Y - 2 * Kb * (1 - Kb) / Kg * Cb - 2 * Kr * (1 - Kr) / Kg * Cr
825 				// B = Y +           2 * (1 - Kb) * Cb
826 
827 				float Kb = 0.114f;
828 				float Kr = 0.299f;
829 
830 				switch(state.ycbcrModel)
831 				{
832 				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709:
833 					Kb = 0.0722f;
834 					Kr = 0.2126f;
835 					break;
836 				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601:
837 					Kb = 0.114f;
838 					Kr = 0.299f;
839 					break;
840 				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020:
841 					Kb = 0.0593f;
842 					Kr = 0.2627f;
843 					break;
844 				default:
845 					UNSUPPORTED("ycbcrModel %d", int(state.ycbcrModel));
846 				}
847 
848 				const float Kg = 1.0f - Kr - Kb;
849 
850 				const float Rr = 2 * (1 - Kr);
851 				const float Gb = -2 * Kb * (1 - Kb) / Kg;
852 				const float Gr = -2 * Kr * (1 - Kr) / Kg;
853 				const float Bb = 2 * (1 - Kb);
854 
855 				Float4 r = y + Float4(Rr) * v;
856 				Float4 g = y + Float4(Gb) * u + Float4(Gr) * v;
857 				Float4 b = y + Float4(Bb) * u;
858 
859 				c.x = Short4(r * static_cast<float>(0x7FFF));
860 				c.y = Short4(g * static_cast<float>(0x7FFF));
861 				c.z = Short4(b * static_cast<float>(0x7FFF));
862 			}
863 		}
864 	}
865 	else  // !isYcbcrFormat()
866 	{
867 		if(state.textureFilter == FILTER_POINT)
868 		{
869 			c = sampleTexel(uuuu, vvvv, wwww, layerIndex, sample, mipmap, buffer);
870 		}
871 		else
872 		{
873 			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
874 			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
875 			Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
876 			Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);
877 
878 			Vector4s c00 = sampleTexel(uuuu0, vvvv0, wwww, layerIndex, sample, mipmap, buffer);
879 			Vector4s c10 = sampleTexel(uuuu1, vvvv0, wwww, layerIndex, sample, mipmap, buffer);
880 			Vector4s c01 = sampleTexel(uuuu0, vvvv1, wwww, layerIndex, sample, mipmap, buffer);
881 			Vector4s c11 = sampleTexel(uuuu1, vvvv1, wwww, layerIndex, sample, mipmap, buffer);
882 
883 			if(!gather)  // Blend
884 			{
885 				bilinearInterpolate(c, uuuu0, vvvv0, c00, c01, c10, c11, mipmap);
886 			}
887 			else
888 			{
889 				VkComponentSwizzle swizzle = gatherSwizzle();
890 				switch(swizzle)
891 				{
892 				case VK_COMPONENT_SWIZZLE_ZERO:
893 				case VK_COMPONENT_SWIZZLE_ONE:
894 					// Handled at the final component swizzle.
895 					break;
896 				default:
897 					c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
898 					c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
899 					c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
900 					c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
901 					break;
902 				}
903 			}
904 		}
905 	}
906 
907 	return c;
908 }
909 
// Samples a 3D texture with point or trilinear filtering, using 16-bit
// fixed-point coordinates, and returns 16-bit fixed-point components.
// `secondLOD` selects the second of the two mipmap levels blended by
// MIPMAP_LINEAR filtering (see the callers' mipmap interpolation).
Vector4s SamplerCore::sample3D(Pointer<Byte> &texture, Float4 &u_, Float4 &v_, Float4 &w_, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	Vector4s c_;

	int componentCount = textureComponentCount();

	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));

	applyOffset(u_, v_, w_, offset, mipmap);

	// Convert the floating-point coordinates to addressed 16-bit fixed-point.
	Short4 uuuu = address(u_, state.addressingModeU);
	Short4 vvvv = address(v_, state.addressingModeV);
	Short4 wwww = address(w_, state.addressingModeW);

	if(state.textureFilter == FILTER_POINT)
	{
		c_ = sampleTexel(uuuu, vvvv, wwww, 0, sample, mipmap, buffer);
	}
	else
	{
		Vector4s c[2][2][2];

		Short4 u[2][2][2];
		Short4 v[2][2][2];
		Short4 s[2][2][2];

		// The eight corners of the trilinear footprint: each axis is offset
		// by half a texel in both directions (index 0 -> -1, index 1 -> +1).
		for(int i = 0; i < 2; i++)
		{
			for(int j = 0; j < 2; j++)
			{
				for(int k = 0; k < 2; k++)
				{
					u[i][j][k] = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, i * 2 - 1, lod);
					v[i][j][k] = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, j * 2 - 1, lod);
					s[i][j][k] = offsetSample(wwww, mipmap, OFFSET(Mipmap, wHalf), state.addressingModeW == ADDRESSING_WRAP, k * 2 - 1, lod);
				}
			}
		}

		// Fractions
		// Multiplying the 0.16 fixed-point coordinate by the level dimension
		// (low 16 bits of the product) leaves the sub-texel fraction.
		UShort4 f0u = As<UShort4>(u[0][0][0]) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width)));
		UShort4 f0v = As<UShort4>(v[0][0][0]) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height)));
		UShort4 f0s = As<UShort4>(s[0][0][0]) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, depth)));

		// One's complement as a cheap approximation of (1 - f).
		UShort4 f1u = ~f0u;
		UShort4 f1v = ~f0v;
		UShort4 f1s = ~f0s;

		UShort4 f[2][2][2];
		Short4 fs[2][2][2];

		// Per-corner weights as products of the per-axis fractions. The u*v
		// products are independent of the k index; the third-axis fraction is
		// folded in below. f[i][j][k] is applied to the *opposite* corner,
		// i.e. c[i][j][k] gets weight f[1 - i][1 - j][1 - k].
		f[1][1][1] = MulHigh(f1u, f1v);
		f[0][1][1] = MulHigh(f0u, f1v);
		f[1][0][1] = MulHigh(f1u, f0v);
		f[0][0][1] = MulHigh(f0u, f0v);
		f[1][1][0] = MulHigh(f1u, f1v);
		f[0][1][0] = MulHigh(f0u, f1v);
		f[1][0][0] = MulHigh(f1u, f0v);
		f[0][0][0] = MulHigh(f0u, f0v);

		f[1][1][1] = MulHigh(f[1][1][1], f1s);
		f[0][1][1] = MulHigh(f[0][1][1], f1s);
		f[1][0][1] = MulHigh(f[1][0][1], f1s);
		f[0][0][1] = MulHigh(f[0][0][1], f1s);
		f[1][1][0] = MulHigh(f[1][1][0], f0s);
		f[0][1][0] = MulHigh(f[0][1][0], f0s);
		f[1][0][0] = MulHigh(f[1][0][0], f0s);
		f[0][0][0] = MulHigh(f[0][0][0], f0s);

		// Signed fractions
		// Signed components use a signed MulHigh, so halve the weights to keep
		// them in the positive signed 16-bit range; compensated at the end by
		// the saturating doubling (AddSat) below.
		if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
		{
			fs[0][0][0] = f[0][0][0] >> 1;
			fs[0][0][1] = f[0][0][1] >> 1;
			fs[0][1][0] = f[0][1][0] >> 1;
			fs[0][1][1] = f[0][1][1] >> 1;
			fs[1][0][0] = f[1][0][0] >> 1;
			fs[1][0][1] = f[1][0][1] >> 1;
			fs[1][1][0] = f[1][1][0] >> 1;
			fs[1][1][1] = f[1][1][1] >> 1;
		}

		// Fetch each corner, weight it, and accumulate into c[0][0][0].
		for(int i = 0; i < 2; i++)
		{
			for(int j = 0; j < 2; j++)
			{
				for(int k = 0; k < 2; k++)
				{
					c[i][j][k] = sampleTexel(u[i][j][k], v[i][j][k], s[i][j][k], 0, sample, mipmap, buffer);

					if(componentCount >= 1)
					{
						if(hasUnsignedTextureComponent(0))
							c[i][j][k].x = MulHigh(As<UShort4>(c[i][j][k].x), f[1 - i][1 - j][1 - k]);
						else
							c[i][j][k].x = MulHigh(c[i][j][k].x, fs[1 - i][1 - j][1 - k]);
					}
					if(componentCount >= 2)
					{
						if(hasUnsignedTextureComponent(1))
							c[i][j][k].y = MulHigh(As<UShort4>(c[i][j][k].y), f[1 - i][1 - j][1 - k]);
						else
							c[i][j][k].y = MulHigh(c[i][j][k].y, fs[1 - i][1 - j][1 - k]);
					}
					if(componentCount >= 3)
					{
						if(hasUnsignedTextureComponent(2))
							c[i][j][k].z = MulHigh(As<UShort4>(c[i][j][k].z), f[1 - i][1 - j][1 - k]);
						else
							c[i][j][k].z = MulHigh(c[i][j][k].z, fs[1 - i][1 - j][1 - k]);
					}
					if(componentCount >= 4)
					{
						if(hasUnsignedTextureComponent(3))
							c[i][j][k].w = MulHigh(As<UShort4>(c[i][j][k].w), f[1 - i][1 - j][1 - k]);
						else
							c[i][j][k].w = MulHigh(c[i][j][k].w, fs[1 - i][1 - j][1 - k]);
					}

					if(i != 0 || j != 0 || k != 0)
					{
						if(componentCount >= 1) c[0][0][0].x += c[i][j][k].x;
						if(componentCount >= 2) c[0][0][0].y += c[i][j][k].y;
						if(componentCount >= 3) c[0][0][0].z += c[i][j][k].z;
						if(componentCount >= 4) c[0][0][0].w += c[i][j][k].w;
					}
				}
			}
		}

		if(componentCount >= 1) c_.x = c[0][0][0].x;
		if(componentCount >= 2) c_.y = c[0][0][0].y;
		if(componentCount >= 3) c_.z = c[0][0][0].z;
		if(componentCount >= 4) c_.w = c[0][0][0].w;

		// Correct for signed fractions
		// Saturating doubling undoes the weight halving applied above.
		if(componentCount >= 1)
			if(!hasUnsignedTextureComponent(0)) c_.x = AddSat(c_.x, c_.x);
		if(componentCount >= 2)
			if(!hasUnsignedTextureComponent(1)) c_.y = AddSat(c_.y, c_.y);
		if(componentCount >= 3)
			if(!hasUnsignedTextureComponent(2)) c_.z = AddSat(c_.z, c_.z);
		if(componentCount >= 4)
			if(!hasUnsignedTextureComponent(3)) c_.w = AddSat(c_.w, c_.w);
	}

	return c_;
}
1059 
sampleFloatFilter(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,const Float4 & dRef,Vector4i & offset,const Int4 & sample,Float & lod,Float & anisotropy,Float4 & uDelta,Float4 & vDelta)1060 Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta)
1061 {
1062 	Vector4f c = sampleFloatAniso(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, false);
1063 
1064 	if(function == Fetch)
1065 	{
1066 		return c;
1067 	}
1068 
1069 	if(state.mipmapFilter == MIPMAP_LINEAR)
1070 	{
1071 		Vector4f cc = sampleFloatAniso(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, true);
1072 
1073 		Float4 lod4 = Float4(Frac(lod));
1074 
1075 		c.x = (cc.x - c.x) * lod4 + c.x;
1076 		c.y = (cc.y - c.y) * lod4 + c.y;
1077 		c.z = (cc.z - c.z) * lod4 + c.z;
1078 		c.w = (cc.w - c.w) * lod4 + c.w;
1079 	}
1080 
1081 	return c;
1082 }
1083 
// Applies anisotropic filtering by averaging N samples taken along the major
// axis of the pixel footprint (uDelta, vDelta). For all other filter modes
// this reduces to a single sampleFloat() call.
Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD)
{
	Vector4f c;

	if(state.textureFilter != FILTER_ANISOTROPIC)
	{
		c = sampleFloat(texture, u, v, w, a, dRef, offset, sample, lod, secondLOD);
	}
	else
	{
		Int N = RoundInt(anisotropy);  // Number of samples along the anisotropy axis.

		Vector4f cSum;

		cSum.x = Float4(0.0f);
		cSum.y = Float4(0.0f);
		cSum.z = Float4(0.0f);
		cSum.w = Float4(0.0f);

		// Per-sample weight (A) and start offset (B), from constants tables
		// indexed by the rounded sample count N.
		Float4 A = *Pointer<Float4>(constants + OFFSET(Constants, uvWeight) + 16 * N);
		Float4 B = *Pointer<Float4>(constants + OFFSET(Constants, uvStart) + 16 * N);

		Float4 du = uDelta;
		Float4 dv = vDelta;

		// First sample position; subsequent samples step by A * delta.
		Float4 u0 = u + B * du;
		Float4 v0 = v + B * dv;

		du *= A;
		dv *= A;

		Int i = 0;

		Do  // Reactor loop: emitted into the generated code, executed at runtime.
		{
			c = sampleFloat(texture, u0, v0, w, a, dRef, offset, sample, lod, secondLOD);

			u0 += du;
			v0 += dv;

			// Accumulate equally weighted samples; the weights sum to 1.
			cSum.x += c.x * A;
			cSum.y += c.y * A;
			cSum.z += c.z * A;
			cSum.w += c.w * A;

			i++;
		}
		Until(i >= N);

		c.x = cSum.x;
		c.y = cSum.y;
		c.z = cSum.z;
		c.w = cSum.w;
	}

	return c;
}
1141 
sampleFloat(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,const Float4 & dRef,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD)1142 Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
1143 {
1144 	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
1145 	{
1146 		return sampleFloat2D(texture, u, v, w, a, dRef, offset, sample, lod, secondLOD);
1147 	}
1148 	else
1149 	{
1150 		return sampleFloat3D(texture, u, v, w, dRef, offset, sample, lod, secondLOD);
1151 	}
1152 }
1153 
// Samples a 1D/2D/cube/array texture returning floating-point components.
// Handles point sampling, texel fetch, bilinear filtering, and gather.
// `a` is the array coordinate; `w` carries the cube face for cube views.
Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	Vector4f c;

	int componentCount = textureComponentCount();
	bool gather = (state.textureFilter == FILTER_GATHER);

	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));

	applyOffset(u, v, w, offset, mipmap);

	// address() yields the two neighboring texel coordinates per axis
	// (x0/x1, y0/y1) and the interpolation fractions (fu, fv).
	Int4 x0, x1, y0, y1;
	Float4 fu, fv;
	Int4 filter = computeFilterOffset(lod);
	address(u, x0, x1, fu, mipmap, filter, OFFSET(Mipmap, width), state.addressingModeU);
	address(v, y0, y1, fv, mipmap, filter, OFFSET(Mipmap, height), state.addressingModeV);

	// Pre-multiply row indices by the row pitch so they can be added
	// directly into the linear texel index.
	Int4 pitchP = As<Int4>(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, pitchP), 16));
	y0 *= pitchP;

	Int4 z;
	if(state.isCube() || state.isArrayed())
	{
		Int4 face = As<Int4>(w);
		Int4 layerIndex = computeLayerIndex(a, mipmap);

		// For cube maps, the layer argument is per cube, each of which has 6 layers
		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			layerIndex *= Int4(6);
		}

		z = state.isCube() ? face : layerIndex;

		// Cube arrays combine both: z = face + 6 * layer.
		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			z += layerIndex;
		}

		// Scale the slice index by the slice pitch, like y0 above.
		z *= *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
	}

	if(state.textureFilter == FILTER_POINT || (function == Fetch))
	{
		c = sampleTexel(x0, y0, z, dRef, sample, mipmap, buffer);
	}
	else
	{
		y1 *= pitchP;

		Vector4f c00 = sampleTexel(x0, y0, z, dRef, sample, mipmap, buffer);
		Vector4f c10 = sampleTexel(x1, y0, z, dRef, sample, mipmap, buffer);
		Vector4f c01 = sampleTexel(x0, y1, z, dRef, sample, mipmap, buffer);
		Vector4f c11 = sampleTexel(x1, y1, z, dRef, sample, mipmap, buffer);

		if(!gather)  // Blend
		{
			// Bilinear interpolation: lerp along u for both rows, then along v.
			if(componentCount >= 1) c00.x = c00.x + fu * (c10.x - c00.x);
			if(componentCount >= 2) c00.y = c00.y + fu * (c10.y - c00.y);
			if(componentCount >= 3) c00.z = c00.z + fu * (c10.z - c00.z);
			if(componentCount >= 4) c00.w = c00.w + fu * (c10.w - c00.w);

			if(componentCount >= 1) c01.x = c01.x + fu * (c11.x - c01.x);
			if(componentCount >= 2) c01.y = c01.y + fu * (c11.y - c01.y);
			if(componentCount >= 3) c01.z = c01.z + fu * (c11.z - c01.z);
			if(componentCount >= 4) c01.w = c01.w + fu * (c11.w - c01.w);

			if(componentCount >= 1) c.x = c00.x + fv * (c01.x - c00.x);
			if(componentCount >= 2) c.y = c00.y + fv * (c01.y - c00.y);
			if(componentCount >= 3) c.z = c00.z + fv * (c01.z - c00.z);
			if(componentCount >= 4) c.w = c00.w + fv * (c01.w - c00.w);
		}
		else  // Gather
		{
			// Gather returns one component of all four texels, in the
			// (c01, c11, c10, c00) order -> (x, y, z, w).
			VkComponentSwizzle swizzle = gatherSwizzle();
			switch(swizzle)
			{
			case VK_COMPONENT_SWIZZLE_ZERO:
			case VK_COMPONENT_SWIZZLE_ONE:
				// Handled at the final component swizzle.
				break;
			default:
				c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
				c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
				c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
				c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
				break;
			}
		}
	}

	return c;
}
1248 
// Samples a 3D texture returning floating-point components. Handles point
// sampling, texel fetch, and trilinear filtering (two bilinear slices
// blended along w).
Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	Vector4f c;

	int componentCount = textureComponentCount();

	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));

	applyOffset(u, v, w, offset, mipmap);

	// address() yields the two neighboring texel coordinates per axis and
	// the interpolation fractions (fu, fv, fw).
	Int4 x0, x1, y0, y1, z0, z1;
	Float4 fu, fv, fw;
	Int4 filter = computeFilterOffset(lod);
	address(u, x0, x1, fu, mipmap, filter, OFFSET(Mipmap, width), state.addressingModeU);
	address(v, y0, y1, fv, mipmap, filter, OFFSET(Mipmap, height), state.addressingModeV);
	address(w, z0, z1, fw, mipmap, filter, OFFSET(Mipmap, depth), state.addressingModeW);

	// Pre-multiply row/slice indices by their pitches so they can be added
	// directly into the linear texel index.
	Int4 pitchP = As<Int4>(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, pitchP), 16));
	Int4 sliceP = As<Int4>(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP), 16));
	y0 *= pitchP;
	z0 *= sliceP;

	if(state.textureFilter == FILTER_POINT || (function == Fetch))
	{
		c = sampleTexel(x0, y0, z0, dRef, sample, mipmap, buffer);
	}
	else
	{
		y1 *= pitchP;
		z1 *= sliceP;

		// Fetch the eight corners of the trilinear footprint.
		Vector4f c000 = sampleTexel(x0, y0, z0, dRef, sample, mipmap, buffer);
		Vector4f c100 = sampleTexel(x1, y0, z0, dRef, sample, mipmap, buffer);
		Vector4f c010 = sampleTexel(x0, y1, z0, dRef, sample, mipmap, buffer);
		Vector4f c110 = sampleTexel(x1, y1, z0, dRef, sample, mipmap, buffer);
		Vector4f c001 = sampleTexel(x0, y0, z1, dRef, sample, mipmap, buffer);
		Vector4f c101 = sampleTexel(x1, y0, z1, dRef, sample, mipmap, buffer);
		Vector4f c011 = sampleTexel(x0, y1, z1, dRef, sample, mipmap, buffer);
		Vector4f c111 = sampleTexel(x1, y1, z1, dRef, sample, mipmap, buffer);

		// Blend first slice
		if(componentCount >= 1) c000.x = c000.x + fu * (c100.x - c000.x);
		if(componentCount >= 2) c000.y = c000.y + fu * (c100.y - c000.y);
		if(componentCount >= 3) c000.z = c000.z + fu * (c100.z - c000.z);
		if(componentCount >= 4) c000.w = c000.w + fu * (c100.w - c000.w);

		if(componentCount >= 1) c010.x = c010.x + fu * (c110.x - c010.x);
		if(componentCount >= 2) c010.y = c010.y + fu * (c110.y - c010.y);
		if(componentCount >= 3) c010.z = c010.z + fu * (c110.z - c010.z);
		if(componentCount >= 4) c010.w = c010.w + fu * (c110.w - c010.w);

		if(componentCount >= 1) c000.x = c000.x + fv * (c010.x - c000.x);
		if(componentCount >= 2) c000.y = c000.y + fv * (c010.y - c000.y);
		if(componentCount >= 3) c000.z = c000.z + fv * (c010.z - c000.z);
		if(componentCount >= 4) c000.w = c000.w + fv * (c010.w - c000.w);

		// Blend second slice
		if(componentCount >= 1) c001.x = c001.x + fu * (c101.x - c001.x);
		if(componentCount >= 2) c001.y = c001.y + fu * (c101.y - c001.y);
		if(componentCount >= 3) c001.z = c001.z + fu * (c101.z - c001.z);
		if(componentCount >= 4) c001.w = c001.w + fu * (c101.w - c001.w);

		if(componentCount >= 1) c011.x = c011.x + fu * (c111.x - c011.x);
		if(componentCount >= 2) c011.y = c011.y + fu * (c111.y - c011.y);
		if(componentCount >= 3) c011.z = c011.z + fu * (c111.z - c011.z);
		if(componentCount >= 4) c011.w = c011.w + fu * (c111.w - c011.w);

		if(componentCount >= 1) c001.x = c001.x + fv * (c011.x - c001.x);
		if(componentCount >= 2) c001.y = c001.y + fv * (c011.y - c001.y);
		if(componentCount >= 3) c001.z = c001.z + fv * (c011.z - c001.z);
		if(componentCount >= 4) c001.w = c001.w + fv * (c011.w - c001.w);

		// Blend slices
		if(componentCount >= 1) c.x = c000.x + fw * (c001.x - c000.x);
		if(componentCount >= 2) c.y = c000.y + fw * (c001.y - c000.y);
		if(componentCount >= 3) c.z = c000.z + fw * (c001.z - c000.z);
		if(componentCount >= 4) c.w = c000.w + fw * (c001.w - c000.w);
	}

	return c;
}
1331 
// Fast approximation of log2(sqrt(lod)), used for LOD computation where
// `lod` holds a squared length. Extracts the exponent (with the mantissa as
// a linear approximation) by reinterpreting the IEEE-754 bits as an integer.
static Float log2sqrt(Float lod)
{
	// log2(sqrt(lod))                              // Equals 0.25 * log2(lod^2).
	lod *= lod;                                     // Squaring doubles the exponent and produces an extra bit of precision.
	lod = Float(As<Int>(lod)) - Float(0x3F800000);  // Interpret as integer and subtract the exponent bias.
	lod *= As<Float>(Int(0x33000000));              // Scale by 0.25 * 2^-23 (mantissa length).

	return lod;
}
1341 
// Fast approximation of log2(lod), computed as 0.5 * log2(lod^2) using the
// same IEEE-754 bit reinterpretation trick as log2sqrt() above.
static Float log2(Float lod)
{
	lod *= lod;                                     // Squaring doubles the exponent and produces an extra bit of precision.
	lod = Float(As<Int>(lod)) - Float(0x3F800000);  // Interpret as integer and subtract the exponent bias.
	lod *= As<Float>(Int(0x33800000));              // Scale by 0.5 * 2^-23 (mantissa length).

	return lod;
}
1350 
// Computes the level of detail for 1D sampling: log2 of the texel-space
// screen derivative of u. With explicit gradients (Grad) the derivatives
// come from dsx/dsy; otherwise they are derived from the quad's lanes.
void SamplerCore::computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, const Float4 &dsx, const Float4 &dsy)
{
	Float4 dudxy;

	if(function != Grad)  // Implicit
	{
		// Lane differences within the quad give the x and y derivatives:
		// (u1 - u0, u2 - u0).
		dudxy = uuuu.yz - uuuu.xx;
	}
	else
	{
		dudxy = UnpackLow(dsx, dsy);
	}

	// Scale by texture dimensions.
	Float4 dUdxy = dudxy * *Pointer<Float4>(texture + OFFSET(Texture, widthWidthHeightHeight));

	// Note we could take the absolute value here and omit the square root below,
	// but this is more consistent with the 2D calculation and still cheap.
	Float4 dU2dxy = dUdxy * dUdxy;

	lod = Max(Float(dU2dxy.x), Float(dU2dxy.y));
	lod = log2sqrt(lod);
}
1374 
// Computes the level of detail for 2D sampling, and for anisotropic
// filtering also the anisotropy ratio and the footprint's major-axis step
// vector (uDelta, vDelta) used by sampleFloatAniso().
void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, const Float4 &dsx, const Float4 &dsy)
{
	Float4 duvdxy;

	if(function != Grad)  // Implicit
	{
		// Derivatives from the quad's lanes: (u1 - u0, u2 - u0, v1 - v0, v2 - v0).
		duvdxy = Float4(uuuu.yz, vvvv.yz) - Float4(uuuu.xx, vvvv.xx);
	}
	else
	{
		Float4 dudxy = Float4(dsx.xx, dsy.xx);
		Float4 dvdxy = Float4(dsx.yy, dsy.yy);

		duvdxy = Float4(dudxy.xz, dvdxy.xz);
	}

	// Scale by texture dimensions.
	Float4 dUVdxy = duvdxy * *Pointer<Float4>(texture + OFFSET(Texture, widthWidthHeightHeight));

	Float4 dUV2dxy = dUVdxy * dUVdxy;
	Float4 dUV2 = dUV2dxy.xy + dUV2dxy.zw;

	lod = Max(Float(dUV2.x), Float(dUV2.y));  // Square length of major axis

	if(state.textureFilter == FILTER_ANISOTROPIC)
	{
		// |cross product| of the two derivative vectors: the footprint's area.
		Float det = Abs(Float(dUVdxy.x) * Float(dUVdxy.w) - Float(dUVdxy.y) * Float(dUVdxy.z));

		Float4 dudx = duvdxy.xxxx;
		Float4 dudy = duvdxy.yyyy;
		Float4 dvdx = duvdxy.zzzz;
		Float4 dvdy = duvdxy.wwww;

		// Select the derivative along the major axis as the sampling step.
		Int4 mask = As<Int4>(CmpNLT(dUV2.x, dUV2.y));
		uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
		vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));

		// major^2 / area approximates the major/minor axis ratio.
		anisotropy = lod * Rcp(det, true /* relaxedPrecision */);
		anisotropy = Min(anisotropy, state.maxAnisotropy);

		// TODO(b/151263485): While we always need `lod` above, when there's only
		// a single mipmap level the following calculations could be skipped.
		// major^2 / ratio^2 approximates minor^2, so the LOD follows the minor axis.
		lod *= Rcp(anisotropy * anisotropy, true /* relaxedPrecision */);
	}

	lod = log2sqrt(lod);  // log2(sqrt(lod))
}
1422 
// Computes the level of detail for cube map sampling. `M` is the major-axis
// projection factor produced by cubeFace(); coordinates are scaled by it so
// derivatives are measured in face space.
void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float4 &dsx, const Float4 &dsy, Float4 &M)
{
	Float4 dudxy, dvdxy, dsdxy;

	if(function != Grad)  // Implicit
	{
		// Derivatives from differences between the quad's lanes (lane 0 is
		// the reference pixel).
		Float4 U = u * M;
		Float4 V = v * M;
		Float4 W = w * M;

		dudxy = Abs(U - U.xxxx);
		dvdxy = Abs(V - V.xxxx);
		dsdxy = Abs(W - W.xxxx);
	}
	else
	{
		dudxy = Float4(dsx.xx, dsy.xx);
		dvdxy = Float4(dsx.yy, dsy.yy);
		dsdxy = Float4(dsx.zz, dsy.zz);

		dudxy = Abs(dudxy * Float4(M.x));
		dvdxy = Abs(dvdxy * Float4(M.x));
		dsdxy = Abs(dsdxy * Float4(M.x));
	}

	// Compute the largest Manhattan distance in two dimensions.
	// This takes the footprint across adjacent faces into account.
	Float4 duvdxy = dudxy + dvdxy;
	Float4 dusdxy = dudxy + dsdxy;
	Float4 dvsdxy = dvdxy + dsdxy;

	dudxy = Max(Max(duvdxy, dusdxy), dvsdxy);

	lod = Max(Float(dudxy.y), Float(dudxy.z));  // TODO: Max(dudxy.y, dudxy.z);

	// Scale by texture dimension.
	lod *= *Pointer<Float>(texture + OFFSET(Texture, width));

	lod = log2(lod);
}
1463 
// Computes the level of detail for 3D sampling: log2 of the largest
// texel-space derivative length across the three axes.
void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, const Float4 &dsx, const Float4 &dsy)
{
	Float4 dudxy, dvdxy, dsdxy;

	if(function != Grad)  // Implicit
	{
		// Derivatives from differences between the quad's lanes (lane 0 is
		// the reference pixel).
		dudxy = uuuu - uuuu.xxxx;
		dvdxy = vvvv - vvvv.xxxx;
		dsdxy = wwww - wwww.xxxx;
	}
	else
	{
		dudxy = Float4(dsx.xx, dsy.xx);
		dvdxy = Float4(dsx.yy, dsy.yy);
		dsdxy = Float4(dsx.zz, dsy.zz);
	}

	// Scale by texture dimensions.
	dudxy *= *Pointer<Float4>(texture + OFFSET(Texture, width));
	dvdxy *= *Pointer<Float4>(texture + OFFSET(Texture, height));
	dsdxy *= *Pointer<Float4>(texture + OFFSET(Texture, depth));

	// Accumulate squared lengths of the derivative vectors.
	dudxy *= dudxy;
	dvdxy *= dvdxy;
	dsdxy *= dsdxy;

	dudxy += dvdxy;
	dudxy += dsdxy;

	lod = Max(Float(dudxy.y), Float(dudxy.z));  // TODO: Max(dudxy.y, dudxy.z);

	lod = log2sqrt(lod);  // log2(sqrt(lod))
}
1497 
// Selects the cube map face for each lane from the direction vector (x, y, z)
// and computes the face coordinates U, V in [0, 1]. Returns the per-lane face
// index; on return M holds 0.5 / majorAxisMagnitude, the factor used to
// project onto the face (also consumed by computeLodCube()).
Int4 SamplerCore::cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M)
{
	// TODO: Comply with Vulkan recommendation:
	// Vulkan 1.1: "The rules should have as the first rule that rz wins over ry and rx, and the second rule that ry wins over rx."

	Int4 xn = CmpLT(x, 0.0f);  // x < 0
	Int4 yn = CmpLT(y, 0.0f);  // y < 0
	Int4 zn = CmpLT(z, 0.0f);  // z < 0

	Float4 absX = Abs(x);
	Float4 absY = Abs(y);
	Float4 absZ = Abs(z);

	Int4 xy = CmpNLE(absX, absY);  // abs(x) > abs(y)
	Int4 yz = CmpNLE(absY, absZ);  // abs(y) > abs(z)
	Int4 zx = CmpNLE(absZ, absX);  // abs(z) > abs(x)
	Int4 xMajor = xy & ~zx;        // abs(x) > abs(y) && abs(x) > abs(z)
	Int4 yMajor = yz & ~xy;        // abs(y) > abs(z) && abs(y) > abs(x)
	Int4 zMajor = zx & ~yz;        // abs(z) > abs(x) && abs(z) > abs(y)

	// FACE_POSITIVE_X = 000b
	// FACE_NEGATIVE_X = 001b
	// FACE_POSITIVE_Y = 010b
	// FACE_NEGATIVE_Y = 011b
	// FACE_POSITIVE_Z = 100b
	// FACE_NEGATIVE_Z = 101b

	Int yAxis = SignMask(yMajor);
	Int zAxis = SignMask(zMajor);

	// Sign bit of the major axis component, per lane.
	Int4 n = ((xn & xMajor) | (yn & yMajor) | (zn & zMajor)) & Int4(0x80000000);
	Int negative = SignMask(n);

	// Assemble the three bits of each lane's face index through constant
	// lookup tables indexed by the per-lane sign masks.
	Int faces = *Pointer<Int>(constants + OFFSET(Constants, transposeBit0) + negative * 4);
	faces |= *Pointer<Int>(constants + OFFSET(Constants, transposeBit1) + yAxis * 4);
	faces |= *Pointer<Int>(constants + OFFSET(Constants, transposeBit2) + zAxis * 4);

	// Unpack the four packed 4-bit face indices.
	Int4 face;
	face.x = faces & 0x7;
	face.y = (faces >> 4) & 0x7;
	face.z = (faces >> 8) & 0x7;
	face.w = (faces >> 12) & 0x7;

	M = Max(Max(absX, absY), absZ);

	// U = xMajor ? (neg ^ -z) : ((zMajor & neg) ^ x)
	U = As<Float4>((xMajor & (n ^ As<Int4>(-z))) | (~xMajor & ((zMajor & n) ^ As<Int4>(x))));

	// V = !yMajor ? -y : (n ^ z)
	V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));

	// Project onto the face and remap from [-1, 1] to [0, 1].
	M = reciprocal(M) * 0.5f;
	U = U * M + 0.5f;
	V = V * M + 0.5f;

	return face;
}
1555 
applyOffset(Float4 & u,Float4 & v,Float4 & w,Vector4i & offset,Pointer<Byte> mipmap)1556 void SamplerCore::applyOffset(Float4 &u, Float4 &v, Float4 &w, Vector4i &offset, Pointer<Byte> mipmap)
1557 {
1558 	if(function.offset)
1559 	{
1560 		if(function == Fetch)
1561 		{
1562 			// Unnormalized coordinates
1563 			u = As<Float4>(As<Int4>(u) + offset.x);
1564 			if(state.is2D() || state.is3D() || state.isCube())
1565 			{
1566 				v = As<Float4>(As<Int4>(v) + offset.y);
1567 				if(state.is3D())
1568 				{
1569 					w = As<Float4>(As<Int4>(w) + offset.z);
1570 				}
1571 			}
1572 		}
1573 		else
1574 		{
1575 			// Normalized coordinates
1576 			UInt4 width = *Pointer<UInt4>(mipmap + OFFSET(Mipmap, width));
1577 			u += Float4(offset.x) / Float4(width);
1578 			if(state.is2D() || state.is3D() || state.isCube())
1579 			{
1580 				UInt4 height = *Pointer<UInt4>(mipmap + OFFSET(Mipmap, height));
1581 				v += Float4(offset.y) / Float4(height);
1582 				if(state.is3D())
1583 				{
1584 					UInt4 depth = *Pointer<UInt4>(mipmap + OFFSET(Mipmap, depth));
1585 					w += Float4(offset.z) / Float4(depth);
1586 				}
1587 			}
1588 		}
1589 	}
1590 }
1591 
// Computes the four per-lane linear texel indices from 16-bit fixed-point
// coordinates. `layerIndex` selects the array layer and `sample` the
// multisample plane, when applicable.
void SamplerCore::computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, const Short4 &layerIndex, const Int4 &sample, const Pointer<Byte> &mipmap)
{
	// MulHigh with the level dimension converts the 0.16 fixed-point
	// coordinate into an integer texel index.
	uuuu = MulHigh(As<UShort4>(uuuu), UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width))));

	UInt4 indices = Int4(uuuu);

	if(state.is2D() || state.is3D() || state.isCube())
	{
		vvvv = MulHigh(As<UShort4>(vvvv), UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height))));

		// Interleave (u, v) pairs and compute v * pitch + u with a single
		// MulAdd against onePitchP (presumably (1, pitchP) pairs — per its name).
		Short4 uv0uv1 = As<Short4>(UnpackLow(uuuu, vvvv));
		Short4 uv2uv3 = As<Short4>(UnpackHigh(uuuu, vvvv));
		Int2 i01 = MulAdd(uv0uv1, *Pointer<Short4>(mipmap + OFFSET(Mipmap, onePitchP)));
		Int2 i23 = MulAdd(uv2uv3, *Pointer<Short4>(mipmap + OFFSET(Mipmap, onePitchP)));

		indices = UInt4(As<UInt2>(i01), As<UInt2>(i23));
	}

	if(state.is3D())
	{
		// NOTE(review): depth is loaded via Pointer<Int4> while width/height
		// use UInt4; the reinterpretation is harmless but inconsistent.
		wwww = MulHigh(As<UShort4>(wwww), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, depth))));

		indices += As<UInt4>(Int4(As<UShort4>(wwww))) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));
	}

	if(state.isArrayed())
	{
		Int4 layer = Int4(As<UShort4>(layerIndex));

		// Cube arrays have 6 faces per array layer.
		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			layer *= Int4(6);
		}

		UInt4 layerOffset = As<UInt4>(layer) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));

		indices += layerOffset;
	}

	if(function.sample)
	{
		// Clamp the sample index to the valid range and step to its plane.
		UInt4 sampleOffset = Min(As<UInt4>(sample), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
		                     *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
		indices += sampleOffset;
	}

	index[0] = Extract(indices, 0);
	index[1] = Extract(indices, 1);
	index[2] = Extract(indices, 2);
	index[3] = Extract(indices, 3);
}
1643 
computeIndices(UInt index[4],Int4 uuuu,Int4 vvvv,Int4 wwww,const Int4 & sample,Int4 valid,const Pointer<Byte> & mipmap)1644 void SamplerCore::computeIndices(UInt index[4], Int4 uuuu, Int4 vvvv, Int4 wwww, const Int4 &sample, Int4 valid, const Pointer<Byte> &mipmap)
1645 {
1646 	UInt4 indices = uuuu;
1647 
1648 	if(state.is2D() || state.is3D() || state.isCube())
1649 	{
1650 		indices += As<UInt4>(vvvv);
1651 	}
1652 
1653 	if(state.is3D() || state.isCube() || state.isArrayed())
1654 	{
1655 		indices += As<UInt4>(wwww);
1656 	}
1657 
1658 	if(function.sample)
1659 	{
1660 		indices += Min(As<UInt4>(sample), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
1661 		           *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
1662 	}
1663 
1664 	if(borderModeActive())
1665 	{
1666 		// Texels out of range are still sampled before being replaced
1667 		// with the border color, so sample them at linear index 0.
1668 		indices &= As<UInt4>(valid);
1669 	}
1670 
1671 	for(int i = 0; i < 4; i++)
1672 	{
1673 		index[i] = Extract(As<Int4>(indices), i);
1674 	}
1675 }
1676 
// Fetches four texels at the precomputed linear element offsets index[0..3]
// from 'buffer' and returns them transposed into SoA form: c.x holds the four
// texels' first component, c.y the second, and so on, as 16-bit lanes.
// For normalized formats the components are left-justified in the 16-bit lane
// (e.g. 0xFF00 represents 1.0 for 8-bit unorm), while unnormalized integer
// formats keep the value in the low bits. sRGB components are converted to
// linear space at the end.
Vector4s SamplerCore::sampleTexel(UInt index[4], Pointer<Byte> buffer)
{
	Vector4s c;

	if(has16bitPackedTextureFormat())
	{
		// One packed 16-bit value per texel; gather all four into c.x, then
		// mask/shift each field up to the top of its 16-bit lane.
		c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
		c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
		c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
		c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);

		switch(state.textureFormat)
		{
		case VK_FORMAT_R5G6B5_UNORM_PACK16:
			c.z = (c.x & Short4(0x001Fu)) << 11;
			c.y = (c.x & Short4(0x07E0u)) << 5;
			c.x = (c.x & Short4(0xF800u));
			break;
		case VK_FORMAT_B5G6R5_UNORM_PACK16:
			c.z = (c.x & Short4(0xF800u));
			c.y = (c.x & Short4(0x07E0u)) << 5;
			c.x = (c.x & Short4(0x001Fu)) << 11;
			break;
		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
			// Note: c.x must be overwritten last since all fields derive from it.
			c.w = (c.x << 12) & Short4(0xF000u);
			c.z = (c.x << 8) & Short4(0xF000u);
			c.y = (c.x << 4) & Short4(0xF000u);
			c.x = (c.x) & Short4(0xF000u);
			break;
		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
			c.w = (c.x << 12) & Short4(0xF000u);
			c.z = (c.x) & Short4(0xF000u);
			c.y = (c.x << 4) & Short4(0xF000u);
			c.x = (c.x << 8) & Short4(0xF000u);
			break;
		case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
			c.w = (c.x) & Short4(0xF000u);
			c.z = (c.x << 12) & Short4(0xF000u);
			c.y = (c.x << 8) & Short4(0xF000u);
			c.x = (c.x << 4) & Short4(0xF000u);
			break;
		case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
			c.w = (c.x) & Short4(0xF000u);
			c.z = (c.x << 4) & Short4(0xF000u);
			c.y = (c.x << 8) & Short4(0xF000u);
			c.x = (c.x << 12) & Short4(0xF000u);
			break;
		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
			c.w = (c.x << 15) & Short4(0x8000u);
			c.z = (c.x << 10) & Short4(0xF800u);
			c.y = (c.x << 5) & Short4(0xF800u);
			c.x = (c.x) & Short4(0xF800u);
			break;
		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
			c.w = (c.x << 15) & Short4(0x8000u);
			c.z = (c.x) & Short4(0xF800u);
			c.y = (c.x << 5) & Short4(0xF800u);
			c.x = (c.x << 10) & Short4(0xF800u);
			break;
		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
			c.w = (c.x) & Short4(0x8000u);
			c.z = (c.x << 11) & Short4(0xF800u);
			c.y = (c.x << 6) & Short4(0xF800u);
			c.x = (c.x << 1) & Short4(0xF800u);
			break;
		default:
			ASSERT(false);
		}
	}
	else if(has8bitTextureComponents())
	{
		switch(textureComponentCount())
		{
		case 4:
			{
				// Load four 32-bit texels, then transpose 4x4 bytes into
				// per-component Short4 lanes via unpack operations.
				Byte4 c0 = Pointer<Byte4>(buffer)[index[0]];
				Byte4 c1 = Pointer<Byte4>(buffer)[index[1]];
				Byte4 c2 = Pointer<Byte4>(buffer)[index[2]];
				Byte4 c3 = Pointer<Byte4>(buffer)[index[3]];
				c.x = Unpack(c0, c1);
				c.y = Unpack(c2, c3);

				switch(state.textureFormat)
				{
				case VK_FORMAT_B8G8R8A8_UNORM:
				case VK_FORMAT_B8G8R8A8_SRGB:
					// BGRA swizzle: unpack order differs from the RGBA case below.
					// Interleaving with zero in the low byte left-justifies the
					// 8-bit component in the 16-bit lane (0xFF00 == 1.0).
					c.z = As<Short4>(UnpackLow(c.x, c.y));
					c.x = As<Short4>(UnpackHigh(c.x, c.y));
					c.y = c.z;
					c.w = c.x;
					c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
					c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
					c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
					c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
					break;
				case VK_FORMAT_R8G8B8A8_UNORM:
				case VK_FORMAT_R8G8B8A8_SNORM:
				case VK_FORMAT_R8G8B8A8_SINT:
				case VK_FORMAT_R8G8B8A8_SRGB:
				case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
				case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
				case VK_FORMAT_A8B8G8R8_SINT_PACK32:
				case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
					c.z = As<Short4>(UnpackHigh(c.x, c.y));
					c.x = As<Short4>(UnpackLow(c.x, c.y));
					c.y = c.x;
					c.w = c.z;
					c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
					c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
					c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
					c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
					// Propagate sign bit
					if(state.textureFormat == VK_FORMAT_R8G8B8A8_SINT ||
					   state.textureFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32)
					{
						// Arithmetic shift right-justifies and sign-extends SINT values.
						c.x >>= 8;
						c.y >>= 8;
						c.z >>= 8;
						c.w >>= 8;
					}
					break;
				case VK_FORMAT_R8G8B8A8_UINT:
				case VK_FORMAT_A8B8G8R8_UINT_PACK32:
					// UINT: zero in the high byte keeps values right-justified.
					c.z = As<Short4>(UnpackHigh(c.x, c.y));
					c.x = As<Short4>(UnpackLow(c.x, c.y));
					c.y = c.x;
					c.w = c.z;
					c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
					c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
					c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
					c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(Short4(0)));
					break;
				default:
					ASSERT(false);
				}
			}
			break;
		case 2:
			// Two 8-bit components per texel, loaded as one 16-bit value each.
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);

			switch(state.textureFormat)
			{
			case VK_FORMAT_R8G8_UNORM:
			case VK_FORMAT_R8G8_SNORM:
			case VK_FORMAT_R8G8_SRGB:
				// Left-justify both components in their 16-bit lanes.
				c.y = (c.x & Short4(0xFF00u));
				c.x = (c.x << 8);
				break;
			case VK_FORMAT_R8G8_SINT:
				c.y = c.x >> 8;
				c.x = (c.x << 8) >> 8;  // Propagate sign bit
				break;
			case VK_FORMAT_R8G8_UINT:
				c.y = As<Short4>(As<UShort4>(c.x) >> 8);
				c.x &= Short4(0x00FFu);
				break;
			default:
				ASSERT(false);
			}
			break;
		case 1:
			{
				// Pack the four single-byte texels into one 32-bit value,
				// then unpack into 16-bit lanes.
				Int c0 = Int(*Pointer<Byte>(buffer + index[0]));
				Int c1 = Int(*Pointer<Byte>(buffer + index[1]));
				Int c2 = Int(*Pointer<Byte>(buffer + index[2]));
				Int c3 = Int(*Pointer<Byte>(buffer + index[3]));
				c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);

				switch(state.textureFormat)
				{
				case VK_FORMAT_R8_SINT:
				case VK_FORMAT_R8_UINT:
				case VK_FORMAT_S8_UINT:
					{
						// Unnormalized: keep values right-justified (low byte).
						Int zero(0);
						c.x = Unpack(As<Byte4>(c0), As<Byte4>(zero));
						// Propagate sign bit
						if(state.textureFormat == VK_FORMAT_R8_SINT)
						{
							c.x = (c.x << 8) >> 8;
						}
					}
					break;
				case VK_FORMAT_R8_SNORM:
				case VK_FORMAT_R8_UNORM:
				case VK_FORMAT_R8_SRGB:
					// TODO: avoid populating the low bits at all.
					c.x = Unpack(As<Byte4>(c0));
					c.x &= Short4(0xFF00u);
					break;
				default:
					c.x = Unpack(As<Byte4>(c0));
					break;
				}
			}
			break;
		default:
			ASSERT(false);
		}
	}
	else if(has16bitTextureComponents())
	{
		switch(textureComponentCount())
		{
		case 4:
			// Each texel is a full Short4; transpose to SoA.
			c.x = Pointer<Short4>(buffer)[index[0]];
			c.y = Pointer<Short4>(buffer)[index[1]];
			c.z = Pointer<Short4>(buffer)[index[2]];
			c.w = Pointer<Short4>(buffer)[index[3]];
			transpose4x4(c.x, c.y, c.z, c.w);
			break;
		case 2:
			// 4-byte texels; interleave pairs, then 2x2-transpose with Int2 unpacks.
			c.x = *Pointer<Short4>(buffer + 4 * index[0]);
			c.x = As<Short4>(UnpackLow(c.x, *Pointer<Short4>(buffer + 4 * index[1])));
			c.z = *Pointer<Short4>(buffer + 4 * index[2]);
			c.z = As<Short4>(UnpackLow(c.z, *Pointer<Short4>(buffer + 4 * index[3])));
			c.y = c.x;
			c.x = UnpackLow(As<Int2>(c.x), As<Int2>(c.z));
			c.y = UnpackHigh(As<Int2>(c.y), As<Int2>(c.z));
			break;
		case 1:
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
			break;
		default:
			ASSERT(false);
		}
	}
	else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UNORM_PACK32)
	{
		// 10-bit components scaled up to the top of the 16-bit lane (0xFFC0
		// range); 2-bit alpha lands in the top two bits (0xC000).
		Int4 cc;
		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);

		c.x = Short4(cc << 6) & Short4(0xFFC0u);
		c.y = Short4(cc >> 4) & Short4(0xFFC0u);
		c.z = Short4(cc >> 14) & Short4(0xFFC0u);
		c.w = Short4(cc >> 16) & Short4(0xC000u);
	}
	else if(state.textureFormat == VK_FORMAT_A2R10G10B10_UNORM_PACK32)
	{
		// Same as above with R and B fields swapped.
		Int4 cc;
		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);

		c.x = Short4(cc >> 14) & Short4(0xFFC0u);
		c.y = Short4(cc >> 4) & Short4(0xFFC0u);
		c.z = Short4(cc << 6) & Short4(0xFFC0u);
		c.w = Short4(cc >> 16) & Short4(0xC000u);
	}
	else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UINT_PACK32)
	{
		// Unnormalized: extract fields right-justified.
		Int4 cc;
		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);

		c.x = Short4(cc & Int4(0x3FF));
		c.y = Short4((cc >> 10) & Int4(0x3FF));
		c.z = Short4((cc >> 20) & Int4(0x3FF));
		c.w = Short4((cc >> 30) & Int4(0x3));
	}
	else if(state.textureFormat == VK_FORMAT_A2R10G10B10_UINT_PACK32)
	{
		Int4 cc;
		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);

		c.z = Short4((cc & Int4(0x3FF)));
		c.y = Short4(((cc >> 10) & Int4(0x3FF)));
		c.x = Short4(((cc >> 20) & Int4(0x3FF)));
		c.w = Short4(((cc >> 30) & Int4(0x3)));
	}
	else
		ASSERT(false);

	if(state.textureFormat.isSRGBformat())
	{
		// Only color (non-alpha) components undergo sRGB-to-linear conversion.
		for(int i = 0; i < textureComponentCount(); i++)
		{
			if(isRGBComponent(i))
			{
				// The current table-based sRGB conversion requires 0xFF00 to represent 1.0.
				ASSERT(state.textureFormat.has8bitTextureComponents());

				sRGBtoLinearFF00(c[i]);
			}
		}
	}

	return c;
}
1981 
sampleLumaTexel(Vector4f & output,Short4 & uuuu,Short4 & vvvv,Short4 & wwww,const Short4 & layerIndex,const Int4 & sample,Pointer<Byte> & lumaMipmap,Pointer<Byte> lumaBuffer)1982 void SamplerCore::sampleLumaTexel(Vector4f &output, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, const Short4 &layerIndex, const Int4 &sample, Pointer<Byte> &lumaMipmap, Pointer<Byte> lumaBuffer)
1983 {
1984 	ASSERT(isYcbcrFormat());
1985 
1986 	UInt index[4];
1987 	computeIndices(index, uuuu, vvvv, wwww, layerIndex, sample, lumaMipmap);
1988 
1989 	// Luminance (either 8-bit or 10-bit in bottom bits).
1990 	UShort4 Y;
1991 
1992 	switch(state.textureFormat)
1993 	{
1994 	case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
1995 	case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
1996 		{
1997 			Y = Insert(Y, UShort(lumaBuffer[index[0]]), 0);
1998 			Y = Insert(Y, UShort(lumaBuffer[index[1]]), 1);
1999 			Y = Insert(Y, UShort(lumaBuffer[index[2]]), 2);
2000 			Y = Insert(Y, UShort(lumaBuffer[index[3]]), 3);
2001 		}
2002 		break;
2003 	case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
2004 		{
2005 			Y = Insert(Y, Pointer<UShort>(lumaBuffer)[index[0]], 0);
2006 			Y = Insert(Y, Pointer<UShort>(lumaBuffer)[index[1]], 1);
2007 			Y = Insert(Y, Pointer<UShort>(lumaBuffer)[index[2]], 2);
2008 			Y = Insert(Y, Pointer<UShort>(lumaBuffer)[index[3]], 3);
2009 			// Top 10 bits of each 16 bits:
2010 			Y = (Y & UShort4(0xFFC0u)) >> 6;
2011 		}
2012 		break;
2013 	default:
2014 		UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
2015 		break;
2016 	}
2017 
2018 	output.y = Float4(Y);
2019 }
2020 
sampleChromaTexel(Vector4f & output,Short4 & uuuu,Short4 & vvvv,Short4 & wwww,const Short4 & layerIndex,const Int4 & sample,Pointer<Byte> & mipmapU,Pointer<Byte> bufferU,Pointer<Byte> & mipmapV,Pointer<Byte> bufferV)2021 void SamplerCore::sampleChromaTexel(Vector4f &output, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, const Short4 &layerIndex, const Int4 &sample, Pointer<Byte> &mipmapU, Pointer<Byte> bufferU, Pointer<Byte> &mipmapV, Pointer<Byte> bufferV)
2022 {
2023 	ASSERT(isYcbcrFormat());
2024 
2025 	UInt index[4];
2026 
2027 	// Chroma (either 8-bit or 10-bit in bottom bits).
2028 	UShort4 U, V;
2029 	computeIndices(index, uuuu, vvvv, wwww, layerIndex, sample, mipmapU);
2030 
2031 	switch(state.textureFormat)
2032 	{
2033 	case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
2034 		{
2035 			U = Insert(U, UShort(bufferU[index[0]]), 0);
2036 			U = Insert(U, UShort(bufferU[index[1]]), 1);
2037 			U = Insert(U, UShort(bufferU[index[2]]), 2);
2038 			U = Insert(U, UShort(bufferU[index[3]]), 3);
2039 
2040 			V = Insert(V, UShort(bufferV[index[0]]), 0);
2041 			V = Insert(V, UShort(bufferV[index[1]]), 1);
2042 			V = Insert(V, UShort(bufferV[index[2]]), 2);
2043 			V = Insert(V, UShort(bufferV[index[3]]), 3);
2044 		}
2045 		break;
2046 	case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
2047 		{
2048 			UShort4 UV;
2049 			UV = Insert(UV, Pointer<UShort>(bufferU)[index[0]], 0);
2050 			UV = Insert(UV, Pointer<UShort>(bufferU)[index[1]], 1);
2051 			UV = Insert(UV, Pointer<UShort>(bufferU)[index[2]], 2);
2052 			UV = Insert(UV, Pointer<UShort>(bufferU)[index[3]], 3);
2053 
2054 			U = (UV & UShort4(0x00FFu));
2055 			V = (UV & UShort4(0xFF00u)) >> 8;
2056 		}
2057 		break;
2058 	case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
2059 		{
2060 			UInt4 UV;
2061 			UV = Insert(UV, Pointer<UInt>(bufferU)[index[0]], 0);
2062 			UV = Insert(UV, Pointer<UInt>(bufferU)[index[1]], 1);
2063 			UV = Insert(UV, Pointer<UInt>(bufferU)[index[2]], 2);
2064 			UV = Insert(UV, Pointer<UInt>(bufferU)[index[3]], 3);
2065 			// Top 10 bits of first 16-bits:
2066 			U = UShort4((UV & UInt4(0x0000FFC0u)) >> 6);
2067 			// Top 10 bits of second 16-bits:
2068 			V = UShort4((UV & UInt4(0xFFC00000u)) >> 22);
2069 		}
2070 		break;
2071 	default:
2072 		UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
2073 		break;
2074 	}
2075 
2076 	output.x = Float4(V);
2077 	output.z = Float4(U);
2078 }
2079 
sampleTexel(Short4 & uuuu,Short4 & vvvv,Short4 & wwww,const Short4 & layerIndex,const Int4 & sample,Pointer<Byte> & mipmap,Pointer<Byte> buffer)2080 Vector4s SamplerCore::sampleTexel(Short4 &uuuu, Short4 &vvvv, Short4 &wwww, const Short4 &layerIndex, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer)
2081 {
2082 	ASSERT(!isYcbcrFormat());
2083 
2084 	UInt index[4];
2085 	computeIndices(index, uuuu, vvvv, wwww, layerIndex, sample, mipmap);
2086 
2087 	return sampleTexel(index, buffer);
2088 }
2089 
// Fetches four texels addressed by signed integer coordinates and returns them
// as floats. Handles float/32-bit-integer formats directly; other formats go
// through the 16-bit Vector4s path and are converted. Also applies border
// texel replacement and depth-compare (shadow sampling) when enabled.
Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, const Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer)
{
	Int4 valid;

	if(borderModeActive())
	{
		// Valid texels have positive coordinates.
		// Out-of-range coordinates were previously encoded as -1.
		Int4 negative = uuuu;
		if(state.is2D() || state.is3D() || state.isCube()) negative |= vvvv;
		if(state.is3D() || state.isCube() || state.isArrayed()) negative |= wwww;
		valid = CmpNLT(negative, Int4(0));
	}

	UInt index[4];
	computeIndices(index, uuuu, vvvv, wwww, sample, valid, mipmap);

	Vector4f c;

	if(hasFloatTexture() || has32bitIntegerTextureComponents())
	{
		UInt4 t0, t1, t2, t3;

		switch(state.textureFormat)
		{
		case VK_FORMAT_R16_SFLOAT:
			// Half-float loads widen to 32-bit lanes before conversion.
			t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 2));
			t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 2));
			t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 2));
			t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 2));

			c.x.x = Extract(As<Float4>(halfToFloatBits(t0)), 0);
			c.x.y = Extract(As<Float4>(halfToFloatBits(t1)), 0);
			c.x.z = Extract(As<Float4>(halfToFloatBits(t2)), 0);
			c.x.w = Extract(As<Float4>(halfToFloatBits(t3)), 0);
			break;
		case VK_FORMAT_R16G16_SFLOAT:
			t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 4));
			t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 4));
			t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 4));
			t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 4));

			// TODO: shuffles
			c.x = As<Float4>(halfToFloatBits(t0));
			c.y = As<Float4>(halfToFloatBits(t1));
			c.z = As<Float4>(halfToFloatBits(t2));
			c.w = As<Float4>(halfToFloatBits(t3));
			transpose4x4(c.x, c.y, c.z, c.w);
			break;
		case VK_FORMAT_R16G16B16A16_SFLOAT:
			t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 8));
			t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 8));
			t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 8));
			t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 8));

			c.x = As<Float4>(halfToFloatBits(t0));
			c.y = As<Float4>(halfToFloatBits(t1));
			c.z = As<Float4>(halfToFloatBits(t2));
			c.w = As<Float4>(halfToFloatBits(t3));
			transpose4x4(c.x, c.y, c.z, c.w);
			break;
		case VK_FORMAT_R32_SFLOAT:
		case VK_FORMAT_R32_SINT:
		case VK_FORMAT_R32_UINT:
		case VK_FORMAT_D32_SFLOAT:
			// Integer formats are loaded as raw float bits here and
			// reinterpreted by the caller; no conversion takes place.
			// TODO: Optimal shuffling?
			c.x.x = *Pointer<Float>(buffer + index[0] * 4);
			c.x.y = *Pointer<Float>(buffer + index[1] * 4);
			c.x.z = *Pointer<Float>(buffer + index[2] * 4);
			c.x.w = *Pointer<Float>(buffer + index[3] * 4);
			break;
		case VK_FORMAT_R32G32_SFLOAT:
		case VK_FORMAT_R32G32_SINT:
		case VK_FORMAT_R32G32_UINT:
			// TODO: Optimal shuffling?
			c.x.xy = *Pointer<Float4>(buffer + index[0] * 8);
			c.x.zw = *Pointer<Float4>(buffer + index[1] * 8 - 8);
			c.z.xy = *Pointer<Float4>(buffer + index[2] * 8);
			c.z.zw = *Pointer<Float4>(buffer + index[3] * 8 - 8);
			c.y = c.x;
			c.x = Float4(c.x.xz, c.z.xz);
			c.y = Float4(c.y.yw, c.z.yw);
			break;
		case VK_FORMAT_R32G32B32A32_SFLOAT:
		case VK_FORMAT_R32G32B32A32_SINT:
		case VK_FORMAT_R32G32B32A32_UINT:
			c.x = *Pointer<Float4>(buffer + index[0] * 16, 16);
			c.y = *Pointer<Float4>(buffer + index[1] * 16, 16);
			c.z = *Pointer<Float4>(buffer + index[2] * 16, 16);
			c.w = *Pointer<Float4>(buffer + index[3] * 16, 16);
			transpose4x4(c.x, c.y, c.z, c.w);
			break;
		case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
			{
				// Shared-exponent format: decode the 5-bit exponent into a
				// scale factor (c.w temporarily), then scale each 9-bit mantissa.
				Float4 t;  // TODO: add Insert(UInt4, RValue<UInt>)
				t.x = *Pointer<Float>(buffer + index[0] * 4);
				t.y = *Pointer<Float>(buffer + index[1] * 4);
				t.z = *Pointer<Float>(buffer + index[2] * 4);
				t.w = *Pointer<Float>(buffer + index[3] * 4);
				t0 = As<UInt4>(t);
				c.w = Float4(UInt4(1) << ((t0 >> 27) & UInt4(0x1F))) * Float4(1.0f / (1 << 24));
				c.x = Float4(t0 & UInt4(0x1FF)) * c.w;
				c.y = Float4((t0 >> 9) & UInt4(0x1FF)) * c.w;
				c.z = Float4((t0 >> 18) & UInt4(0x1FF)) * c.w;
			}
			break;
		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
			{
				// Packed 11/11/10-bit floats: align each field to the half-float
				// bit layout, then convert.
				Float4 t;  // TODO: add Insert(UInt4, RValue<UInt>)
				t.x = *Pointer<Float>(buffer + index[0] * 4);
				t.y = *Pointer<Float>(buffer + index[1] * 4);
				t.z = *Pointer<Float>(buffer + index[2] * 4);
				t.w = *Pointer<Float>(buffer + index[3] * 4);
				t0 = As<UInt4>(t);
				c.x = As<Float4>(halfToFloatBits((t0 << 4) & UInt4(0x7FF0)));
				c.y = As<Float4>(halfToFloatBits((t0 >> 7) & UInt4(0x7FF0)));
				c.z = As<Float4>(halfToFloatBits((t0 >> 17) & UInt4(0x7FE0)));
			}
			break;
		default:
			UNSUPPORTED("Format %d", VkFormat(state.textureFormat));
		}
	}
	else
	{
		ASSERT(!isYcbcrFormat());

		// Fetch through the 16-bit path and convert each component to float,
		// respecting signedness and whether the format is unnormalized integer.
		Vector4s cs = sampleTexel(index, buffer);

		bool isInteger = state.textureFormat.isUnnormalizedInteger();
		int componentCount = textureComponentCount();
		for(int n = 0; n < componentCount; n++)
		{
			if(hasUnsignedTextureComponent(n))
			{
				if(isInteger)
				{
					// Keep exact integer bits (bitcast, no int-to-float conversion).
					c[n] = As<Float4>(Int4(As<UShort4>(cs[n])));
				}
				else
				{
					c[n] = Float4(As<UShort4>(cs[n]));
				}
			}
			else
			{
				if(isInteger)
				{
					c[n] = As<Float4>(Int4(cs[n]));
				}
				else
				{
					c[n] = Float4(cs[n]);
				}
			}
		}
	}

	if(borderModeActive())
	{
		c = replaceBorderTexel(c, valid);
	}

	if(state.compareEnable)
	{
		Float4 ref = dRef;

		if(!hasFloatTexture())
		{
			// D16_UNORM: clamp reference, normalize texel value
			ref = Min(Max(ref, Float4(0.0f)), Float4(1.0f));
			c.x = c.x * Float4(1.0f / 0xFFFF);
		}

		Int4 boolean;

		switch(state.compareOp)
		{
		case VK_COMPARE_OP_LESS_OR_EQUAL: boolean = CmpLE(ref, c.x); break;
		case VK_COMPARE_OP_GREATER_OR_EQUAL: boolean = CmpNLT(ref, c.x); break;
		case VK_COMPARE_OP_LESS: boolean = CmpLT(ref, c.x); break;
		case VK_COMPARE_OP_GREATER: boolean = CmpNLE(ref, c.x); break;
		case VK_COMPARE_OP_EQUAL: boolean = CmpEQ(ref, c.x); break;
		case VK_COMPARE_OP_NOT_EQUAL: boolean = CmpNEQ(ref, c.x); break;
		case VK_COMPARE_OP_ALWAYS: boolean = Int4(-1); break;
		case VK_COMPARE_OP_NEVER: boolean = Int4(0); break;
		default: ASSERT(false);
		}

		// Comparison result is 1.0 or 0.0 in the red channel; remaining
		// channels are fixed per the depth-compare convention.
		c.x = As<Float4>(boolean & As<Int4>(Float4(1.0f)));
		c.y = Float4(0.0f);
		c.z = Float4(0.0f);
		c.w = Float4(1.0f);
	}

	return c;
}
2286 
// Replaces the components of invalid (out-of-range) texels with the sampler's
// border color. 'valid' is an all-ones/all-zeros per-lane mask; valid lanes
// keep their sampled value, invalid lanes get the border color. Border values
// are prepared as raw 32-bit patterns so they can be blended with bitwise ops.
Vector4f SamplerCore::replaceBorderTexel(const Vector4f &c, Int4 valid)
{
	Vector4i border;

	// For normalized formats the border color must be scaled to the same
	// fixed-point range as the sampled texels.
	const bool scaled = hasNormalizedFormat();
	const sw::float4 scaleComp = scaled ? getComponentScale() : sw::float4(1.0f, 1.0f, 1.0f, 1.0f);

	switch(state.border)
	{
	case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK:
	case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK:
		border.x = Int4(0);
		border.y = Int4(0);
		border.z = Int4(0);
		border.w = Int4(0);
		break;
	case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK:
		border.x = Int4(0);
		border.y = Int4(0);
		border.z = Int4(0);
		border.w = Int4(bit_cast<int>(scaleComp.w));
		break;
	case VK_BORDER_COLOR_INT_OPAQUE_BLACK:
		border.x = Int4(0);
		border.y = Int4(0);
		border.z = Int4(0);
		border.w = Int4(1);
		break;
	case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE:
		border.x = Int4(bit_cast<int>(scaleComp.x));
		border.y = Int4(bit_cast<int>(scaleComp.y));
		border.z = Int4(bit_cast<int>(scaleComp.z));
		border.w = Int4(bit_cast<int>(scaleComp.w));
		break;
	case VK_BORDER_COLOR_INT_OPAQUE_WHITE:
		border.x = Int4(1);
		border.y = Int4(1);
		border.z = Int4(1);
		border.w = Int4(1);
		break;
	case VK_BORDER_COLOR_FLOAT_CUSTOM_EXT:
		// This bit-casts from float to int in C++ code instead of Reactor code
		// because Reactor does not guarantee preserving infinity (b/140302841).
		border.x = Int4(bit_cast<int>(scaleComp.x * state.customBorder.float32[0]));
		border.y = Int4(bit_cast<int>(scaleComp.y * state.customBorder.float32[1]));
		border.z = Int4(bit_cast<int>(scaleComp.z * state.customBorder.float32[2]));
		border.w = Int4(bit_cast<int>(scaleComp.w * state.customBorder.float32[3]));
		break;
	case VK_BORDER_COLOR_INT_CUSTOM_EXT:
		border.x = Int4(state.customBorder.int32[0]);
		border.y = Int4(state.customBorder.int32[1]);
		border.z = Int4(state.customBorder.int32[2]);
		border.w = Int4(state.customBorder.int32[3]);
		break;
	default:
		UNSUPPORTED("sint/uint/sfloat border: %u", state.border);
	}

	// Bitwise select: keep the sampled value where valid, border elsewhere.
	Vector4f out;
	out.x = As<Float4>((valid & As<Int4>(c.x)) | (~valid & border.x));  // TODO: IfThenElse()
	out.y = As<Float4>((valid & As<Int4>(c.y)) | (~valid & border.y));
	out.z = As<Float4>((valid & As<Int4>(c.z)) | (~valid & border.z));
	out.w = As<Float4>((valid & As<Int4>(c.w)) | (~valid & border.w));

	return out;
}
2353 
// Returns a pointer to the Mipmap descriptor to sample from, based on the
// computed level of detail. When mipmapping is disabled the base level is
// always used. 'secondLOD' selects the next level for trilinear blending.
Pointer<Byte> SamplerCore::selectMipmap(const Pointer<Byte> &texture, const Float &lod, bool secondLOD)
{
	Pointer<Byte> baseMip = texture + OFFSET(Texture, mipmap[0]);

	if(state.mipmapFilter == MIPMAP_NONE)
	{
		return baseMip;
	}

	Int level;

	if(state.mipmapFilter == MIPMAP_POINT)
	{
		// TODO: Preferred formula is ceil(lod + 0.5) - 1
		level = RoundInt(lod);
	}
	else  // MIPMAP_LINEAR
	{
		level = Int(lod);
	}

	return baseMip + level * sizeof(Mipmap) + (secondLOD ? sizeof(Mipmap) : 0);
}
2377 
computeFilterOffset(Float & lod)2378 Int4 SamplerCore::computeFilterOffset(Float &lod)
2379 {
2380 	if(state.textureFilter == FILTER_POINT)
2381 	{
2382 		return Int4(0);
2383 	}
2384 	else if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
2385 	{
2386 		return CmpNLE(Float4(lod), Float4(0.0f));
2387 	}
2388 	else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
2389 	{
2390 		return CmpLE(Float4(lod), Float4(0.0f));
2391 	}
2392 
2393 	return Int4(~0);
2394 }
2395 
address(const Float4 & uw,AddressingMode addressingMode)2396 Short4 SamplerCore::address(const Float4 &uw, AddressingMode addressingMode)
2397 {
2398 	if(addressingMode == ADDRESSING_UNUSED)
2399 	{
2400 		return Short4(0);  // TODO(b/134669567): Optimize for 1D filtering
2401 	}
2402 	else if(addressingMode == ADDRESSING_CLAMP || addressingMode == ADDRESSING_BORDER)
2403 	{
2404 		Float4 clamp = Min(Max(uw, Float4(0.0f)), Float4(65535.0f / 65536.0f));
2405 
2406 		return Short4(Int4(clamp * Float4(1 << 16)));
2407 	}
2408 	else if(addressingMode == ADDRESSING_MIRROR)
2409 	{
2410 		Int4 convert = Int4(uw * Float4(1 << 16));
2411 		Int4 mirror = (convert << 15) >> 31;
2412 
2413 		convert ^= mirror;
2414 
2415 		return Short4(convert);
2416 	}
2417 	else if(addressingMode == ADDRESSING_MIRRORONCE)
2418 	{
2419 		// Absolute value
2420 		Int4 convert = Int4(Abs(uw * Float4(1 << 16)));
2421 
2422 		// Clamp
2423 		convert -= Int4(0x00008000, 0x00008000, 0x00008000, 0x00008000);
2424 		convert = As<Int4>(PackSigned(convert, convert));
2425 
2426 		return As<Short4>(Int2(convert)) + Short4(0x8000u);
2427 	}
2428 	else  // Wrap
2429 	{
2430 		return Short4(Int4(uw * Float4(1 << 16)));
2431 	}
2432 }
2433 
computeLayerIndex16(const Float4 & a,Pointer<Byte> & mipmap)2434 Short4 SamplerCore::computeLayerIndex16(const Float4 &a, Pointer<Byte> &mipmap)
2435 {
2436 	if(!state.isArrayed())
2437 	{
2438 		return {};
2439 	}
2440 
2441 	Int4 layers = *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth));
2442 
2443 	return Short4(Min(Max(RoundInt(a), Int4(0)), layers - Int4(1)));
2444 }
2445 
2446 // TODO: Eliminate when the gather + mirror addressing case is handled by mirroring the footprint.
mirror(Int4 n)2447 static Int4 mirror(Int4 n)
2448 {
2449 	auto positive = CmpNLT(n, Int4(0));
2450 	return (positive & n) | (~positive & (-(Int4(1) + n)));
2451 }
2452 
mod(Int4 n,Int4 d)2453 static Int4 mod(Int4 n, Int4 d)
2454 {
2455 	auto x = n % d;
2456 	auto positive = CmpNLT(x, Int4(0));
2457 	return (positive & x) | (~positive & (x + d));
2458 }
2459 
address(const Float4 & uvw,Int4 & xyz0,Int4 & xyz1,Float4 & f,Pointer<Byte> & mipmap,Int4 & filter,int whd,AddressingMode addressingMode)2460 void SamplerCore::address(const Float4 &uvw, Int4 &xyz0, Int4 &xyz1, Float4 &f, Pointer<Byte> &mipmap, Int4 &filter, int whd, AddressingMode addressingMode)
2461 {
2462 	if(addressingMode == ADDRESSING_UNUSED)
2463 	{
2464 		f = Float4(0.0f);  // TODO(b/134669567): Optimize for 1D filtering
2465 		return;
2466 	}
2467 
2468 	Int4 dim = As<Int4>(*Pointer<UInt4>(mipmap + whd, 16));
2469 	Int4 maxXYZ = dim - Int4(1);
2470 
2471 	if(function == Fetch)  // Unnormalized coordinates
2472 	{
2473 		Int4 xyz = As<Int4>(uvw);
2474 		xyz0 = Min(Max(xyz, Int4(0)), maxXYZ);
2475 
2476 		// VK_EXT_image_robustness requires checking for out-of-bounds accesses.
2477 		// TODO(b/162327166): Only perform bounds checks when VK_EXT_image_robustness is enabled.
2478 		// If the above clamping altered the result, the access is out-of-bounds.
2479 		// In that case set the coordinate to -1 to perform texel replacement later.
2480 		Int4 outOfBounds = CmpNEQ(xyz, xyz0);
2481 		xyz0 |= outOfBounds;
2482 	}
2483 	else if(addressingMode == ADDRESSING_CUBEFACE)
2484 	{
2485 		xyz0 = As<Int4>(uvw);
2486 	}
2487 	else
2488 	{
2489 		const int oneBits = 0x3F7FFFFF;  // Value just under 1.0f
2490 
2491 		Float4 coord = uvw;
2492 
2493 		if(state.unnormalizedCoordinates)
2494 		{
2495 			switch(addressingMode)
2496 			{
2497 			case ADDRESSING_CLAMP:
2498 				coord = Min(Max(coord, Float4(0.0f)), Float4(dim) * As<Float4>(Int4(oneBits)));
2499 				break;
2500 			case ADDRESSING_BORDER:
2501 				// Don't map to a valid range here.
2502 				break;
2503 			default:
2504 				// "If unnormalizedCoordinates is VK_TRUE, addressModeU and addressModeV must each be
2505 				//  either VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE or VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER"
2506 				UNREACHABLE("addressingMode %d", int(addressingMode));
2507 				break;
2508 			}
2509 		}
2510 		else if(state.textureFilter == FILTER_GATHER && addressingMode == ADDRESSING_MIRROR)
2511 		{
2512 			// Gather requires the 'footprint' of the texels from which a component is taken, to also mirror around.
2513 			// Therefore we can't just compute one texel's location and find the other ones at +1 offsets from it.
2514 			// Here we handle that case separately by doing the mirroring per texel coordinate.
2515 			// TODO: Mirror the footprint by adjusting the sign of the 0.5f and 1 offsets.
2516 
2517 			coord = coord * Float4(dim);
2518 			coord -= Float4(0.5f);
2519 			Float4 floor = Floor(coord);
2520 			xyz0 = Int4(floor);
2521 			xyz1 = xyz0 + Int4(1);
2522 
2523 			xyz0 = (maxXYZ)-mirror(mod(xyz0, Int4(2) * dim) - dim);
2524 			xyz1 = (maxXYZ)-mirror(mod(xyz1, Int4(2) * dim) - dim);
2525 
2526 			return;
2527 		}
2528 		else
2529 		{
2530 			switch(addressingMode)
2531 			{
2532 			case ADDRESSING_CLAMP:
2533 			case ADDRESSING_SEAMLESS:
2534 				// While cube face coordinates are nominally already in the [0.0, 1.0] range
2535 				// due to the projection, and numerical imprecision is tolerated due to the
2536 				// border of pixels for seamless filtering, the projection doesn't cause
2537 				// range normalization for Inf and NaN values. So we always clamp.
2538 				{
2539 					Float4 one = As<Float4>(Int4(oneBits));
2540 					coord = Min(Max(coord, Float4(0.0f)), one);
2541 				}
2542 				break;
2543 			case ADDRESSING_MIRROR:
2544 				{
2545 					Float4 one = As<Float4>(Int4(oneBits));
2546 					coord = coord * Float4(0.5f);
2547 					coord = Float4(2.0f) * Abs(coord - Round(coord));
2548 					coord = Min(coord, one);
2549 				}
2550 				break;
2551 			case ADDRESSING_MIRRORONCE:
2552 				{
2553 					Float4 one = As<Float4>(Int4(oneBits));
2554 					coord = Min(Abs(coord), one);
2555 				}
2556 				break;
2557 			case ADDRESSING_BORDER:
2558 				// Don't map to a valid range here.
2559 				break;
2560 			default:  // Wrap
2561 				coord = Frac(coord);
2562 				break;
2563 			}
2564 
2565 			coord = coord * Float4(dim);
2566 		}
2567 
2568 		if(state.textureFilter == FILTER_POINT)
2569 		{
2570 			if(addressingMode == ADDRESSING_BORDER)
2571 			{
2572 				xyz0 = Int4(Floor(coord));
2573 			}
2574 			else  // Can't have negative coordinates, so floor() is redundant when casting to int.
2575 			{
2576 				xyz0 = Int4(coord);
2577 			}
2578 		}
2579 		else
2580 		{
2581 			if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
2582 			   state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
2583 			{
2584 				coord -= As<Float4>(As<Int4>(Float4(0.5f)) & filter);
2585 			}
2586 			else
2587 			{
2588 				coord -= Float4(0.5f);
2589 			}
2590 
2591 			Float4 floor = Floor(coord);
2592 			xyz0 = Int4(floor);
2593 			f = coord - floor;
2594 		}
2595 
2596 		if(addressingMode == ADDRESSING_SEAMLESS)  // Adjust for border.
2597 		{
2598 			xyz0 += Int4(1);
2599 		}
2600 
2601 		xyz1 = xyz0 - filter;  // Increment
2602 
2603 		if(addressingMode == ADDRESSING_BORDER)
2604 		{
2605 			// Replace the coordinates with -1 if they're out of range.
2606 			Int4 border0 = CmpLT(xyz0, Int4(0)) | CmpNLT(xyz0, dim);
2607 			Int4 border1 = CmpLT(xyz1, Int4(0)) | CmpNLT(xyz1, dim);
2608 			xyz0 |= border0;
2609 			xyz1 |= border1;
2610 		}
2611 		else if(state.textureFilter != FILTER_POINT)
2612 		{
2613 			switch(addressingMode)
2614 			{
2615 			case ADDRESSING_SEAMLESS:
2616 				break;
2617 			case ADDRESSING_MIRROR:
2618 			case ADDRESSING_MIRRORONCE:
2619 			case ADDRESSING_CLAMP:
2620 				xyz0 = Max(xyz0, Int4(0));
2621 				xyz1 = Min(xyz1, maxXYZ);
2622 				break;
2623 			default:  // Wrap
2624 				{
2625 					Int4 under = CmpLT(xyz0, Int4(0));
2626 					xyz0 = (under & maxXYZ) | (~under & xyz0);  // xyz < 0 ? dim - 1 : xyz   // TODO: IfThenElse()
2627 
2628 					Int4 nover = CmpLT(xyz1, dim);
2629 					xyz1 = nover & xyz1;  // xyz >= dim ? 0 : xyz
2630 				}
2631 				break;
2632 			}
2633 		}
2634 	}
2635 }
2636 
computeLayerIndex(const Float4 & a,Pointer<Byte> & mipmap)2637 Int4 SamplerCore::computeLayerIndex(const Float4 &a, Pointer<Byte> &mipmap)
2638 {
2639 	if(!state.isArrayed())
2640 	{
2641 		return {};
2642 	}
2643 
2644 	Int4 layers = *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth), 16);
2645 	Int4 maxLayer = layers - Int4(1);
2646 
2647 	if(function == Fetch)  // Unnormalized coordinates
2648 	{
2649 		Int4 xyz = As<Int4>(a);
2650 		Int4 xyz0 = Min(Max(xyz, Int4(0)), maxLayer);
2651 
2652 		// VK_EXT_image_robustness requires checking for out-of-bounds accesses.
2653 		// TODO(b/162327166): Only perform bounds checks when VK_EXT_image_robustness is enabled.
2654 		// If the above clamping altered the result, the access is out-of-bounds.
2655 		// In that case set the coordinate to -1 to perform texel replacement later.
2656 		Int4 outOfBounds = CmpNEQ(xyz, xyz0);
2657 		xyz0 |= outOfBounds;
2658 
2659 		return xyz0;
2660 	}
2661 	else
2662 	{
2663 		return Min(Max(RoundInt(a), Int4(0)), maxLayer);
2664 	}
2665 }
2666 
sRGBtoLinearFF00(Short4 & c)2667 void SamplerCore::sRGBtoLinearFF00(Short4 &c)
2668 {
2669 	c = As<UShort4>(c) >> 8;
2670 
2671 	Pointer<Byte> LUT = Pointer<Byte>(constants + OFFSET(Constants, sRGBtoLinearFF_FF00));
2672 
2673 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 0))), 0);
2674 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 1))), 1);
2675 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 2))), 2);
2676 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 3))), 3);
2677 }
2678 
hasNormalizedFormat() const2679 bool SamplerCore::hasNormalizedFormat() const
2680 {
2681 	return state.textureFormat.isSignedNormalized() || state.textureFormat.isUnsignedNormalized();
2682 }
2683 
hasFloatTexture() const2684 bool SamplerCore::hasFloatTexture() const
2685 {
2686 	return state.textureFormat.isFloatFormat();
2687 }
2688 
hasUnnormalizedIntegerTexture() const2689 bool SamplerCore::hasUnnormalizedIntegerTexture() const
2690 {
2691 	return state.textureFormat.isUnnormalizedInteger();
2692 }
2693 
hasUnsignedTextureComponent(int component) const2694 bool SamplerCore::hasUnsignedTextureComponent(int component) const
2695 {
2696 	return state.textureFormat.isUnsignedComponent(component);
2697 }
2698 
textureComponentCount() const2699 int SamplerCore::textureComponentCount() const
2700 {
2701 	return state.textureFormat.componentCount();
2702 }
2703 
has16bitPackedTextureFormat() const2704 bool SamplerCore::has16bitPackedTextureFormat() const
2705 {
2706 	return state.textureFormat.has16bitPackedTextureFormat();
2707 }
2708 
has8bitTextureComponents() const2709 bool SamplerCore::has8bitTextureComponents() const
2710 {
2711 	return state.textureFormat.has8bitTextureComponents();
2712 }
2713 
has16bitTextureComponents() const2714 bool SamplerCore::has16bitTextureComponents() const
2715 {
2716 	return state.textureFormat.has16bitTextureComponents();
2717 }
2718 
has32bitIntegerTextureComponents() const2719 bool SamplerCore::has32bitIntegerTextureComponents() const
2720 {
2721 	return state.textureFormat.has32bitIntegerTextureComponents();
2722 }
2723 
isYcbcrFormat() const2724 bool SamplerCore::isYcbcrFormat() const
2725 {
2726 	return state.textureFormat.isYcbcrFormat();
2727 }
2728 
isRGBComponent(int component) const2729 bool SamplerCore::isRGBComponent(int component) const
2730 {
2731 	return state.textureFormat.isRGBComponent(component);
2732 }
2733 
borderModeActive() const2734 bool SamplerCore::borderModeActive() const
2735 {
2736 	return state.addressingModeU == ADDRESSING_BORDER ||
2737 	       state.addressingModeV == ADDRESSING_BORDER ||
2738 	       state.addressingModeW == ADDRESSING_BORDER;
2739 }
2740 
gatherSwizzle() const2741 VkComponentSwizzle SamplerCore::gatherSwizzle() const
2742 {
2743 	switch(state.gatherComponent)
2744 	{
2745 	case 0: return state.swizzle.r;
2746 	case 1: return state.swizzle.g;
2747 	case 2: return state.swizzle.b;
2748 	case 3: return state.swizzle.a;
2749 	default:
2750 		UNREACHABLE("Invalid component");
2751 		return VK_COMPONENT_SWIZZLE_R;
2752 	}
2753 }
2754 
getComponentScale() const2755 sw::float4 SamplerCore::getComponentScale() const
2756 {
2757 	// TODO(b/204709464): Unlike other formats, the fixed-point representation of the formats below are handled with bit extension.
2758 	// This special handling of such formats should be removed later.
2759 	switch(state.textureFormat)
2760 	{
2761 	case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
2762 	case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
2763 	case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
2764 		return sw::float4(0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF);
2765 	default:
2766 		break;
2767 	};
2768 
2769 	const sw::int4 bits = state.textureFormat.bitsPerComponent();
2770 	const sw::int4 shift = sw::int4(16 - bits.x, 16 - bits.y, 16 - bits.z, 16 - bits.w);
2771 	const uint16_t sign = state.textureFormat.isUnsigned() ? 0xFFFF : 0x7FFF;
2772 
2773 	return sw::float4(static_cast<uint16_t>(0xFFFF << shift.x) & sign,
2774 	                  static_cast<uint16_t>(0xFFFF << shift.y) & sign,
2775 	                  static_cast<uint16_t>(0xFFFF << shift.z) & sign,
2776 	                  static_cast<uint16_t>(0xFFFF << shift.w) & sign);
2777 }
2778 
getGatherComponent() const2779 int SamplerCore::getGatherComponent() const
2780 {
2781 	VkComponentSwizzle swizzle = gatherSwizzle();
2782 
2783 	switch(swizzle)
2784 	{
2785 	default: UNSUPPORTED("VkComponentSwizzle %d", (int)swizzle); return 0;
2786 	case VK_COMPONENT_SWIZZLE_R:
2787 	case VK_COMPONENT_SWIZZLE_G:
2788 	case VK_COMPONENT_SWIZZLE_B:
2789 	case VK_COMPONENT_SWIZZLE_A:
2790 		// Normalize all components using the gather component scale.
2791 		return swizzle - VK_COMPONENT_SWIZZLE_R;
2792 	case VK_COMPONENT_SWIZZLE_ZERO:
2793 	case VK_COMPONENT_SWIZZLE_ONE:
2794 		// These cases are handled later.
2795 		return 0;
2796 	}
2797 
2798 	return 0;
2799 }
2800 
2801 }  // namespace sw
2802