// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "SamplerCore.hpp"

#include "Constants.hpp"
#include "PixelRoutine.hpp"
#include "System/Debug.hpp"
#include "Vulkan/VkSampler.hpp"

namespace sw {

SamplerCore::SamplerCore(Pointer<Byte> &constants, const Sampler &state, SamplerFunction function)
    : constants(constants)
    , state(state)
    , function(function)
{
}

SIMD::Float4 SamplerCore::sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], const SIMD::Float &dRef, const Float &lodOrBias, const SIMD::Float &dsx, const SIMD::Float &dsy, SIMD::Int offset[4], const SIMD::Int &sample)
{
	SIMD::Float4 c;

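	// Process the full SIMD width in 128-bit (4-lane) chunks: extract each
	// 4-wide slice of the coordinates, sample it, and insert the result back.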
	for(int i = 0; i < SIMD::Width / 4; i++)
	{
		Float4 uvwa128[4];
		uvwa128[0] = Extract128(uvwa[0], i);
		uvwa128[1] = Extract128(uvwa[1], i);
		uvwa128[2] = Extract128(uvwa[2], i);
		uvwa128[3] = Extract128(uvwa[3], i);

		Vector4i offset128;
		offset128[0] = Extract128(offset[0], i);
		offset128[1] = Extract128(offset[1], i);
		offset128[2] = Extract128(offset[2], i);
		offset128[3] = Extract128(offset[3], i);

		Vector4f c128 = sampleTexture128(texture, uvwa128, Extract128(dRef, i), lodOrBias, Extract128(dsx, i), Extract128(dsy, i), offset128, Extract128(sample, i));
		c.x = Insert128(c.x, c128.x, i);
		c.y = Insert128(c.y, c128.y, i);
		c.z = Insert128(c.z, c128.z, i);
		c.w = Insert128(c.w, c128.w, i);
	}

	return c;
}

Vector4f SamplerCore::sampleTexture128(Pointer<Byte> &texture, Float4 uvwa[4], const Float4 &dRef, const Float &lodOrBias, const Float4 &dsx, const Float4 &dsy, Vector4i &offset, const Int4 &sample)
{
	Vector4f c;

	Float4 u = uvwa[0];
	Float4 v = uvwa[1];
	Float4 w = uvwa[2];
	Float4 a;  // Array layer coordinate
	switch(state.textureType)
	{
	case VK_IMAGE_VIEW_TYPE_1D_ARRAY: a = uvwa[1]; break;
	case VK_IMAGE_VIEW_TYPE_2D_ARRAY: a = uvwa[2]; break;
	case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: a = uvwa[3]; break;
	default: break;
	}

	Float lod;
	Float anisotropy;
	Float4 uDelta;
	Float4 vDelta;
	Float4 M;  // Major axis

	if(state.isCube())
	{
		Int4 face = cubeFace(u, v, uvwa[0], uvwa[1], uvwa[2], M);
		w = As<Float4>(face);
	}

	// Determine if we can skip the LOD computation. This is the case when the mipmap has only one level, except for LOD query,
	// where we have to return the computed value. Anisotropic filtering requires computing the anisotropy factor even for a single mipmap level.
	bool singleMipLevel = (state.minLod == state.maxLod);
	bool requiresLodComputation = (function == Query) || (state.textureFilter == FILTER_ANISOTROPIC);
	bool skipLodComputation = singleMipLevel && !requiresLodComputation;

	if(skipLodComputation)
	{
		lod = state.minLod;
	}
	else if(function == Implicit || function == Bias || function == Grad || function == Query)
	{
		if(state.is1D())
		{
			computeLod1D(texture, lod, u, dsx, dsy);
		}
		else if(state.is2D())
		{
			computeLod2D(texture, lod, anisotropy, uDelta, vDelta, u, v, dsx, dsy);
		}
		else if(state.isCube())
		{
			computeLodCube(texture, lod, uvwa[0], uvwa[1], uvwa[2], dsx, dsy, M);
		}
		else
		{
			computeLod3D(texture, lod, u, v, w, dsx, dsy);
		}

		Float bias = state.mipLodBias;

		if(function == Bias)
		{
			// Add SPIR-V Bias operand to the sampler provided bias and clamp to maxSamplerLodBias limit.
			bias = Min(Max(bias + lodOrBias, -vk::MAX_SAMPLER_LOD_BIAS), vk::MAX_SAMPLER_LOD_BIAS);
		}

		lod += bias;
	}
	else if(function == Lod)
	{
		// Vulkan 1.1: "The absolute value of mipLodBias must be less than or equal to VkPhysicalDeviceLimits::maxSamplerLodBias"
		// Hence no explicit clamping to maxSamplerLodBias is required in this case.
		lod = lodOrBias + state.mipLodBias;
	}
	else if(function == Fetch)
	{
		// TODO: Eliminate int-float-int conversion.
		lod = Float(As<Int>(lodOrBias));
		lod = Max(lod, state.minLod);
		lod = Min(lod, state.maxLod);
	}
	else if(function == Base || function == Gather)
	{
		lod = Float(0);
	}
	else
		UNREACHABLE("Sampler function %d", int(function));

	if(function != Base && function != Fetch && function != Gather)
	{
		if(function == Query)
		{
			c.y = Float4(lod);  // Unclamped LOD.
		}

		if(!skipLodComputation)
		{
			lod = Max(lod, state.minLod);
			lod = Min(lod, state.maxLod);
		}

		if(function == Query)
		{
			if(state.mipmapFilter == MIPMAP_POINT)
			{
				lod = Round(lod);  // TODO: Preferred formula is ceil(lod + 0.5) - 1
			}

			c.x = lod;
			// c.y contains unclamped LOD.

			return c;
		}
	}

	bool force32BitFiltering = state.highPrecisionFiltering && !isYcbcrFormat() && (state.textureFilter != FILTER_POINT);
	bool use32BitFiltering = hasFloatTexture() || hasUnnormalizedIntegerTexture() || force32BitFiltering ||
	                         state.isCube() || state.unnormalizedCoordinates || state.compareEnable ||
	                         borderModeActive() || (function == Gather) || (function == Fetch);
	int numComponents = (function == Gather) ? 4 : textureComponentCount();

	if(use32BitFiltering)
	{
		c = sampleFloatFilter(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta);
	}
	else  // 16-bit filtering.
	{
		Vector4s cs = sampleFilter(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta);

		for(int component = 0; component < numComponents; component++)
		{
			if(hasUnsignedTextureComponent(component))
			{
				c[component] = Float4(As<UShort4>(cs[component]));
			}
			else
			{
				c[component] = Float4(cs[component]);
			}
		}
	}

	if(hasNormalizedFormat() && !state.compareEnable)
	{
		sw::float4 scale = getComponentScale();

		for(int component = 0; component < numComponents; component++)
		{
			int texelComponent = (function == Gather) ? getGatherComponent() : component;
			c[component] *= Float4(1.0f / scale[texelComponent]);
		}
	}

	if(state.textureFormat.isSignedNormalized())
	{
		for(int component = 0; component < numComponents; component++)
		{
			c[component] = Max(c[component], Float4(-1.0f));
		}
	}

	if(state.textureFilter != FILTER_GATHER)
	{
		if((state.swizzle.r != VK_COMPONENT_SWIZZLE_R) ||
		   (state.swizzle.g != VK_COMPONENT_SWIZZLE_G) ||
		   (state.swizzle.b != VK_COMPONENT_SWIZZLE_B) ||
		   (state.swizzle.a != VK_COMPONENT_SWIZZLE_A))
		{
			const Vector4f col = c;
			bool integer = hasUnnormalizedIntegerTexture();
			c.x = applySwizzle(col, state.swizzle.r, integer);
			c.y = applySwizzle(col, state.swizzle.g, integer);
			c.z = applySwizzle(col, state.swizzle.b, integer);
			c.w = applySwizzle(col, state.swizzle.a, integer);
		}
	}
	else  // Gather
	{
		VkComponentSwizzle swizzle = gatherSwizzle();

		// R/G/B/A swizzles affect the component collected from each texel earlier.
		// Handle the ZERO and ONE cases here because we don't need to know the format.

		if(swizzle == VK_COMPONENT_SWIZZLE_ZERO)
		{
			c.x = c.y = c.z = c.w = Float4(0);
		}
		else if(swizzle == VK_COMPONENT_SWIZZLE_ONE)
		{
			bool integer = hasUnnormalizedIntegerTexture();
			c.x = c.y = c.z = c.w = integer ? As<Float4>(Int4(1)) : RValue<Float4>(Float4(1.0f));
		}
	}

	return c;
}

Float4 SamplerCore::applySwizzle(const Vector4f &c, VkComponentSwizzle swizzle, bool integer)
{
	switch(swizzle)
	{
	default: UNSUPPORTED("VkComponentSwizzle %d", (int)swizzle);
	case VK_COMPONENT_SWIZZLE_R: return c.x;
	case VK_COMPONENT_SWIZZLE_G: return c.y;
	case VK_COMPONENT_SWIZZLE_B: return c.z;
	case VK_COMPONENT_SWIZZLE_A: return c.w;
	case VK_COMPONENT_SWIZZLE_ZERO: return Float4(0.0f, 0.0f, 0.0f, 0.0f);
	case VK_COMPONENT_SWIZZLE_ONE:
		if(integer)
		{
			return Float4(As<Float4>(sw::Int4(1, 1, 1, 1)));
		}
		else
		{
			return Float4(1.0f, 1.0f, 1.0f, 1.0f);
		}
		break;
	}
}

Short4 SamplerCore::offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod)
{
	Short4 offset = *Pointer<UShort4>(mipmap + halfOffset);

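	// For mixed min/mag filters, mask off the half-texel offset when the LOD
	// selects the point-filtered direction (lod > 0 means minification).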
	if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
	{
		offset &= Short4(CmpNLE(Float4(lod), Float4(0.0f)));
	}
	else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
	{
		offset &= Short4(CmpLE(Float4(lod), Float4(0.0f)));
	}

	if(wrap)
	{
		switch(count)
		{
		case -1: return uvw - offset;
		case 0: return uvw;
		case +1: return uvw + offset;
		case 2: return uvw + offset + offset;
		}
	}
	else  // Clamp or mirror
	{
		switch(count)
		{
		case -1: return SubSat(As<UShort4>(uvw), As<UShort4>(offset));
		case 0: return uvw;
		case +1: return AddSat(As<UShort4>(uvw), As<UShort4>(offset));
		case 2: return AddSat(AddSat(As<UShort4>(uvw), As<UShort4>(offset)), As<UShort4>(offset));
		}
	}

	return uvw;
}

Vector4s SamplerCore::sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta)
{
	Vector4s c = sampleAniso(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta, false);

	if(function == Fetch)
	{
		return c;
	}

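	// Trilinear blend in 16-bit fixed point: the fractional LOD becomes a 0.16
	// weight (utri), halved to a 15-bit weight (stri) for signed components;
	// the lost bit is restored at the end by doubling the signed sums.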
	if(state.mipmapFilter == MIPMAP_LINEAR)
	{
		Vector4s cc = sampleAniso(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta, true);

		lod *= Float(1 << 16);

		UShort4 utri = UShort4(Float4(lod));  // TODO: Optimize
		Short4 stri = utri >> 1;              // TODO: Optimize

		if(hasUnsignedTextureComponent(0))
			cc.x = MulHigh(As<UShort4>(cc.x), utri);
		else
			cc.x = MulHigh(cc.x, stri);
		if(hasUnsignedTextureComponent(1))
			cc.y = MulHigh(As<UShort4>(cc.y), utri);
		else
			cc.y = MulHigh(cc.y, stri);
		if(hasUnsignedTextureComponent(2))
			cc.z = MulHigh(As<UShort4>(cc.z), utri);
		else
			cc.z = MulHigh(cc.z, stri);
		if(hasUnsignedTextureComponent(3))
			cc.w = MulHigh(As<UShort4>(cc.w), utri);
		else
			cc.w = MulHigh(cc.w, stri);

		utri = ~utri;
		stri = Short4(0x7FFF) - stri;

		if(hasUnsignedTextureComponent(0))
			c.x = MulHigh(As<UShort4>(c.x), utri);
		else
			c.x = MulHigh(c.x, stri);
		if(hasUnsignedTextureComponent(1))
			c.y = MulHigh(As<UShort4>(c.y), utri);
		else
			c.y = MulHigh(c.y, stri);
		if(hasUnsignedTextureComponent(2))
			c.z = MulHigh(As<UShort4>(c.z), utri);
		else
			c.z = MulHigh(c.z, stri);
		if(hasUnsignedTextureComponent(3))
			c.w = MulHigh(As<UShort4>(c.w), utri);
		else
			c.w = MulHigh(c.w, stri);

		c.x += cc.x;
		c.y += cc.y;
		c.z += cc.z;
		c.w += cc.w;

		if(!hasUnsignedTextureComponent(0)) c.x += c.x;
		if(!hasUnsignedTextureComponent(1)) c.y += c.y;
		if(!hasUnsignedTextureComponent(2)) c.z += c.z;
		if(!hasUnsignedTextureComponent(3)) c.w += c.w;
	}

	return c;
}

Vector4s SamplerCore::sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD)
{
	Vector4s c;

	if(state.textureFilter != FILTER_ANISOTROPIC)
	{
		c = sampleQuad(texture, u, v, w, a, offset, sample, lod, secondLOD);
	}
	else
	{
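		// Anisotropic filtering: take N samples along the major axis, stepping
		// by (du, dv), and accumulate them with fixed-point weights from the
		// constants table (cw for unsigned components, sw for signed ones).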
		Int N = RoundInt(anisotropy);

		Vector4s cSum;

		cSum.x = Short4(0);
		cSum.y = Short4(0);
		cSum.z = Short4(0);
		cSum.w = Short4(0);

		Float4 A = *Pointer<Float4>(constants + OFFSET(Constants, uvWeight) + 16 * N);
		Float4 B = *Pointer<Float4>(constants + OFFSET(Constants, uvStart) + 16 * N);
		UShort4 cw = *Pointer<UShort4>(constants + OFFSET(Constants, cWeight) + 8 * N);
		Short4 sw = Short4(cw >> 1);

		Float4 du = uDelta;
		Float4 dv = vDelta;

		Float4 u0 = u + B * du;
		Float4 v0 = v + B * dv;

		du *= A;
		dv *= A;

		Int i = 0;

		Do
		{
			c = sampleQuad(texture, u0, v0, w, a, offset, sample, lod, secondLOD);

			u0 += du;
			v0 += dv;

			if(hasUnsignedTextureComponent(0))
				cSum.x += As<Short4>(MulHigh(As<UShort4>(c.x), cw));
			else
				cSum.x += MulHigh(c.x, sw);
			if(hasUnsignedTextureComponent(1))
				cSum.y += As<Short4>(MulHigh(As<UShort4>(c.y), cw));
			else
				cSum.y += MulHigh(c.y, sw);
			if(hasUnsignedTextureComponent(2))
				cSum.z += As<Short4>(MulHigh(As<UShort4>(c.z), cw));
			else
				cSum.z += MulHigh(c.z, sw);
			if(hasUnsignedTextureComponent(3))
				cSum.w += As<Short4>(MulHigh(As<UShort4>(c.w), cw));
			else
				cSum.w += MulHigh(c.w, sw);

			i++;
		}
		Until(i >= N);

		if(hasUnsignedTextureComponent(0))
			c.x = cSum.x;
		else
			c.x = AddSat(cSum.x, cSum.x);
		if(hasUnsignedTextureComponent(1))
			c.y = cSum.y;
		else
			c.y = AddSat(cSum.y, cSum.y);
		if(hasUnsignedTextureComponent(2))
			c.z = cSum.z;
		else
			c.z = AddSat(cSum.z, cSum.z);
		if(hasUnsignedTextureComponent(3))
			c.w = cSum.w;
		else
			c.w = AddSat(cSum.w, cSum.w);
	}

	return c;
}

Vector4s SamplerCore::sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
	{
		return sampleQuad2D(texture, u, v, w, a, offset, sample, lod, secondLOD);
	}
	else
	{
		return sample3D(texture, u, v, w, offset, sample, lod, secondLOD);
	}
}

void SamplerCore::bilinearInterpolateFloat(Vector4f &output, const Short4 &uuuu0, const Short4 &vvvv0, Vector4f &c00, Vector4f &c01, Vector4f &c10, Vector4f &c11, const Pointer<Byte> &mipmap, bool interpolateComponent0, bool interpolateComponent1, bool interpolateComponent2, bool interpolateComponent3)
{
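	// Used by the YCbCr path: the interpolateComponentN flags let the caller
	// blend only the luma channel or only the chroma channels of the packed
	// yuv vectors, since the planes are sampled at different resolutions.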
	int componentCount = textureComponentCount();

	Float4 unnormalizedUUUU0 = (Float4(uuuu0) / Float4(1 << 16)) * Float4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width)));
	Float4 unnormalizedVVVV0 = (Float4(vvvv0) / Float4(1 << 16)) * Float4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height)));

	Float4 frac0u = Frac(unnormalizedUUUU0);
	Float4 frac0v = Frac(unnormalizedVVVV0);

	if(interpolateComponent0 && componentCount >= 1)
	{
		c00.x = Mix(c00.x, c10.x, frac0u);
		c01.x = Mix(c01.x, c11.x, frac0u);
		output.x = Mix(c00.x, c01.x, frac0v);
	}
	if(interpolateComponent1 && componentCount >= 2)
	{
		c00.y = Mix(c00.y, c10.y, frac0u);
		c01.y = Mix(c01.y, c11.y, frac0u);
		output.y = Mix(c00.y, c01.y, frac0v);
	}
	if(interpolateComponent2 && componentCount >= 3)
	{
		c00.z = Mix(c00.z, c10.z, frac0u);
		c01.z = Mix(c01.z, c11.z, frac0u);
		output.z = Mix(c00.z, c01.z, frac0v);
	}
	if(interpolateComponent3 && componentCount >= 4)
	{
		c00.w = Mix(c00.w, c10.w, frac0u);
		c01.w = Mix(c01.w, c11.w, frac0u);
		output.w = Mix(c00.w, c01.w, frac0v);
	}
}

void SamplerCore::bilinearInterpolate(Vector4s &output, const Short4 &uuuu0, const Short4 &vvvv0, Vector4s &c00, Vector4s &c01, Vector4s &c10, Vector4s &c11, const Pointer<Byte> &mipmap)
{
	int componentCount = textureComponentCount();

	// Fractions
	UShort4 f0u = As<UShort4>(uuuu0) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width)));
	UShort4 f0v = As<UShort4>(vvvv0) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height)));

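	// In 0.16 fixed point, ~f equals 0xFFFF - f, i.e. (1 - f) to within one ULP.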
	UShort4 f1u = ~f0u;
	UShort4 f1v = ~f0v;

	UShort4 f0u0v = MulHigh(f0u, f0v);
	UShort4 f1u0v = MulHigh(f1u, f0v);
	UShort4 f0u1v = MulHigh(f0u, f1v);
	UShort4 f1u1v = MulHigh(f1u, f1v);

	// Signed fractions
	Short4 f1u1vs;
	Short4 f0u1vs;
	Short4 f1u0vs;
	Short4 f0u0vs;

	if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
	{
		f1u1vs = f1u1v >> 1;
		f0u1vs = f0u1v >> 1;
		f1u0vs = f1u0v >> 1;
		f0u0vs = f0u0v >> 1;
	}

	// Bilinear interpolation
	if(componentCount >= 1)
	{
		if(has16bitTextureComponents() && hasUnsignedTextureComponent(0))
		{
			c00.x = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0u) + MulHigh(As<UShort4>(c10.x), f0u);
			c01.x = As<UShort4>(c01.x) - MulHigh(As<UShort4>(c01.x), f0u) + MulHigh(As<UShort4>(c11.x), f0u);
			output.x = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0v) + MulHigh(As<UShort4>(c01.x), f0v);
		}
		else
		{
			if(hasUnsignedTextureComponent(0))
			{
				c00.x = MulHigh(As<UShort4>(c00.x), f1u1v);
				c10.x = MulHigh(As<UShort4>(c10.x), f0u1v);
				c01.x = MulHigh(As<UShort4>(c01.x), f1u0v);
				c11.x = MulHigh(As<UShort4>(c11.x), f0u0v);
			}
			else
			{
				c00.x = MulHigh(c00.x, f1u1vs);
				c10.x = MulHigh(c10.x, f0u1vs);
				c01.x = MulHigh(c01.x, f1u0vs);
				c11.x = MulHigh(c11.x, f0u0vs);
			}

			output.x = (c00.x + c10.x) + (c01.x + c11.x);
			if(!hasUnsignedTextureComponent(0)) output.x = AddSat(output.x, output.x);  // Correct for signed fractions
		}
	}

	if(componentCount >= 2)
	{
		if(has16bitTextureComponents() && hasUnsignedTextureComponent(1))
		{
			c00.y = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0u) + MulHigh(As<UShort4>(c10.y), f0u);
			c01.y = As<UShort4>(c01.y) - MulHigh(As<UShort4>(c01.y), f0u) + MulHigh(As<UShort4>(c11.y), f0u);
			output.y = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0v) + MulHigh(As<UShort4>(c01.y), f0v);
		}
		else
		{
			if(hasUnsignedTextureComponent(1))
			{
				c00.y = MulHigh(As<UShort4>(c00.y), f1u1v);
				c10.y = MulHigh(As<UShort4>(c10.y), f0u1v);
				c01.y = MulHigh(As<UShort4>(c01.y), f1u0v);
				c11.y = MulHigh(As<UShort4>(c11.y), f0u0v);
			}
			else
			{
				c00.y = MulHigh(c00.y, f1u1vs);
				c10.y = MulHigh(c10.y, f0u1vs);
				c01.y = MulHigh(c01.y, f1u0vs);
				c11.y = MulHigh(c11.y, f0u0vs);
			}

			output.y = (c00.y + c10.y) + (c01.y + c11.y);
			if(!hasUnsignedTextureComponent(1)) output.y = AddSat(output.y, output.y);  // Correct for signed fractions
		}
	}

	if(componentCount >= 3)
	{
		if(has16bitTextureComponents() && hasUnsignedTextureComponent(2))
		{
			c00.z = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0u) + MulHigh(As<UShort4>(c10.z), f0u);
			c01.z = As<UShort4>(c01.z) - MulHigh(As<UShort4>(c01.z), f0u) + MulHigh(As<UShort4>(c11.z), f0u);
			output.z = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0v) + MulHigh(As<UShort4>(c01.z), f0v);
		}
		else
		{
			if(hasUnsignedTextureComponent(2))
			{
				c00.z = MulHigh(As<UShort4>(c00.z), f1u1v);
				c10.z = MulHigh(As<UShort4>(c10.z), f0u1v);
				c01.z = MulHigh(As<UShort4>(c01.z), f1u0v);
				c11.z = MulHigh(As<UShort4>(c11.z), f0u0v);
			}
			else
			{
				c00.z = MulHigh(c00.z, f1u1vs);
				c10.z = MulHigh(c10.z, f0u1vs);
				c01.z = MulHigh(c01.z, f1u0vs);
				c11.z = MulHigh(c11.z, f0u0vs);
			}

			output.z = (c00.z + c10.z) + (c01.z + c11.z);
			if(!hasUnsignedTextureComponent(2)) output.z = AddSat(output.z, output.z);  // Correct for signed fractions
		}
	}

	if(componentCount >= 4)
	{
		if(has16bitTextureComponents() && hasUnsignedTextureComponent(3))
		{
			c00.w = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0u) + MulHigh(As<UShort4>(c10.w), f0u);
			c01.w = As<UShort4>(c01.w) - MulHigh(As<UShort4>(c01.w), f0u) + MulHigh(As<UShort4>(c11.w), f0u);
			output.w = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0v) + MulHigh(As<UShort4>(c01.w), f0v);
		}
		else
		{
			if(hasUnsignedTextureComponent(3))
			{
				c00.w = MulHigh(As<UShort4>(c00.w), f1u1v);
				c10.w = MulHigh(As<UShort4>(c10.w), f0u1v);
				c01.w = MulHigh(As<UShort4>(c01.w), f1u0v);
				c11.w = MulHigh(As<UShort4>(c11.w), f0u0v);
			}
			else
			{
				c00.w = MulHigh(c00.w, f1u1vs);
				c10.w = MulHigh(c10.w, f0u1vs);
				c01.w = MulHigh(c01.w, f1u0vs);
				c11.w = MulHigh(c11.w, f0u0vs);
			}

			output.w = (c00.w + c10.w) + (c01.w + c11.w);
			if(!hasUnsignedTextureComponent(3)) output.w = AddSat(output.w, output.w);  // Correct for signed fractions
		}
	}
}

Vector4s SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	Vector4s c;

	bool gather = (state.textureFilter == FILTER_GATHER);

	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));

	applyOffset(u, v, w, offset, mipmap);

	Short4 uuuu = address(u, state.addressingModeU);
	Short4 vvvv = address(v, state.addressingModeV);
	Short4 wwww = address(w, state.addressingModeW);
	Short4 layerIndex = computeLayerIndex16(a, mipmap);

	if(isYcbcrFormat())
	{
		uint8_t lumaBits = 8;
		uint8_t chromaBits = 8;
		switch(state.textureFormat)
		{
		case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
		case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
			lumaBits = 8;
			chromaBits = 8;
			break;
		case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
			lumaBits = 10;
			chromaBits = 10;
			break;
		default:
			UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
			break;
		}

		// TODO: investigate apparent precision losses in dEQP-VK.ycbcr when sampling and interpolating with Short4.

		// Unnormalized YUV values in [0, 255] for 8-bit formats, [0, 1023] for 10-bit formats.
		Vector4f yuv;
		Vector4f yuv00;
		Vector4f yuv10;
		Vector4f yuv01;
		Vector4f yuv11;

		if(state.textureFilter == FILTER_POINT)
		{
			sampleLumaTexel(yuv, uuuu, vvvv, wwww, layerIndex, sample, mipmap, buffer);
		}
		else
		{
			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
			Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
			Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);

			sampleLumaTexel(yuv00, uuuu0, vvvv0, wwww, layerIndex, sample, mipmap, buffer);
			sampleLumaTexel(yuv01, uuuu0, vvvv1, wwww, layerIndex, sample, mipmap, buffer);
			sampleLumaTexel(yuv10, uuuu1, vvvv0, wwww, layerIndex, sample, mipmap, buffer);
			sampleLumaTexel(yuv11, uuuu1, vvvv1, wwww, layerIndex, sample, mipmap, buffer);

			bilinearInterpolateFloat(yuv, uuuu0, vvvv0, yuv00, yuv01, yuv10, yuv11, mipmap, false, true, false, false);
		}

		// Pointers to the planes of YCbCr images are stored in consecutive mipmap levels.
		Pointer<Byte> mipmapU = Pointer<Byte>(mipmap + 1 * sizeof(Mipmap));
		Pointer<Byte> mipmapV = Pointer<Byte>(mipmap + 2 * sizeof(Mipmap));
		Pointer<Byte> bufferU = *Pointer<Pointer<Byte>>(mipmapU + OFFSET(Mipmap, buffer));  // U/V for 2-plane interleaved formats.
		Pointer<Byte> bufferV = *Pointer<Pointer<Byte>>(mipmapV + OFFSET(Mipmap, buffer));

		// Chroma reconstruction follows
		// https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#textures-implict-reconstruction
		// but using normalized coordinates.
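		// With COSITED_EVEN siting, chroma samples are co-located with even luma
		// samples, half a luma texel (a quarter chroma texel) from the midpoint
		// position, i.e. 0.25 / chromaDimension in normalized coordinates.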
		Float4 chromaU = u;
		Float4 chromaV = v;
		if(state.chromaXOffset == VK_CHROMA_LOCATION_COSITED_EVEN)
		{
			chromaU += (Float4(0.25f) / Float4(*Pointer<UInt4>(mipmapU + OFFSET(Mipmap, width))));
		}
		if(state.chromaYOffset == VK_CHROMA_LOCATION_COSITED_EVEN)
		{
			chromaV += (Float4(0.25f) / Float4(*Pointer<UInt4>(mipmapU + OFFSET(Mipmap, height))));
		}

		Short4 chromaUUUU = address(chromaU, state.addressingModeU);
		Short4 chromaVVVV = address(chromaV, state.addressingModeV);

		if(state.chromaFilter == FILTER_POINT)
		{
			sampleChromaTexel(yuv, chromaUUUU, chromaVVVV, wwww, layerIndex, sample, mipmapU, bufferU, mipmapV, bufferV);
		}
		else
		{
			Short4 chromaUUUU0 = offsetSample(chromaUUUU, mipmapU, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
			Short4 chromaVVVV0 = offsetSample(chromaVVVV, mipmapU, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
			Short4 chromaUUUU1 = offsetSample(chromaUUUU, mipmapU, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
			Short4 chromaVVVV1 = offsetSample(chromaVVVV, mipmapU, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);

			sampleChromaTexel(yuv00, chromaUUUU0, chromaVVVV0, wwww, layerIndex, sample, mipmapU, bufferU, mipmapV, bufferV);
			sampleChromaTexel(yuv01, chromaUUUU0, chromaVVVV1, wwww, layerIndex, sample, mipmapU, bufferU, mipmapV, bufferV);
			sampleChromaTexel(yuv10, chromaUUUU1, chromaVVVV0, wwww, layerIndex, sample, mipmapU, bufferU, mipmapV, bufferV);
			sampleChromaTexel(yuv11, chromaUUUU1, chromaVVVV1, wwww, layerIndex, sample, mipmapU, bufferU, mipmapV, bufferV);

			bilinearInterpolateFloat(yuv, chromaUUUU0, chromaVVVV0, yuv00, yuv01, yuv10, yuv11, mipmapU, true, false, true, false);
		}

		if(state.swappedChroma)
		{
			std::swap(yuv.x, yuv.z);
		}

		if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
		{
			// Scale to the 15-bit output range.
			c.x = UShort4(yuv.x) << (15 - chromaBits);
			c.y = UShort4(yuv.y) << (15 - lumaBits);
			c.z = UShort4(yuv.z) << (15 - chromaBits);
		}
		else
		{
			const float twoPowLumaBits = static_cast<float>(0x1u << lumaBits);
			const float twoPowLumaBitsMinus8 = static_cast<float>(0x1u << (lumaBits - 8));
			const float twoPowChromaBits = static_cast<float>(0x1u << chromaBits);
			const float twoPowChromaBitsMinus1 = static_cast<float>(0x1u << (chromaBits - 1));
			const float twoPowChromaBitsMinus8 = static_cast<float>(0x1u << (chromaBits - 8));

			Float4 y = Float4(yuv.y);
			Float4 u = Float4(yuv.z);
			Float4 v = Float4(yuv.x);

			if(state.studioSwing)
			{
				// See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#QUANTIZATION_NARROW
				y = ((y / Float4(twoPowLumaBitsMinus8)) - Float4(16.0f)) / Float4(219.0f);
				u = ((u / Float4(twoPowChromaBitsMinus8)) - Float4(128.0f)) / Float4(224.0f);
				v = ((v / Float4(twoPowChromaBitsMinus8)) - Float4(128.0f)) / Float4(224.0f);
			}
			else
			{
				// See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#QUANTIZATION_FULL
				y = y / Float4(twoPowLumaBits - 1.0f);
				u = (u - Float4(twoPowChromaBitsMinus1)) / Float4(twoPowChromaBits - 1.0f);
				v = (v - Float4(twoPowChromaBitsMinus1)) / Float4(twoPowChromaBits - 1.0f);
			}

			// Now, `y` is in [0, 1] and `u` and `v` are in [-0.5, 0.5].

			if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY)
			{
				c.x = Short4(v * static_cast<float>(0x7FFF));
				c.y = Short4(y * static_cast<float>(0x7FFF));
				c.z = Short4(u * static_cast<float>(0x7FFF));
			}
			else
			{
				// Generic YCbCr to RGB transformation:
				// R = Y + 2 * (1 - Kr) * Cr
				// G = Y - 2 * Kb * (1 - Kb) / Kg * Cb - 2 * Kr * (1 - Kr) / Kg * Cr
				// B = Y + 2 * (1 - Kb) * Cb

				float Kb = 0.114f;
				float Kr = 0.299f;

				switch(state.ycbcrModel)
				{
				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709:
					Kb = 0.0722f;
					Kr = 0.2126f;
					break;
				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601:
					Kb = 0.114f;
					Kr = 0.299f;
					break;
				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020:
					Kb = 0.0593f;
					Kr = 0.2627f;
					break;
				default:
					UNSUPPORTED("ycbcrModel %d", int(state.ycbcrModel));
				}

				const float Kg = 1.0f - Kr - Kb;

				const float Rr = 2 * (1 - Kr);
				const float Gb = -2 * Kb * (1 - Kb) / Kg;
				const float Gr = -2 * Kr * (1 - Kr) / Kg;
				const float Bb = 2 * (1 - Kb);

				Float4 r = y + Float4(Rr) * v;
				Float4 g = y + Float4(Gb) * u + Float4(Gr) * v;
				Float4 b = y + Float4(Bb) * u;

				c.x = Short4(r * static_cast<float>(0x7FFF));
				c.y = Short4(g * static_cast<float>(0x7FFF));
				c.z = Short4(b * static_cast<float>(0x7FFF));
			}
		}
	}
	else  // !isYcbcrFormat()
	{
		if(state.textureFilter == FILTER_POINT)
		{
			c = sampleTexel(uuuu, vvvv, wwww, layerIndex, sample, mipmap, buffer);
		}
		else
		{
			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
			Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
			Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);

			Vector4s c00 = sampleTexel(uuuu0, vvvv0, wwww, layerIndex, sample, mipmap, buffer);
			Vector4s c10 = sampleTexel(uuuu1, vvvv0, wwww, layerIndex, sample, mipmap, buffer);
			Vector4s c01 = sampleTexel(uuuu0, vvvv1, wwww, layerIndex, sample, mipmap, buffer);
			Vector4s c11 = sampleTexel(uuuu1, vvvv1, wwww, layerIndex, sample, mipmap, buffer);

			if(!gather)  // Blend
			{
				bilinearInterpolate(c, uuuu0, vvvv0, c00, c01, c10, c11, mipmap);
			}
			else
			{
				VkComponentSwizzle swizzle = gatherSwizzle();
				switch(swizzle)
				{
				case VK_COMPONENT_SWIZZLE_ZERO:
				case VK_COMPONENT_SWIZZLE_ONE:
					// Handled at the final component swizzle.
					break;
				default:
					c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
					c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
					c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
					c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
					break;
				}
			}
		}
	}

	return c;
}

Vector4s SamplerCore::sample3D(Pointer<Byte> &texture, Float4 &u_, Float4 &v_, Float4 &w_, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	Vector4s c_;

	int componentCount = textureComponentCount();

	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));

	applyOffset(u_, v_, w_, offset, mipmap);

	Short4 uuuu = address(u_, state.addressingModeU);
	Short4 vvvv = address(v_, state.addressingModeV);
	Short4 wwww = address(w_, state.addressingModeW);

	if(state.textureFilter == FILTER_POINT)
	{
		c_ = sampleTexel(uuuu, vvvv, wwww, 0, sample, mipmap, buffer);
	}
	else
	{
		Vector4s c[2][2][2];

		Short4 u[2][2][2];
		Short4 v[2][2][2];
		Short4 s[2][2][2];

		for(int i = 0; i < 2; i++)
		{
			for(int j = 0; j < 2; j++)
			{
				for(int k = 0; k < 2; k++)
				{
					u[i][j][k] = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, i * 2 - 1, lod);
					v[i][j][k] = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, j * 2 - 1, lod);
					s[i][j][k] = offsetSample(wwww, mipmap, OFFSET(Mipmap, wHalf), state.addressingModeW == ADDRESSING_WRAP, k * 2 - 1, lod);
				}
			}
		}

		// Fractions
		UShort4 f0u = As<UShort4>(u[0][0][0]) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width)));
		UShort4 f0v = As<UShort4>(v[0][0][0]) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height)));
		UShort4 f0s = As<UShort4>(s[0][0][0]) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, depth)));

		UShort4 f1u = ~f0u;
		UShort4 f1v = ~f0v;
		UShort4 f1s = ~f0s;

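		// The eight trilinear corner weights are products of the per-axis
		// fractions; f[i][j][k] is applied to the opposite corner c[i][j][k]
		// (via the [1 - i][1 - j][1 - k] indexing in the loop below).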
		UShort4 f[2][2][2];
		Short4 fs[2][2][2];

		f[1][1][1] = MulHigh(f1u, f1v);
		f[0][1][1] = MulHigh(f0u, f1v);
		f[1][0][1] = MulHigh(f1u, f0v);
		f[0][0][1] = MulHigh(f0u, f0v);
		f[1][1][0] = MulHigh(f1u, f1v);
		f[0][1][0] = MulHigh(f0u, f1v);
		f[1][0][0] = MulHigh(f1u, f0v);
		f[0][0][0] = MulHigh(f0u, f0v);

		f[1][1][1] = MulHigh(f[1][1][1], f1s);
		f[0][1][1] = MulHigh(f[0][1][1], f1s);
		f[1][0][1] = MulHigh(f[1][0][1], f1s);
		f[0][0][1] = MulHigh(f[0][0][1], f1s);
		f[1][1][0] = MulHigh(f[1][1][0], f0s);
		f[0][1][0] = MulHigh(f[0][1][0], f0s);
		f[1][0][0] = MulHigh(f[1][0][0], f0s);
		f[0][0][0] = MulHigh(f[0][0][0], f0s);

		// Signed fractions
		if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
		{
			fs[0][0][0] = f[0][0][0] >> 1;
			fs[0][0][1] = f[0][0][1] >> 1;
			fs[0][1][0] = f[0][1][0] >> 1;
			fs[0][1][1] = f[0][1][1] >> 1;
			fs[1][0][0] = f[1][0][0] >> 1;
			fs[1][0][1] = f[1][0][1] >> 1;
			fs[1][1][0] = f[1][1][0] >> 1;
			fs[1][1][1] = f[1][1][1] >> 1;
		}

		for(int i = 0; i < 2; i++)
		{
			for(int j = 0; j < 2; j++)
			{
				for(int k = 0; k < 2; k++)
				{
					c[i][j][k] = sampleTexel(u[i][j][k], v[i][j][k], s[i][j][k], 0, sample, mipmap, buffer);

					if(componentCount >= 1)
					{
						if(hasUnsignedTextureComponent(0))
							c[i][j][k].x = MulHigh(As<UShort4>(c[i][j][k].x), f[1 - i][1 - j][1 - k]);
						else
							c[i][j][k].x = MulHigh(c[i][j][k].x, fs[1 - i][1 - j][1 - k]);
					}
					if(componentCount >= 2)
					{
						if(hasUnsignedTextureComponent(1))
							c[i][j][k].y = MulHigh(As<UShort4>(c[i][j][k].y), f[1 - i][1 - j][1 - k]);
						else
							c[i][j][k].y = MulHigh(c[i][j][k].y, fs[1 - i][1 - j][1 - k]);
					}
					if(componentCount >= 3)
					{
						if(hasUnsignedTextureComponent(2))
							c[i][j][k].z = MulHigh(As<UShort4>(c[i][j][k].z), f[1 - i][1 - j][1 - k]);
						else
							c[i][j][k].z = MulHigh(c[i][j][k].z, fs[1 - i][1 - j][1 - k]);
					}
					if(componentCount >= 4)
					{
						if(hasUnsignedTextureComponent(3))
							c[i][j][k].w = MulHigh(As<UShort4>(c[i][j][k].w), f[1 - i][1 - j][1 - k]);
						else
							c[i][j][k].w = MulHigh(c[i][j][k].w, fs[1 - i][1 - j][1 - k]);
					}

					if(i != 0 || j != 0 || k != 0)
					{
						if(componentCount >= 1) c[0][0][0].x += c[i][j][k].x;
						if(componentCount >= 2) c[0][0][0].y += c[i][j][k].y;
						if(componentCount >= 3) c[0][0][0].z += c[i][j][k].z;
						if(componentCount >= 4) c[0][0][0].w += c[i][j][k].w;
					}
				}
			}
		}

		if(componentCount >= 1) c_.x = c[0][0][0].x;
		if(componentCount >= 2) c_.y = c[0][0][0].y;
		if(componentCount >= 3) c_.z = c[0][0][0].z;
		if(componentCount >= 4) c_.w = c[0][0][0].w;

		// Correct for signed fractions
		if(componentCount >= 1)
			if(!hasUnsignedTextureComponent(0)) c_.x = AddSat(c_.x, c_.x);
		if(componentCount >= 2)
			if(!hasUnsignedTextureComponent(1)) c_.y = AddSat(c_.y, c_.y);
		if(componentCount >= 3)
			if(!hasUnsignedTextureComponent(2)) c_.z = AddSat(c_.z, c_.z);
		if(componentCount >= 4)
			if(!hasUnsignedTextureComponent(3)) c_.w = AddSat(c_.w, c_.w);
	}

	return c_;
}

Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta)
{
	Vector4f c = sampleFloatAniso(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, false);

	if(function == Fetch)
	{
		return c;
	}

	if(state.mipmapFilter == MIPMAP_LINEAR)
	{
		Vector4f cc = sampleFloatAniso(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, true);

		Float4 lod4 = Float4(Frac(lod));

		c.x = (cc.x - c.x) * lod4 + c.x;
		c.y = (cc.y - c.y) * lod4 + c.y;
		c.z = (cc.z - c.z) * lod4 + c.z;
		c.w = (cc.w - c.w) * lod4 + c.w;
	}

	return c;
}

Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD)
{
	Vector4f c;

	if(state.textureFilter != FILTER_ANISOTROPIC)
	{
		c = sampleFloat(texture, u, v, w, a, dRef, offset, sample, lod, secondLOD);
	}
	else
	{
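		// Float path of the anisotropic loop: B (uvStart) positions the first
		// sample on the major axis and A (uvWeight) serves as both the step
		// scale and the per-sample weight, so the N weighted samples sum to
		// the final color.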
		Int N = RoundInt(anisotropy);

		Vector4f cSum;

		cSum.x = Float4(0.0f);
		cSum.y = Float4(0.0f);
		cSum.z = Float4(0.0f);
		cSum.w = Float4(0.0f);

		Float4 A = *Pointer<Float4>(constants + OFFSET(Constants, uvWeight) + 16 * N);
		Float4 B = *Pointer<Float4>(constants + OFFSET(Constants, uvStart) + 16 * N);

		Float4 du = uDelta;
		Float4 dv = vDelta;

		Float4 u0 = u + B * du;
		Float4 v0 = v + B * dv;

		du *= A;
		dv *= A;

		Int i = 0;

		Do
		{
			c = sampleFloat(texture, u0, v0, w, a, dRef, offset, sample, lod, secondLOD);

			u0 += du;
			v0 += dv;

			cSum.x += c.x * A;
			cSum.y += c.y * A;
			cSum.z += c.z * A;
			cSum.w += c.w * A;

			i++;
		}
		Until(i >= N);

		c.x = cSum.x;
		c.y = cSum.y;
		c.z = cSum.z;
		c.w = cSum.w;
	}

	return c;
}

Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
	{
		return sampleFloat2D(texture, u, v, w, a, dRef, offset, sample, lod, secondLOD);
	}
	else
	{
		return sampleFloat3D(texture, u, v, w, dRef, offset, sample, lod, secondLOD);
	}
}

Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	Vector4f c;

	int componentCount = textureComponentCount();
	bool gather = (state.textureFilter == FILTER_GATHER);

	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));

	applyOffset(u, v, w, offset, mipmap);

	Int4 x0, x1, y0, y1;
	Float4 fu, fv;
	Int4 filter = computeFilterOffset(lod);
	address(u, x0, x1, fu, mipmap, filter, OFFSET(Mipmap, width), state.addressingModeU);
	address(v, y0, y1, fv, mipmap, filter, OFFSET(Mipmap, height), state.addressingModeV);

	Int4 pitchP = As<Int4>(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, pitchP), 16));
	y0 *= pitchP;

	Int4 z;
	if(state.isCube() || state.isArrayed())
	{
		Int4 face = As<Int4>(w);
		Int4 layerIndex = computeLayerIndex(a, mipmap);

		// For cube maps, the layer argument is per cube, each of which has 6 layers
		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			layerIndex *= Int4(6);
		}

		z = state.isCube() ? face : layerIndex;

		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			z += layerIndex;
		}

		z *= *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
	}

	if(state.textureFilter == FILTER_POINT || (function == Fetch))
	{
		c = sampleTexel(x0, y0, z, dRef, sample, mipmap, buffer);
	}
	else
	{
		y1 *= pitchP;

		Vector4f c00 = sampleTexel(x0, y0, z, dRef, sample, mipmap, buffer);
		Vector4f c10 = sampleTexel(x1, y0, z, dRef, sample, mipmap, buffer);
		Vector4f c01 = sampleTexel(x0, y1, z, dRef, sample, mipmap, buffer);
		Vector4f c11 = sampleTexel(x1, y1, z, dRef, sample, mipmap, buffer);

		if(!gather)  // Blend
		{
			if(componentCount >= 1) c00.x = c00.x + fu * (c10.x - c00.x);
			if(componentCount >= 2) c00.y = c00.y + fu * (c10.y - c00.y);
			if(componentCount >= 3) c00.z = c00.z + fu * (c10.z - c00.z);
			if(componentCount >= 4) c00.w = c00.w + fu * (c10.w - c00.w);

			if(componentCount >= 1) c01.x = c01.x + fu * (c11.x - c01.x);
			if(componentCount >= 2) c01.y = c01.y + fu * (c11.y - c01.y);
			if(componentCount >= 3) c01.z = c01.z + fu * (c11.z - c01.z);
			if(componentCount >= 4) c01.w = c01.w + fu * (c11.w - c01.w);

			if(componentCount >= 1) c.x = c00.x + fv * (c01.x - c00.x);
			if(componentCount >= 2) c.y = c00.y + fv * (c01.y - c00.y);
			if(componentCount >= 3) c.z = c00.z + fv * (c01.z - c00.z);
			if(componentCount >= 4) c.w = c00.w + fv * (c01.w - c00.w);
		}
		else  // Gather
		{
			VkComponentSwizzle swizzle = gatherSwizzle();
			switch(swizzle)
			{
			case VK_COMPONENT_SWIZZLE_ZERO:
			case VK_COMPONENT_SWIZZLE_ONE:
				// Handled at the final component swizzle.
				break;
			default:
				c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
				c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
				c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
				c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
				break;
			}
		}
	}

	return c;
}

Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	Vector4f c;

	int componentCount = textureComponentCount();

	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));

	applyOffset(u, v, w, offset, mipmap);

	Int4 x0, x1, y0, y1, z0, z1;
	Float4 fu, fv, fw;
	Int4 filter = computeFilterOffset(lod);
	address(u, x0, x1, fu, mipmap, filter, OFFSET(Mipmap, width), state.addressingModeU);
	address(v, y0, y1, fv, mipmap, filter, OFFSET(Mipmap, height), state.addressingModeV);
	address(w, z0, z1, fw, mipmap, filter, OFFSET(Mipmap, depth), state.addressingModeW);

	Int4 pitchP = As<Int4>(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, pitchP), 16));
	Int4 sliceP = As<Int4>(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP), 16));
	y0 *= pitchP;
	z0 *= sliceP;

	if(state.textureFilter == FILTER_POINT || (function == Fetch))
	{
		c = sampleTexel(x0, y0, z0, dRef, sample, mipmap, buffer);
	}
	else
	{
		y1 *= pitchP;
		z1 *= sliceP;

		Vector4f c000 = sampleTexel(x0, y0, z0, dRef, sample, mipmap, buffer);
		Vector4f c100 = sampleTexel(x1, y0, z0, dRef, sample, mipmap, buffer);
		Vector4f c010 = sampleTexel(x0, y1, z0, dRef, sample, mipmap, buffer);
		Vector4f c110 = sampleTexel(x1, y1, z0, dRef, sample, mipmap, buffer);
		Vector4f c001 = sampleTexel(x0, y0, z1, dRef, sample, mipmap, buffer);
		Vector4f c101 = sampleTexel(x1, y0, z1, dRef, sample, mipmap, buffer);
		Vector4f c011 = sampleTexel(x0, y1, z1, dRef, sample, mipmap, buffer);
		Vector4f c111 = sampleTexel(x1, y1, z1, dRef, sample, mipmap, buffer);

		// Blend first slice
		if(componentCount >= 1) c000.x = c000.x + fu * (c100.x - c000.x);
		if(componentCount >= 2) c000.y = c000.y + fu * (c100.y - c000.y);
		if(componentCount >= 3) c000.z = c000.z + fu * (c100.z - c000.z);
		if(componentCount >= 4) c000.w = c000.w + fu * (c100.w - c000.w);

		if(componentCount >= 1) c010.x = c010.x + fu * (c110.x - c010.x);
		if(componentCount >= 2) c010.y = c010.y + fu * (c110.y - c010.y);
		if(componentCount >= 3) c010.z = c010.z + fu * (c110.z - c010.z);
		if(componentCount >= 4) c010.w = c010.w + fu * (c110.w - c010.w);

		if(componentCount >= 1) c000.x = c000.x + fv * (c010.x - c000.x);
		if(componentCount >= 2) c000.y = c000.y + fv * (c010.y - c000.y);
		if(componentCount >= 3) c000.z = c000.z + fv * (c010.z - c000.z);
		if(componentCount >= 4) c000.w = c000.w + fv * (c010.w - c000.w);

		// Blend second slice
		if(componentCount >= 1) c001.x = c001.x + fu * (c101.x - c001.x);
		if(componentCount >= 2) c001.y = c001.y + fu * (c101.y - c001.y);
		if(componentCount >= 3) c001.z = c001.z + fu * (c101.z - c001.z);
		if(componentCount >= 4) c001.w = c001.w + fu * (c101.w - c001.w);

		if(componentCount >= 1) c011.x = c011.x + fu * (c111.x - c011.x);
		if(componentCount >= 2) c011.y = c011.y + fu * (c111.y - c011.y);
		if(componentCount >= 3) c011.z = c011.z + fu * (c111.z - c011.z);
		if(componentCount >= 4) c011.w = c011.w + fu * (c111.w - c011.w);

		if(componentCount >= 1) c001.x = c001.x + fv * (c011.x - c001.x);
		if(componentCount >= 2) c001.y = c001.y + fv * (c011.y - c001.y);
		if(componentCount >= 3) c001.z = c001.z + fv * (c011.z - c001.z);
		if(componentCount >= 4) c001.w = c001.w + fv * (c011.w - c001.w);

		// Blend slices
		if(componentCount >= 1) c.x = c000.x + fw * (c001.x - c000.x);
		if(componentCount >= 2) c.y = c000.y + fw * (c001.y - c000.y);
		if(componentCount >= 3) c.z = c000.z + fw * (c001.z - c000.z);
		if(componentCount >= 4) c.w = c000.w + fw * (c001.w - c000.w);
	}

	return c;
}

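// Fast approximate base-2 logarithms. Reinterpreting a positive float's bits
// as an integer yields (exponent + 127) * 2^23 plus a linear mantissa term,
// a piecewise-linear approximation of log2 once the bias is subtracted and
// the result is scaled by 2^-23. Squaring first doubles the exponent for an
// extra bit of precision; the 0.25 (resp. 0.5) factor folds in the sqrt.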
static Float log2sqrt(Float lod)
{
	// log2(sqrt(lod))                               // Equals 0.25 * log2(lod^2).
	lod *= lod;                                      // Squaring doubles the exponent and produces an extra bit of precision.
	lod = Float(As<Int>(lod)) - Float(0x3F800000);   // Interpret as integer and subtract the exponent bias.
	lod *= As<Float>(Int(0x33000000));               // Scale by 0.25 * 2^-23 (mantissa length).

	return lod;
}

static Float log2(Float lod)
{
	lod *= lod;                                      // Squaring doubles the exponent and produces an extra bit of precision.
	lod = Float(As<Int>(lod)) - Float(0x3F800000);   // Interpret as integer and subtract the exponent bias.
	lod *= As<Float>(Int(0x33800000));               // Scale by 0.5 * 2^-23 (mantissa length).

	return lod;
}

void SamplerCore::computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, const Float4 &dsx, const Float4 &dsy)
{
	Float4 dudxy;

	if(function != Grad)  // Implicit
	{
		dudxy = uuuu.yz - uuuu.xx;
	}
	else
	{
		dudxy = UnpackLow(dsx, dsy);
	}

	// Scale by texture dimensions.
	Float4 dUdxy = dudxy * *Pointer<Float4>(texture + OFFSET(Texture, widthWidthHeightHeight));

	// Note we could take the absolute value here and omit the square root below,
	// but this is more consistent with the 2D calculation and still cheap.
	Float4 dU2dxy = dUdxy * dUdxy;

	lod = Max(Float(dU2dxy.x), Float(dU2dxy.y));
	lod = log2sqrt(lod);
}

void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, const Float4 &dsx, const Float4 &dsy)
{
	Float4 duvdxy;

	if(function != Grad)  // Implicit
	{
		duvdxy = Float4(uuuu.yz, vvvv.yz) - Float4(uuuu.xx, vvvv.xx);
	}
	else
	{
		Float4 dudxy = Float4(dsx.xx, dsy.xx);
		Float4 dvdxy = Float4(dsx.yy, dsy.yy);

		duvdxy = Float4(dudxy.xz, dvdxy.xz);
	}

	// Scale by texture dimensions.
	Float4 dUVdxy = duvdxy * *Pointer<Float4>(texture + OFFSET(Texture, widthWidthHeightHeight));

	Float4 dUV2dxy = dUVdxy * dUVdxy;
	Float4 dUV2 = dUV2dxy.xy + dUV2dxy.zw;

	lod = Max(Float(dUV2.x), Float(dUV2.y));  // Square length of major axis

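	// The determinant of the UV Jacobian measures the footprint's area, so
	// anisotropy ~ major^2 / area ~ major / minor. Dividing the squared major
	// length by anisotropy^2 then leaves the squared minor axis length, from
	// which the LOD is derived.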
	if(state.textureFilter == FILTER_ANISOTROPIC)
	{
		Float det = Abs(Float(dUVdxy.x) * Float(dUVdxy.w) - Float(dUVdxy.y) * Float(dUVdxy.z));

		Float4 dudx = duvdxy.xxxx;
		Float4 dudy = duvdxy.yyyy;
		Float4 dvdx = duvdxy.zzzz;
		Float4 dvdy = duvdxy.wwww;

		Int4 mask = As<Int4>(CmpNLT(dUV2.x, dUV2.y));
		uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
		vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));

		anisotropy = lod * Rcp(det, true /* relaxedPrecision */);
		anisotropy = Min(anisotropy, state.maxAnisotropy);

		// TODO(b/151263485): While we always need `lod` above, when there's only
		// a single mipmap level the following calculations could be skipped.
		lod *= Rcp(anisotropy * anisotropy, true /* relaxedPrecision */);
	}

	lod = log2sqrt(lod);  // log2(sqrt(lod))
}

void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float4 &dsx, const Float4 &dsy, Float4 &M)
{
	Float4 dudxy, dvdxy, dsdxy;

	if(function != Grad)  // Implicit
	{
		Float4 U = u * M;
		Float4 V = v * M;
		Float4 W = w * M;

		dudxy = Abs(U - U.xxxx);
		dvdxy = Abs(V - V.xxxx);
		dsdxy = Abs(W - W.xxxx);
	}
	else
	{
		dudxy = Float4(dsx.xx, dsy.xx);
		dvdxy = Float4(dsx.yy, dsy.yy);
		dsdxy = Float4(dsx.zz, dsy.zz);

		dudxy = Abs(dudxy * Float4(M.x));
		dvdxy = Abs(dvdxy * Float4(M.x));
		dsdxy = Abs(dsdxy * Float4(M.x));
	}

	// Compute the largest Manhattan distance in two dimensions.
	// This takes the footprint across adjacent faces into account.
	Float4 duvdxy = dudxy + dvdxy;
	Float4 dusdxy = dudxy + dsdxy;
	Float4 dvsdxy = dvdxy + dsdxy;

	dudxy = Max(Max(duvdxy, dusdxy), dvsdxy);

	lod = Max(Float(dudxy.y), Float(dudxy.z));  // TODO: Max(dudxy.y, dudxy.z);

	// Scale by texture dimension.
	lod *= *Pointer<Float>(texture + OFFSET(Texture, width));

	lod = log2(lod);
}

void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, const Float4 &dsx, const Float4 &dsy)
{
	Float4 dudxy, dvdxy, dsdxy;

	if(function != Grad)  // Implicit
	{
		dudxy = uuuu - uuuu.xxxx;
		dvdxy = vvvv - vvvv.xxxx;
		dsdxy = wwww - wwww.xxxx;
	}
	else
	{
		dudxy = Float4(dsx.xx, dsy.xx);
		dvdxy = Float4(dsx.yy, dsy.yy);
		dsdxy = Float4(dsx.zz, dsy.zz);
	}

	// Scale by texture dimensions.
	dudxy *= *Pointer<Float4>(texture + OFFSET(Texture, width));
	dvdxy *= *Pointer<Float4>(texture + OFFSET(Texture, height));
	dsdxy *= *Pointer<Float4>(texture + OFFSET(Texture, depth));

	dudxy *= dudxy;
	dvdxy *= dvdxy;
	dsdxy *= dsdxy;

	dudxy += dvdxy;
	dudxy += dsdxy;

	lod = Max(Float(dudxy.y), Float(dudxy.z));  // TODO: Max(dudxy.y, dudxy.z);

	lod = log2sqrt(lod);  // log2(sqrt(lod))
}

Int4 SamplerCore::cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M)
{
	// TODO: Comply with Vulkan recommendation:
	// Vulkan 1.1: "The rules should have as the first rule that rz wins over ry and rx, and the second rule that ry wins over rx."

	Int4 xn = CmpLT(x, 0.0f);  // x < 0
	Int4 yn = CmpLT(y, 0.0f);  // y < 0
	Int4 zn = CmpLT(z, 0.0f);  // z < 0

	Float4 absX = Abs(x);
	Float4 absY = Abs(y);
	Float4 absZ = Abs(z);

	Int4 xy = CmpNLE(absX, absY);  // abs(x) > abs(y)
	Int4 yz = CmpNLE(absY, absZ);  // abs(y) > abs(z)
	Int4 zx = CmpNLE(absZ, absX);  // abs(z) > abs(x)
	Int4 xMajor = xy & ~zx;        // abs(x) > abs(y) && abs(x) > abs(z)
	Int4 yMajor = yz & ~xy;        // abs(y) > abs(z) && abs(y) > abs(x)
	Int4 zMajor = zx & ~yz;        // abs(z) > abs(x) && abs(z) > abs(y)

	// FACE_POSITIVE_X = 000b
	// FACE_NEGATIVE_X = 001b
	// FACE_POSITIVE_Y = 010b
	// FACE_NEGATIVE_Y = 011b
	// FACE_POSITIVE_Z = 100b
	// FACE_NEGATIVE_Z = 101b

	Int yAxis = SignMask(yMajor);
	Int zAxis = SignMask(zMajor);

	Int4 n = ((xn & xMajor) | (yn & yMajor) | (zn & zMajor)) & Int4(0x80000000);
	Int negative = SignMask(n);

	Int faces = *Pointer<Int>(constants + OFFSET(Constants, transposeBit0) + negative * 4);
	faces |= *Pointer<Int>(constants + OFFSET(Constants, transposeBit1) + yAxis * 4);
	faces |= *Pointer<Int>(constants + OFFSET(Constants, transposeBit2) + zAxis * 4);

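	// Each SignMask above packs one bit per SIMD lane. The transposeBit tables
	// map those 4-bit masks to nibble-packed values, so ORing them yields four
	// 3-bit face indices, one per lane, extracted below.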
1535 Int4 face;
1536 face.x = faces & 0x7;
1537 face.y = (faces >> 4) & 0x7;
1538 face.z = (faces >> 8) & 0x7;
1539 face.w = (faces >> 12) & 0x7;
1540
1541 M = Max(Max(absX, absY), absZ);
1542
1543 // U = xMajor ? (neg ^ -z) : ((zMajor & neg) ^ x)
1544 U = As<Float4>((xMajor & (n ^ As<Int4>(-z))) | (~xMajor & ((zMajor & n) ^ As<Int4>(x))));
1545
1546 // V = !yMajor ? -y : (n ^ z)
1547 V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));
1548
1549 M = reciprocal(M) * 0.5f;
1550 U = U * M + 0.5f;
1551 V = V * M + 0.5f;
1552
1553 return face;
1554 }
1555
applyOffset(Float4 & u,Float4 & v,Float4 & w,Vector4i & offset,Pointer<Byte> mipmap)1556 void SamplerCore::applyOffset(Float4 &u, Float4 &v, Float4 &w, Vector4i &offset, Pointer<Byte> mipmap)
1557 {
1558 if(function.offset)
1559 {
1560 if(function == Fetch)
1561 {
1562 // Unnormalized coordinates
1563 u = As<Float4>(As<Int4>(u) + offset.x);
1564 if(state.is2D() || state.is3D() || state.isCube())
1565 {
1566 v = As<Float4>(As<Int4>(v) + offset.y);
1567 if(state.is3D())
1568 {
1569 w = As<Float4>(As<Int4>(w) + offset.z);
1570 }
1571 }
1572 }
1573 else
1574 {
1575 // Normalized coordinates
1576 UInt4 width = *Pointer<UInt4>(mipmap + OFFSET(Mipmap, width));
1577 u += Float4(offset.x) / Float4(width);
1578 if(state.is2D() || state.is3D() || state.isCube())
1579 {
1580 UInt4 height = *Pointer<UInt4>(mipmap + OFFSET(Mipmap, height));
1581 v += Float4(offset.y) / Float4(height);
1582 if(state.is3D())
1583 {
1584 UInt4 depth = *Pointer<UInt4>(mipmap + OFFSET(Mipmap, depth));
1585 w += Float4(offset.z) / Float4(depth);
1586 }
1587 }
1588 }
1589 }
1590 }
1591
1592 void SamplerCore::computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, const Short4 &layerIndex, const Int4 &sample, const Pointer<Byte> &mipmap)
1593 {
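// u is in 0.16 fixed point; the unsigned high multiply by the mipmap width
// below computes floor(u * width), the integer texel column.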
1594 uuuu = MulHigh(As<UShort4>(uuuu), UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width))));
1595
1596 UInt4 indices = Int4(uuuu);
1597
1598 if(state.is2D() || state.is3D() || state.isCube())
1599 {
1600 vvvv = MulHigh(As<UShort4>(vvvv), UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height))));
1601
1602 Short4 uv0uv1 = As<Short4>(UnpackLow(uuuu, vvvv));
1603 Short4 uv2uv3 = As<Short4>(UnpackHigh(uuuu, vvvv));
1604 Int2 i01 = MulAdd(uv0uv1, *Pointer<Short4>(mipmap + OFFSET(Mipmap, onePitchP)));
1605 Int2 i23 = MulAdd(uv2uv3, *Pointer<Short4>(mipmap + OFFSET(Mipmap, onePitchP)));
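// onePitchP interleaves (1, pitch) pairs, so each MulAdd computes
// u + v * pitch for two texels at once.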
1606
1607 indices = UInt4(As<UInt2>(i01), As<UInt2>(i23));
1608 }
1609
1610 if(state.is3D())
1611 {
1612 wwww = MulHigh(As<UShort4>(wwww), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, depth))));
1613
1614 indices += As<UInt4>(Int4(As<UShort4>(wwww))) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));
1615 }
1616
1617 if(state.isArrayed())
1618 {
1619 Int4 layer = Int4(As<UShort4>(layerIndex));
1620
1621 if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
1622 {
1623 layer *= Int4(6);
1624 }
1625
1626 UInt4 layerOffset = As<UInt4>(layer) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));
1627
1628 indices += layerOffset;
1629 }
1630
1631 if(function.sample)
1632 {
1633 UInt4 sampleOffset = Min(As<UInt4>(sample), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
1634 *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
1635 indices += sampleOffset;
1636 }
1637
1638 index[0] = Extract(indices, 0);
1639 index[1] = Extract(indices, 1);
1640 index[2] = Extract(indices, 2);
1641 index[3] = Extract(indices, 3);
1642 }
1643
1644 void SamplerCore::computeIndices(UInt index[4], Int4 uuuu, Int4 vvvv, Int4 wwww, const Int4 &sample, Int4 valid, const Pointer<Byte> &mipmap)
1645 {
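// By this point the coordinates are assumed to have been scaled to linear element
// offsets by the caller (v by the row pitch, w by the slice pitch), so the
// index is a plain sum.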
1646 UInt4 indices = uuuu;
1647
1648 if(state.is2D() || state.is3D() || state.isCube())
1649 {
1650 indices += As<UInt4>(vvvv);
1651 }
1652
1653 if(state.is3D() || state.isCube() || state.isArrayed())
1654 {
1655 indices += As<UInt4>(wwww);
1656 }
1657
1658 if(function.sample)
1659 {
1660 indices += Min(As<UInt4>(sample), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
1661 *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
1662 }
1663
1664 if(borderModeActive())
1665 {
1666 // Texels out of range are still sampled before being replaced
1667 // with the border color, so sample them at linear index 0.
1668 indices &= As<UInt4>(valid);
1669 }
1670
1671 for(int i = 0; i < 4; i++)
1672 {
1673 index[i] = Extract(As<Int4>(indices), i);
1674 }
1675 }
1676
1677 Vector4s SamplerCore::sampleTexel(UInt index[4], Pointer<Byte> buffer)
1678 {
1679 Vector4s c;
1680
1681 if(has16bitPackedTextureFormat())
1682 {
1683 c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
1684 c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
1685 c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
1686 c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
1687
1688 switch(state.textureFormat)
1689 {
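// Each packed field is shifted so it occupies the high bits of its 16-bit lane
// (e.g. 5-bit fields become 0xF800-aligned), the fixed-point convention the
// later scaling code expects.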
1690 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1691 c.z = (c.x & Short4(0x001Fu)) << 11;
1692 c.y = (c.x & Short4(0x07E0u)) << 5;
1693 c.x = (c.x & Short4(0xF800u));
1694 break;
1695 case VK_FORMAT_B5G6R5_UNORM_PACK16:
1696 c.z = (c.x & Short4(0xF800u));
1697 c.y = (c.x & Short4(0x07E0u)) << 5;
1698 c.x = (c.x & Short4(0x001Fu)) << 11;
1699 break;
1700 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1701 c.w = (c.x << 12) & Short4(0xF000u);
1702 c.z = (c.x << 8) & Short4(0xF000u);
1703 c.y = (c.x << 4) & Short4(0xF000u);
1704 c.x = (c.x) & Short4(0xF000u);
1705 break;
1706 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1707 c.w = (c.x << 12) & Short4(0xF000u);
1708 c.z = (c.x) & Short4(0xF000u);
1709 c.y = (c.x << 4) & Short4(0xF000u);
1710 c.x = (c.x << 8) & Short4(0xF000u);
1711 break;
1712 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1713 c.w = (c.x) & Short4(0xF000u);
1714 c.z = (c.x << 12) & Short4(0xF000u);
1715 c.y = (c.x << 8) & Short4(0xF000u);
1716 c.x = (c.x << 4) & Short4(0xF000u);
1717 break;
1718 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1719 c.w = (c.x) & Short4(0xF000u);
1720 c.z = (c.x << 4) & Short4(0xF000u);
1721 c.y = (c.x << 8) & Short4(0xF000u);
1722 c.x = (c.x << 12) & Short4(0xF000u);
1723 break;
1724 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1725 c.w = (c.x << 15) & Short4(0x8000u);
1726 c.z = (c.x << 10) & Short4(0xF800u);
1727 c.y = (c.x << 5) & Short4(0xF800u);
1728 c.x = (c.x) & Short4(0xF800u);
1729 break;
1730 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1731 c.w = (c.x << 15) & Short4(0x8000u);
1732 c.z = (c.x) & Short4(0xF800u);
1733 c.y = (c.x << 5) & Short4(0xF800u);
1734 c.x = (c.x << 10) & Short4(0xF800u);
1735 break;
1736 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1737 c.w = (c.x) & Short4(0x8000u);
1738 c.z = (c.x << 11) & Short4(0xF800u);
1739 c.y = (c.x << 6) & Short4(0xF800u);
1740 c.x = (c.x << 1) & Short4(0xF800u);
1741 break;
1742 default:
1743 ASSERT(false);
1744 }
1745 }
1746 else if(has8bitTextureComponents())
1747 {
1748 switch(textureComponentCount())
1749 {
1750 case 4:
1751 {
1752 Byte4 c0 = Pointer<Byte4>(buffer)[index[0]];
1753 Byte4 c1 = Pointer<Byte4>(buffer)[index[1]];
1754 Byte4 c2 = Pointer<Byte4>(buffer)[index[2]];
1755 Byte4 c3 = Pointer<Byte4>(buffer)[index[3]];
1756 c.x = Unpack(c0, c1);
1757 c.y = Unpack(c2, c3);
1758
1759 switch(state.textureFormat)
1760 {
1761 case VK_FORMAT_B8G8R8A8_UNORM:
1762 case VK_FORMAT_B8G8R8A8_SRGB:
1763 c.z = As<Short4>(UnpackLow(c.x, c.y));
1764 c.x = As<Short4>(UnpackHigh(c.x, c.y));
1765 c.y = c.z;
1766 c.w = c.x;
1767 c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
1768 c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
1769 c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
1770 c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
1771 break;
1772 case VK_FORMAT_R8G8B8A8_UNORM:
1773 case VK_FORMAT_R8G8B8A8_SNORM:
1774 case VK_FORMAT_R8G8B8A8_SINT:
1775 case VK_FORMAT_R8G8B8A8_SRGB:
1776 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1777 case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
1778 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
1779 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1780 c.z = As<Short4>(UnpackHigh(c.x, c.y));
1781 c.x = As<Short4>(UnpackLow(c.x, c.y));
1782 c.y = c.x;
1783 c.w = c.z;
1784 c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
1785 c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
1786 c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
1787 c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
1788 // Propagate sign bit
1789 if(state.textureFormat == VK_FORMAT_R8G8B8A8_SINT ||
1790 state.textureFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32)
1791 {
1792 c.x >>= 8;
1793 c.y >>= 8;
1794 c.z >>= 8;
1795 c.w >>= 8;
1796 }
1797 break;
1798 case VK_FORMAT_R8G8B8A8_UINT:
1799 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
1800 c.z = As<Short4>(UnpackHigh(c.x, c.y));
1801 c.x = As<Short4>(UnpackLow(c.x, c.y));
1802 c.y = c.x;
1803 c.w = c.z;
1804 c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
1805 c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
1806 c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
1807 c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(Short4(0)));
1808 break;
1809 default:
1810 ASSERT(false);
1811 }
1812 }
1813 break;
1814 case 2:
1815 c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
1816 c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
1817 c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
1818 c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
1819
1820 switch(state.textureFormat)
1821 {
1822 case VK_FORMAT_R8G8_UNORM:
1823 case VK_FORMAT_R8G8_SNORM:
1824 case VK_FORMAT_R8G8_SRGB:
1825 c.y = (c.x & Short4(0xFF00u));
1826 c.x = (c.x << 8);
1827 break;
1828 case VK_FORMAT_R8G8_SINT:
1829 c.y = c.x >> 8;
1830 c.x = (c.x << 8) >> 8; // Propagate sign bit
1831 break;
1832 case VK_FORMAT_R8G8_UINT:
1833 c.y = As<Short4>(As<UShort4>(c.x) >> 8);
1834 c.x &= Short4(0x00FFu);
1835 break;
1836 default:
1837 ASSERT(false);
1838 }
1839 break;
1840 case 1:
1841 {
1842 Int c0 = Int(*Pointer<Byte>(buffer + index[0]));
1843 Int c1 = Int(*Pointer<Byte>(buffer + index[1]));
1844 Int c2 = Int(*Pointer<Byte>(buffer + index[2]));
1845 Int c3 = Int(*Pointer<Byte>(buffer + index[3]));
1846 c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
1847
1848 switch(state.textureFormat)
1849 {
1850 case VK_FORMAT_R8_SINT:
1851 case VK_FORMAT_R8_UINT:
1852 case VK_FORMAT_S8_UINT:
1853 {
1854 Int zero(0);
1855 c.x = Unpack(As<Byte4>(c0), As<Byte4>(zero));
1856 // Propagate sign bit
1857 if(state.textureFormat == VK_FORMAT_R8_SINT)
1858 {
1859 c.x = (c.x << 8) >> 8;
1860 }
1861 }
1862 break;
1863 case VK_FORMAT_R8_SNORM:
1864 case VK_FORMAT_R8_UNORM:
1865 case VK_FORMAT_R8_SRGB:
1866 // TODO: avoid populating the low bits at all.
1867 c.x = Unpack(As<Byte4>(c0));
1868 c.x &= Short4(0xFF00u);
1869 break;
1870 default:
1871 c.x = Unpack(As<Byte4>(c0));
1872 break;
1873 }
1874 }
1875 break;
1876 default:
1877 ASSERT(false);
1878 }
1879 }
1880 else if(has16bitTextureComponents())
1881 {
1882 switch(textureComponentCount())
1883 {
1884 case 4:
1885 c.x = Pointer<Short4>(buffer)[index[0]];
1886 c.y = Pointer<Short4>(buffer)[index[1]];
1887 c.z = Pointer<Short4>(buffer)[index[2]];
1888 c.w = Pointer<Short4>(buffer)[index[3]];
1889 transpose4x4(c.x, c.y, c.z, c.w);
1890 break;
1891 case 2:
1892 c.x = *Pointer<Short4>(buffer + 4 * index[0]);
1893 c.x = As<Short4>(UnpackLow(c.x, *Pointer<Short4>(buffer + 4 * index[1])));
1894 c.z = *Pointer<Short4>(buffer + 4 * index[2]);
1895 c.z = As<Short4>(UnpackLow(c.z, *Pointer<Short4>(buffer + 4 * index[3])));
1896 c.y = c.x;
1897 c.x = UnpackLow(As<Int2>(c.x), As<Int2>(c.z));
1898 c.y = UnpackHigh(As<Int2>(c.y), As<Int2>(c.z));
1899 break;
1900 case 1:
1901 c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
1902 c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
1903 c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
1904 c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
1905 break;
1906 default:
1907 ASSERT(false);
1908 }
1909 }
1910 else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UNORM_PACK32)
1911 {
1912 Int4 cc;
1913 cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
1914 cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
1915 cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
1916 cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
1917
1918 c.x = Short4(cc << 6) & Short4(0xFFC0u);
1919 c.y = Short4(cc >> 4) & Short4(0xFFC0u);
1920 c.z = Short4(cc >> 14) & Short4(0xFFC0u);
1921 c.w = Short4(cc >> 16) & Short4(0xC000u);
1922 }
1923 else if(state.textureFormat == VK_FORMAT_A2R10G10B10_UNORM_PACK32)
1924 {
1925 Int4 cc;
1926 cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
1927 cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
1928 cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
1929 cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
1930
1931 c.x = Short4(cc >> 14) & Short4(0xFFC0u);
1932 c.y = Short4(cc >> 4) & Short4(0xFFC0u);
1933 c.z = Short4(cc << 6) & Short4(0xFFC0u);
1934 c.w = Short4(cc >> 16) & Short4(0xC000u);
1935 }
1936 else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UINT_PACK32)
1937 {
1938 Int4 cc;
1939 cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
1940 cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
1941 cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
1942 cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
1943
1944 c.x = Short4(cc & Int4(0x3FF));
1945 c.y = Short4((cc >> 10) & Int4(0x3FF));
1946 c.z = Short4((cc >> 20) & Int4(0x3FF));
1947 c.w = Short4((cc >> 30) & Int4(0x3));
1948 }
1949 else if(state.textureFormat == VK_FORMAT_A2R10G10B10_UINT_PACK32)
1950 {
1951 Int4 cc;
1952 cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
1953 cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
1954 cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
1955 cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
1956
1957 c.z = Short4((cc & Int4(0x3FF)));
1958 c.y = Short4(((cc >> 10) & Int4(0x3FF)));
1959 c.x = Short4(((cc >> 20) & Int4(0x3FF)));
1960 c.w = Short4(((cc >> 30) & Int4(0x3)));
1961 }
1962 else
1963 ASSERT(false);
1964
1965 if(state.textureFormat.isSRGBformat())
1966 {
1967 for(int i = 0; i < textureComponentCount(); i++)
1968 {
1969 if(isRGBComponent(i))
1970 {
1971 // The current table-based sRGB conversion requires 0xFF00 to represent 1.0.
1972 ASSERT(state.textureFormat.has8bitTextureComponents());
1973
1974 sRGBtoLinearFF00(c[i]);
1975 }
1976 }
1977 }
1978
1979 return c;
1980 }
1981
1982 void SamplerCore::sampleLumaTexel(Vector4f &output, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, const Short4 &layerIndex, const Int4 &sample, Pointer<Byte> &lumaMipmap, Pointer<Byte> lumaBuffer)
1983 {
1984 ASSERT(isYcbcrFormat());
1985
1986 UInt index[4];
1987 computeIndices(index, uuuu, vvvv, wwww, layerIndex, sample, lumaMipmap);
1988
1989 // Luminance (either 8-bit or 10-bit in bottom bits).
1990 UShort4 Y;
1991
1992 switch(state.textureFormat)
1993 {
1994 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
1995 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
1996 {
1997 Y = Insert(Y, UShort(lumaBuffer[index[0]]), 0);
1998 Y = Insert(Y, UShort(lumaBuffer[index[1]]), 1);
1999 Y = Insert(Y, UShort(lumaBuffer[index[2]]), 2);
2000 Y = Insert(Y, UShort(lumaBuffer[index[3]]), 3);
2001 }
2002 break;
2003 case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
2004 {
2005 Y = Insert(Y, Pointer<UShort>(lumaBuffer)[index[0]], 0);
2006 Y = Insert(Y, Pointer<UShort>(lumaBuffer)[index[1]], 1);
2007 Y = Insert(Y, Pointer<UShort>(lumaBuffer)[index[2]], 2);
2008 Y = Insert(Y, Pointer<UShort>(lumaBuffer)[index[3]], 3);
2009 // Top 10 bits of each 16-bit value:
2010 Y = (Y & UShort4(0xFFC0u)) >> 6;
2011 }
2012 break;
2013 default:
2014 UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
2015 break;
2016 }
2017
2018 output.y = Float4(Y);
2019 }
2020
2021 void SamplerCore::sampleChromaTexel(Vector4f &output, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, const Short4 &layerIndex, const Int4 &sample, Pointer<Byte> &mipmapU, Pointer<Byte> bufferU, Pointer<Byte> &mipmapV, Pointer<Byte> bufferV)
2022 {
2023 ASSERT(isYcbcrFormat());
2024
2025 UInt index[4];
2026
2027 // Chroma (either 8-bit or 10-bit in bottom bits).
2028 UShort4 U, V;
2029 computeIndices(index, uuuu, vvvv, wwww, layerIndex, sample, mipmapU);
2030
2031 switch(state.textureFormat)
2032 {
2033 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
2034 {
2035 U = Insert(U, UShort(bufferU[index[0]]), 0);
2036 U = Insert(U, UShort(bufferU[index[1]]), 1);
2037 U = Insert(U, UShort(bufferU[index[2]]), 2);
2038 U = Insert(U, UShort(bufferU[index[3]]), 3);
2039
2040 V = Insert(V, UShort(bufferV[index[0]]), 0);
2041 V = Insert(V, UShort(bufferV[index[1]]), 1);
2042 V = Insert(V, UShort(bufferV[index[2]]), 2);
2043 V = Insert(V, UShort(bufferV[index[3]]), 3);
2044 }
2045 break;
2046 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
2047 {
2048 UShort4 UV;
2049 UV = Insert(UV, Pointer<UShort>(bufferU)[index[0]], 0);
2050 UV = Insert(UV, Pointer<UShort>(bufferU)[index[1]], 1);
2051 UV = Insert(UV, Pointer<UShort>(bufferU)[index[2]], 2);
2052 UV = Insert(UV, Pointer<UShort>(bufferU)[index[3]], 3);
2053
2054 U = (UV & UShort4(0x00FFu));
2055 V = (UV & UShort4(0xFF00u)) >> 8;
2056 }
2057 break;
2058 case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
2059 {
2060 UInt4 UV;
2061 UV = Insert(UV, Pointer<UInt>(bufferU)[index[0]], 0);
2062 UV = Insert(UV, Pointer<UInt>(bufferU)[index[1]], 1);
2063 UV = Insert(UV, Pointer<UInt>(bufferU)[index[2]], 2);
2064 UV = Insert(UV, Pointer<UInt>(bufferU)[index[3]], 3);
2065 // Top 10 bits of the first 16-bit value:
2066 U = UShort4((UV & UInt4(0x0000FFC0u)) >> 6);
2067 // Top 10 bits of the second 16-bit value:
2068 V = UShort4((UV & UInt4(0xFFC00000u)) >> 22);
2069 }
2070 break;
2071 default:
2072 UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
2073 break;
2074 }
2075
2076 output.x = Float4(V);
2077 output.z = Float4(U);
2078 }
2079
2080 Vector4s SamplerCore::sampleTexel(Short4 &uuuu, Short4 &vvvv, Short4 &wwww, const Short4 &layerIndex, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer)
2081 {
2082 ASSERT(!isYcbcrFormat());
2083
2084 UInt index[4];
2085 computeIndices(index, uuuu, vvvv, wwww, layerIndex, sample, mipmap);
2086
2087 return sampleTexel(index, buffer);
2088 }
2089
2090 Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, const Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer)
2091 {
2092 Int4 valid;
2093
2094 if(borderModeActive())
2095 {
2096 // Valid texels have non-negative coordinates.
2097 Int4 negative = uuuu;
2098 if(state.is2D() || state.is3D() || state.isCube()) negative |= vvvv;
2099 if(state.is3D() || state.isCube() || state.isArrayed()) negative |= wwww;
2100 valid = CmpNLT(negative, Int4(0));
2101 }
2102
2103 UInt index[4];
2104 computeIndices(index, uuuu, vvvv, wwww, sample, valid, mipmap);
2105
2106 Vector4f c;
2107
2108 if(hasFloatTexture() || has32bitIntegerTextureComponents())
2109 {
2110 UInt4 t0, t1, t2, t3;
2111
2112 switch(state.textureFormat)
2113 {
2114 case VK_FORMAT_R16_SFLOAT:
2115 t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 2));
2116 t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 2));
2117 t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 2));
2118 t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 2));
2119
2120 c.x.x = Extract(As<Float4>(halfToFloatBits(t0)), 0);
2121 c.x.y = Extract(As<Float4>(halfToFloatBits(t1)), 0);
2122 c.x.z = Extract(As<Float4>(halfToFloatBits(t2)), 0);
2123 c.x.w = Extract(As<Float4>(halfToFloatBits(t3)), 0);
2124 break;
2125 case VK_FORMAT_R16G16_SFLOAT:
2126 t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 4));
2127 t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 4));
2128 t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 4));
2129 t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 4));
2130
2131 // TODO: shuffles
2132 c.x = As<Float4>(halfToFloatBits(t0));
2133 c.y = As<Float4>(halfToFloatBits(t1));
2134 c.z = As<Float4>(halfToFloatBits(t2));
2135 c.w = As<Float4>(halfToFloatBits(t3));
2136 transpose4x4(c.x, c.y, c.z, c.w);
2137 break;
2138 case VK_FORMAT_R16G16B16A16_SFLOAT:
2139 t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 8));
2140 t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 8));
2141 t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 8));
2142 t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 8));
2143
2144 c.x = As<Float4>(halfToFloatBits(t0));
2145 c.y = As<Float4>(halfToFloatBits(t1));
2146 c.z = As<Float4>(halfToFloatBits(t2));
2147 c.w = As<Float4>(halfToFloatBits(t3));
2148 transpose4x4(c.x, c.y, c.z, c.w);
2149 break;
2150 case VK_FORMAT_R32_SFLOAT:
2151 case VK_FORMAT_R32_SINT:
2152 case VK_FORMAT_R32_UINT:
2153 case VK_FORMAT_D32_SFLOAT:
2154 // TODO: Optimal shuffling?
2155 c.x.x = *Pointer<Float>(buffer + index[0] * 4);
2156 c.x.y = *Pointer<Float>(buffer + index[1] * 4);
2157 c.x.z = *Pointer<Float>(buffer + index[2] * 4);
2158 c.x.w = *Pointer<Float>(buffer + index[3] * 4);
2159 break;
2160 case VK_FORMAT_R32G32_SFLOAT:
2161 case VK_FORMAT_R32G32_SINT:
2162 case VK_FORMAT_R32G32_UINT:
2163 // TODO: Optimal shuffling?
2164 c.x.xy = *Pointer<Float4>(buffer + index[0] * 8);
2165 c.x.zw = *Pointer<Float4>(buffer + index[1] * 8 - 8);
2166 c.z.xy = *Pointer<Float4>(buffer + index[2] * 8);
2167 c.z.zw = *Pointer<Float4>(buffer + index[3] * 8 - 8);
2168 c.y = c.x;
2169 c.x = Float4(c.x.xz, c.z.xz);
2170 c.y = Float4(c.y.yw, c.z.yw);
2171 break;
2172 case VK_FORMAT_R32G32B32A32_SFLOAT:
2173 case VK_FORMAT_R32G32B32A32_SINT:
2174 case VK_FORMAT_R32G32B32A32_UINT:
2175 c.x = *Pointer<Float4>(buffer + index[0] * 16, 16);
2176 c.y = *Pointer<Float4>(buffer + index[1] * 16, 16);
2177 c.z = *Pointer<Float4>(buffer + index[2] * 16, 16);
2178 c.w = *Pointer<Float4>(buffer + index[3] * 16, 16);
2179 transpose4x4(c.x, c.y, c.z, c.w);
2180 break;
2181 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2182 {
2183 Float4 t; // TODO: add Insert(UInt4, RValue<UInt>)
2184 t.x = *Pointer<Float>(buffer + index[0] * 4);
2185 t.y = *Pointer<Float>(buffer + index[1] * 4);
2186 t.z = *Pointer<Float>(buffer + index[2] * 4);
2187 t.w = *Pointer<Float>(buffer + index[3] * 4);
2188 t0 = As<UInt4>(t);
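// Shared-exponent decode: 2^exponent * 2^-24 = 2^(exponent - 15 - 9), i.e. the
// scale factor for 9-bit mantissas with an exponent bias of 15.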
2189 c.w = Float4(UInt4(1) << ((t0 >> 27) & UInt4(0x1F))) * Float4(1.0f / (1 << 24));
2190 c.x = Float4(t0 & UInt4(0x1FF)) * c.w;
2191 c.y = Float4((t0 >> 9) & UInt4(0x1FF)) * c.w;
2192 c.z = Float4((t0 >> 18) & UInt4(0x1FF)) * c.w;
2193 }
2194 break;
2195 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2196 {
2197 Float4 t; // TODO: add Insert(UInt4, RValue<UInt>)
2198 t.x = *Pointer<Float>(buffer + index[0] * 4);
2199 t.y = *Pointer<Float>(buffer + index[1] * 4);
2200 t.z = *Pointer<Float>(buffer + index[2] * 4);
2201 t.w = *Pointer<Float>(buffer + index[3] * 4);
2202 t0 = As<UInt4>(t);
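// The 10- and 11-bit floats share the half-float exponent bias, so shifting the
// fields into half-float bit positions and converting via halfToFloatBits() decodes them.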
2203 c.x = As<Float4>(halfToFloatBits((t0 << 4) & UInt4(0x7FF0)));
2204 c.y = As<Float4>(halfToFloatBits((t0 >> 7) & UInt4(0x7FF0)));
2205 c.z = As<Float4>(halfToFloatBits((t0 >> 17) & UInt4(0x7FE0)));
2206 }
2207 break;
2208 default:
2209 UNSUPPORTED("Format %d", VkFormat(state.textureFormat));
2210 }
2211 }
2212 else
2213 {
2214 ASSERT(!isYcbcrFormat());
2215
2216 Vector4s cs = sampleTexel(index, buffer);
2217
2218 bool isInteger = state.textureFormat.isUnnormalizedInteger();
2219 int componentCount = textureComponentCount();
2220 for(int n = 0; n < componentCount; n++)
2221 {
2222 if(hasUnsignedTextureComponent(n))
2223 {
2224 if(isInteger)
2225 {
2226 c[n] = As<Float4>(Int4(As<UShort4>(cs[n])));
2227 }
2228 else
2229 {
2230 c[n] = Float4(As<UShort4>(cs[n]));
2231 }
2232 }
2233 else
2234 {
2235 if(isInteger)
2236 {
2237 c[n] = As<Float4>(Int4(cs[n]));
2238 }
2239 else
2240 {
2241 c[n] = Float4(cs[n]);
2242 }
2243 }
2244 }
2245 }
2246
2247 if(borderModeActive())
2248 {
2249 c = replaceBorderTexel(c, valid);
2250 }
2251
2252 if(state.compareEnable)
2253 {
2254 Float4 ref = dRef;
2255
2256 if(!hasFloatTexture())
2257 {
2258 // D16_UNORM: clamp reference, normalize texel value
2259 ref = Min(Max(ref, Float4(0.0f)), Float4(1.0f));
2260 c.x = c.x * Float4(1.0f / 0xFFFF);
2261 }
2262
2263 Int4 boolean;
2264
2265 switch(state.compareOp)
2266 {
2267 case VK_COMPARE_OP_LESS_OR_EQUAL: boolean = CmpLE(ref, c.x); break;
2268 case VK_COMPARE_OP_GREATER_OR_EQUAL: boolean = CmpNLT(ref, c.x); break;
2269 case VK_COMPARE_OP_LESS: boolean = CmpLT(ref, c.x); break;
2270 case VK_COMPARE_OP_GREATER: boolean = CmpNLE(ref, c.x); break;
2271 case VK_COMPARE_OP_EQUAL: boolean = CmpEQ(ref, c.x); break;
2272 case VK_COMPARE_OP_NOT_EQUAL: boolean = CmpNEQ(ref, c.x); break;
2273 case VK_COMPARE_OP_ALWAYS: boolean = Int4(-1); break;
2274 case VK_COMPARE_OP_NEVER: boolean = Int4(0); break;
2275 default: ASSERT(false);
2276 }
2277
2278 c.x = As<Float4>(boolean & As<Int4>(Float4(1.0f)));
2279 c.y = Float4(0.0f);
2280 c.z = Float4(0.0f);
2281 c.w = Float4(1.0f);
2282 }
2283
2284 return c;
2285 }
2286
2287 Vector4f SamplerCore::replaceBorderTexel(const Vector4f &c, Int4 valid)
2288 {
2289 Vector4i border;
2290
2291 const bool scaled = hasNormalizedFormat();
2292 const sw::float4 scaleComp = scaled ? getComponentScale() : sw::float4(1.0f, 1.0f, 1.0f, 1.0f);
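// Texels from normalized formats still carry their fixed-point scale in the
// float path, so the border color must be scaled to the same range; other
// formats use the border color as-is.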
2293
2294 switch(state.border)
2295 {
2296 case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK:
2297 case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK:
2298 border.x = Int4(0);
2299 border.y = Int4(0);
2300 border.z = Int4(0);
2301 border.w = Int4(0);
2302 break;
2303 case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK:
2304 border.x = Int4(0);
2305 border.y = Int4(0);
2306 border.z = Int4(0);
2307 border.w = Int4(bit_cast<int>(scaleComp.w));
2308 break;
2309 case VK_BORDER_COLOR_INT_OPAQUE_BLACK:
2310 border.x = Int4(0);
2311 border.y = Int4(0);
2312 border.z = Int4(0);
2313 border.w = Int4(1);
2314 break;
2315 case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE:
2316 border.x = Int4(bit_cast<int>(scaleComp.x));
2317 border.y = Int4(bit_cast<int>(scaleComp.y));
2318 border.z = Int4(bit_cast<int>(scaleComp.z));
2319 border.w = Int4(bit_cast<int>(scaleComp.w));
2320 break;
2321 case VK_BORDER_COLOR_INT_OPAQUE_WHITE:
2322 border.x = Int4(1);
2323 border.y = Int4(1);
2324 border.z = Int4(1);
2325 border.w = Int4(1);
2326 break;
2327 case VK_BORDER_COLOR_FLOAT_CUSTOM_EXT:
2328 // This bit-casts from float to int in C++ code instead of Reactor code
2329 // because Reactor does not guarantee preserving infinity (b/140302841).
2330 border.x = Int4(bit_cast<int>(scaleComp.x * state.customBorder.float32[0]));
2331 border.y = Int4(bit_cast<int>(scaleComp.y * state.customBorder.float32[1]));
2332 border.z = Int4(bit_cast<int>(scaleComp.z * state.customBorder.float32[2]));
2333 border.w = Int4(bit_cast<int>(scaleComp.w * state.customBorder.float32[3]));
2334 break;
2335 case VK_BORDER_COLOR_INT_CUSTOM_EXT:
2336 border.x = Int4(state.customBorder.int32[0]);
2337 border.y = Int4(state.customBorder.int32[1]);
2338 border.z = Int4(state.customBorder.int32[2]);
2339 border.w = Int4(state.customBorder.int32[3]);
2340 break;
2341 default:
2342 UNSUPPORTED("sint/uint/sfloat border: %u", state.border);
2343 }
2344
2345 Vector4f out;
2346 out.x = As<Float4>((valid & As<Int4>(c.x)) | (~valid & border.x)); // TODO: IfThenElse()
2347 out.y = As<Float4>((valid & As<Int4>(c.y)) | (~valid & border.y));
2348 out.z = As<Float4>((valid & As<Int4>(c.z)) | (~valid & border.z));
2349 out.w = As<Float4>((valid & As<Int4>(c.w)) | (~valid & border.w));
2350
2351 return out;
2352 }
2353
2354 Pointer<Byte> SamplerCore::selectMipmap(const Pointer<Byte> &texture, const Float &lod, bool secondLOD)
2355 {
2356 Pointer<Byte> mipmap0 = texture + OFFSET(Texture, mipmap[0]);
2357
2358 if(state.mipmapFilter == MIPMAP_NONE)
2359 {
2360 return mipmap0;
2361 }
2362
2363 Int ilod;
2364
2365 if(state.mipmapFilter == MIPMAP_POINT)
2366 {
2367 // TODO: Preferred formula is ceil(lod + 0.5) - 1
2368 ilod = RoundInt(lod);
2369 }
2370 else // MIPMAP_LINEAR
2371 {
2372 ilod = Int(lod);
2373 }
2374
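// The secondLOD flag advances to the next mip level, used as the second tap
// of MIPMAP_LINEAR filtering.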
2375 return mipmap0 + ilod * sizeof(Mipmap) + secondLOD * sizeof(Mipmap);
2376 }
2377
2378 Int4 SamplerCore::computeFilterOffset(Float &lod)
2379 {
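// The returned value acts as a per-lane mask: all ones selects linear filtering
// (the half-texel shift plus a +1 neighbor in address()), zero selects point sampling.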
2380 if(state.textureFilter == FILTER_POINT)
2381 {
2382 return Int4(0);
2383 }
2384 else if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
2385 {
2386 return CmpNLE(Float4(lod), Float4(0.0f));
2387 }
2388 else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
2389 {
2390 return CmpLE(Float4(lod), Float4(0.0f));
2391 }
2392
2393 return Int4(~0);
2394 }
2395
2396 Short4 SamplerCore::address(const Float4 &uw, AddressingMode addressingMode)
2397 {
2398 if(addressingMode == ADDRESSING_UNUSED)
2399 {
2400 return Short4(0); // TODO(b/134669567): Optimize for 1D filtering
2401 }
2402 else if(addressingMode == ADDRESSING_CLAMP || addressingMode == ADDRESSING_BORDER)
2403 {
2404 Float4 clamp = Min(Max(uw, Float4(0.0f)), Float4(65535.0f / 65536.0f));
2405
2406 return Short4(Int4(clamp * Float4(1 << 16)));
2407 }
2408 else if(addressingMode == ADDRESSING_MIRROR)
2409 {
2410 Int4 convert = Int4(uw * Float4(1 << 16));
2411 Int4 mirror = (convert << 15) >> 31;
2412
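// Bit 16 holds the integer parity of the 16.16 fixed-point coordinate; smeared
// into a full mask, the XOR below flips odd intervals to produce the mirrored repeat.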
2413 convert ^= mirror;
2414
2415 return Short4(convert);
2416 }
2417 else if(addressingMode == ADDRESSING_MIRRORONCE)
2418 {
2419 // Absolute value
2420 Int4 convert = Int4(Abs(uw * Float4(1 << 16)));
2421
2422 // Clamp to [0, 0xFFFF]: recenter by -0x8000 so the signed saturating pack clamps symmetrically; the +0x8000 below undoes the bias.
2423 convert -= Int4(0x00008000, 0x00008000, 0x00008000, 0x00008000);
2424 convert = As<Int4>(PackSigned(convert, convert));
2425
2426 return As<Short4>(Int2(convert)) + Short4(0x8000u);
2427 }
2428 else // Wrap
2429 {
2430 return Short4(Int4(uw * Float4(1 << 16)));
2431 }
2432 }
2433
2434 Short4 SamplerCore::computeLayerIndex16(const Float4 &a, Pointer<Byte> &mipmap)
2435 {
2436 if(!state.isArrayed())
2437 {
2438 return {};
2439 }
2440
2441 Int4 layers = *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth));
2442
2443 return Short4(Min(Max(RoundInt(a), Int4(0)), layers - Int4(1)));
2444 }
2445
2446 // TODO: Eliminate when the gather + mirror addressing case is handled by mirroring the footprint.
2447 static Int4 mirror(Int4 n)
2448 {
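// Reflects negative values to their mirror image: -1 -> 0, -2 -> 1, and so on;
// non-negative values pass through unchanged.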
2449 auto positive = CmpNLT(n, Int4(0));
2450 return (positive & n) | (~positive & (-(Int4(1) + n)));
2451 }
2452
2453 static Int4 mod(Int4 n, Int4 d)
2454 {
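// Euclidean modulo: unlike the % operator, the result is non-negative even when n is negative.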
2455 auto x = n % d;
2456 auto positive = CmpNLT(x, Int4(0));
2457 return (positive & x) | (~positive & (x + d));
2458 }
2459
2460 void SamplerCore::address(const Float4 &uvw, Int4 &xyz0, Int4 &xyz1, Float4 &f, Pointer<Byte> &mipmap, Int4 &filter, int whd, AddressingMode addressingMode)
2461 {
2462 if(addressingMode == ADDRESSING_UNUSED)
2463 {
2464 f = Float4(0.0f); // TODO(b/134669567): Optimize for 1D filtering
2465 return;
2466 }
2467
2468 Int4 dim = As<Int4>(*Pointer<UInt4>(mipmap + whd, 16));
2469 Int4 maxXYZ = dim - Int4(1);
2470
2471 if(function == Fetch) // Unnormalized coordinates
2472 {
2473 Int4 xyz = As<Int4>(uvw);
2474 xyz0 = Min(Max(xyz, Int4(0)), maxXYZ);
2475
2476 // VK_EXT_image_robustness requires checking for out-of-bounds accesses.
2477 // TODO(b/162327166): Only perform bounds checks when VK_EXT_image_robustness is enabled.
2478 // If the above clamping altered the result, the access is out-of-bounds.
2479 // In that case set the coordinate to -1 to perform texel replacement later.
2480 Int4 outOfBounds = CmpNEQ(xyz, xyz0);
2481 xyz0 |= outOfBounds;
2482 }
2483 else if(addressingMode == ADDRESSING_CUBEFACE)
2484 {
2485 xyz0 = As<Int4>(uvw);
2486 }
2487 else
2488 {
2489 const int oneBits = 0x3F7FFFFF; // Value just under 1.0f
2490
2491 Float4 coord = uvw;
2492
2493 if(state.unnormalizedCoordinates)
2494 {
2495 switch(addressingMode)
2496 {
2497 case ADDRESSING_CLAMP:
2498 coord = Min(Max(coord, Float4(0.0f)), Float4(dim) * As<Float4>(Int4(oneBits)));
2499 break;
2500 case ADDRESSING_BORDER:
2501 // Don't map to a valid range here.
2502 break;
2503 default:
2504 // "If unnormalizedCoordinates is VK_TRUE, addressModeU and addressModeV must each be
2505 // either VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE or VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER"
2506 UNREACHABLE("addressingMode %d", int(addressingMode));
2507 break;
2508 }
2509 }
2510 else if(state.textureFilter == FILTER_GATHER && addressingMode == ADDRESSING_MIRROR)
2511 {
2512 // Gather requires the 'footprint' of the texels from which a component is taken to be mirrored as well.
2513 // Therefore we can't just compute one texel's location and find the other ones at +1 offsets from it.
2514 // Here we handle that case separately by doing the mirroring per texel coordinate.
2515 // TODO: Mirror the footprint by adjusting the sign of the 0.5f and 1 offsets.
2516
2517 coord = coord * Float4(dim);
2518 coord -= Float4(0.5f);
2519 Float4 floor = Floor(coord);
2520 xyz0 = Int4(floor);
2521 xyz1 = xyz0 + Int4(1);
2522
2523 xyz0 = maxXYZ - mirror(mod(xyz0, Int4(2) * dim) - dim);
2524 xyz1 = maxXYZ - mirror(mod(xyz1, Int4(2) * dim) - dim);
2525
2526 return;
2527 }
2528 else
2529 {
2530 switch(addressingMode)
2531 {
2532 case ADDRESSING_CLAMP:
2533 case ADDRESSING_SEAMLESS:
2534 // While cube face coordinates are nominally already in the [0.0, 1.0] range
2535 // due to the projection, and numerical imprecision is tolerated due to the
2536 // border of pixels for seamless filtering, the projection doesn't cause
2537 // range normalization for Inf and NaN values. So we always clamp.
2538 {
2539 Float4 one = As<Float4>(Int4(oneBits));
2540 coord = Min(Max(coord, Float4(0.0f)), one);
2541 }
2542 break;
2543 case ADDRESSING_MIRROR:
2544 {
2545 Float4 one = As<Float4>(Int4(oneBits));
2546 coord = coord * Float4(0.5f);
2547 coord = Float4(2.0f) * Abs(coord - Round(coord));
2548 coord = Min(coord, one);
2549 }
2550 break;
2551 case ADDRESSING_MIRRORONCE:
2552 {
2553 Float4 one = As<Float4>(Int4(oneBits));
2554 coord = Min(Abs(coord), one);
2555 }
2556 break;
2557 case ADDRESSING_BORDER:
2558 // Don't map to a valid range here.
2559 break;
2560 default: // Wrap
2561 coord = Frac(coord);
2562 break;
2563 }
2564
2565 coord = coord * Float4(dim);
2566 }
2567
2568 if(state.textureFilter == FILTER_POINT)
2569 {
2570 if(addressingMode == ADDRESSING_BORDER)
2571 {
2572 xyz0 = Int4(Floor(coord));
2573 }
2574 else // Can't have negative coordinates, so floor() is redundant when casting to int.
2575 {
2576 xyz0 = Int4(coord);
2577 }
2578 }
2579 else
2580 {
2581 if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
2582 state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
2583 {
2584 coord -= As<Float4>(As<Int4>(Float4(0.5f)) & filter);
2585 }
2586 else
2587 {
2588 coord -= Float4(0.5f);
2589 }
2590
2591 Float4 floor = Floor(coord);
2592 xyz0 = Int4(floor);
2593 f = coord - floor;
2594 }
2595
2596 if(addressingMode == ADDRESSING_SEAMLESS) // Step into the one-texel border that seamless cube mip levels include.
2597 {
2598 xyz0 += Int4(1);
2599 }
2600
2601 xyz1 = xyz0 - filter; // filter is all ones (-1) for linear filtering, making this an increment; it is 0 for point filtering.
2602
2603 if(addressingMode == ADDRESSING_BORDER)
2604 {
2605 // Replace the coordinates with -1 if they're out of range.
2606 Int4 border0 = CmpLT(xyz0, Int4(0)) | CmpNLT(xyz0, dim);
2607 Int4 border1 = CmpLT(xyz1, Int4(0)) | CmpNLT(xyz1, dim);
2608 xyz0 |= border0;
2609 xyz1 |= border1;
2610 }
2611 else if(state.textureFilter != FILTER_POINT)
2612 {
2613 switch(addressingMode)
2614 {
2615 case ADDRESSING_SEAMLESS:
2616 break;
2617 case ADDRESSING_MIRROR:
2618 case ADDRESSING_MIRRORONCE:
2619 case ADDRESSING_CLAMP:
2620 xyz0 = Max(xyz0, Int4(0));
2621 xyz1 = Min(xyz1, maxXYZ);
2622 break;
2623 default: // Wrap
2624 {
2625 Int4 under = CmpLT(xyz0, Int4(0));
2626 xyz0 = (under & maxXYZ) | (~under & xyz0); // xyz < 0 ? dim - 1 : xyz // TODO: IfThenElse()
2627
2628 Int4 nover = CmpLT(xyz1, dim);
2629 xyz1 = nover & xyz1; // xyz >= dim ? 0 : xyz
2630 }
2631 break;
2632 }
2633 }
2634 }
2635 }
2636
2637 Int4 SamplerCore::computeLayerIndex(const Float4 &a, Pointer<Byte> &mipmap)
2638 {
2639 if(!state.isArrayed())
2640 {
2641 return {};
2642 }
2643
2644 Int4 layers = *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth), 16);
2645 Int4 maxLayer = layers - Int4(1);
2646
2647 if(function == Fetch) // Unnormalized coordinates
2648 {
2649 Int4 xyz = As<Int4>(a);
2650 Int4 xyz0 = Min(Max(xyz, Int4(0)), maxLayer);
2651
2652 // VK_EXT_image_robustness requires checking for out-of-bounds accesses.
2653 // TODO(b/162327166): Only perform bounds checks when VK_EXT_image_robustness is enabled.
2654 // If the above clamping altered the result, the access is out-of-bounds.
2655 // In that case set the coordinate to -1 to perform texel replacement later.
2656 Int4 outOfBounds = CmpNEQ(xyz, xyz0);
2657 xyz0 |= outOfBounds;
2658
2659 return xyz0;
2660 }
2661 else
2662 {
2663 return Min(Max(RoundInt(a), Int4(0)), maxLayer);
2664 }
2665 }
2666
2667 void SamplerCore::sRGBtoLinearFF00(Short4 &c)
2668 {
2669 c = As<UShort4>(c) >> 8;
2670
2671 Pointer<Byte> LUT = Pointer<Byte>(constants + OFFSET(Constants, sRGBtoLinearFF_FF00));
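// Table-based conversion: the LUT maps the 8-bit sRGB value to a linear value
// scaled so that 1.0 is represented as 0xFF00.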
2672
2673 c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 0))), 0);
2674 c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 1))), 1);
2675 c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 2))), 2);
2676 c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 3))), 3);
2677 }
2678
2679 bool SamplerCore::hasNormalizedFormat() const
2680 {
2681 return state.textureFormat.isSignedNormalized() || state.textureFormat.isUnsignedNormalized();
2682 }
2683
2684 bool SamplerCore::hasFloatTexture() const
2685 {
2686 return state.textureFormat.isFloatFormat();
2687 }
2688
2689 bool SamplerCore::hasUnnormalizedIntegerTexture() const
2690 {
2691 return state.textureFormat.isUnnormalizedInteger();
2692 }
2693
2694 bool SamplerCore::hasUnsignedTextureComponent(int component) const
2695 {
2696 return state.textureFormat.isUnsignedComponent(component);
2697 }
2698
2699 int SamplerCore::textureComponentCount() const
2700 {
2701 return state.textureFormat.componentCount();
2702 }
2703
2704 bool SamplerCore::has16bitPackedTextureFormat() const
2705 {
2706 return state.textureFormat.has16bitPackedTextureFormat();
2707 }
2708
2709 bool SamplerCore::has8bitTextureComponents() const
2710 {
2711 return state.textureFormat.has8bitTextureComponents();
2712 }
2713
2714 bool SamplerCore::has16bitTextureComponents() const
2715 {
2716 return state.textureFormat.has16bitTextureComponents();
2717 }
2718
2719 bool SamplerCore::has32bitIntegerTextureComponents() const
2720 {
2721 return state.textureFormat.has32bitIntegerTextureComponents();
2722 }
2723
2724 bool SamplerCore::isYcbcrFormat() const
2725 {
2726 return state.textureFormat.isYcbcrFormat();
2727 }
2728
2729 bool SamplerCore::isRGBComponent(int component) const
2730 {
2731 return state.textureFormat.isRGBComponent(component);
2732 }
2733
2734 bool SamplerCore::borderModeActive() const
2735 {
2736 return state.addressingModeU == ADDRESSING_BORDER ||
2737 state.addressingModeV == ADDRESSING_BORDER ||
2738 state.addressingModeW == ADDRESSING_BORDER;
2739 }
2740
2741 VkComponentSwizzle SamplerCore::gatherSwizzle() const
2742 {
2743 switch(state.gatherComponent)
2744 {
2745 case 0: return state.swizzle.r;
2746 case 1: return state.swizzle.g;
2747 case 2: return state.swizzle.b;
2748 case 3: return state.swizzle.a;
2749 default:
2750 UNREACHABLE("Invalid component");
2751 return VK_COMPONENT_SWIZZLE_R;
2752 }
2753 }
2754
2755 sw::float4 SamplerCore::getComponentScale() const
2756 {
2757 // TODO(b/204709464): Unlike other formats, the fixed-point representations of the formats below are handled with bit extension.
2758 // This special handling should be removed later.
2759 switch(state.textureFormat)
2760 {
2761 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
2762 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
2763 case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
2764 return sw::float4(0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF);
2765 default:
2766 break;
2767 }
2768
2769 const sw::int4 bits = state.textureFormat.bitsPerComponent();
2770 const sw::int4 shift = sw::int4(16 - bits.x, 16 - bits.y, 16 - bits.z, 16 - bits.w);
2771 const uint16_t sign = state.textureFormat.isUnsigned() ? 0xFFFF : 0x7FFF;
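// Each scale is the component's maximum fixed-point value aligned to the top of
// a 16-bit lane, e.g. 0xFF00 for 8-bit UNORM; signed formats mask off the sign bit.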
2772
2773 return sw::float4(static_cast<uint16_t>(0xFFFF << shift.x) & sign,
2774 static_cast<uint16_t>(0xFFFF << shift.y) & sign,
2775 static_cast<uint16_t>(0xFFFF << shift.z) & sign,
2776 static_cast<uint16_t>(0xFFFF << shift.w) & sign);
2777 }
2778
2779 int SamplerCore::getGatherComponent() const
2780 {
2781 VkComponentSwizzle swizzle = gatherSwizzle();
2782
2783 switch(swizzle)
2784 {
2785 default: UNSUPPORTED("VkComponentSwizzle %d", (int)swizzle); return 0;
2786 case VK_COMPONENT_SWIZZLE_R:
2787 case VK_COMPONENT_SWIZZLE_G:
2788 case VK_COMPONENT_SWIZZLE_B:
2789 case VK_COMPONENT_SWIZZLE_A:
2790 // Normalize all components using the gather component scale.
2791 return swizzle - VK_COMPONENT_SWIZZLE_R;
2792 case VK_COMPONENT_SWIZZLE_ZERO:
2793 case VK_COMPONENT_SWIZZLE_ONE:
2794 // These cases are handled later.
2795 return 0;
2796 }
2797
2798 return 0;
2799 }
2800
2801 } // namespace sw
2802