// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "VertexRoutine.hpp"

#include "Constants.hpp"
#include "SpirvShader.hpp"
#include "Device/Clipper.hpp"
#include "Device/Renderer.hpp"
#include "Device/Vertex.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"
#include "Vulkan/VkDevice.hpp"

namespace sw {

VertexRoutine::VertexRoutine(
    const VertexProcessor::State &state,
    const vk::PipelineLayout *pipelineLayout,
    const SpirvShader *spirvShader)
    : routine(pipelineLayout)
    , state(state)
    , spirvShader(spirvShader)
{
	spirvShader->emitProlog(&routine);
}

VertexRoutine::~VertexRoutine()
{
}

void VertexRoutine::generate()
{
	Pointer<Byte> cache = task + OFFSET(VertexTask, vertexCache);
	Pointer<Byte> vertexCache = cache + OFFSET(VertexCache, vertex);
	Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache, tag));

	UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask, vertexCount));

	constants = device + OFFSET(vk::Device, constants);

	// Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
	// On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
	// in reverse order to guarantee that the first one doesn't get evicted and can be written out.

	Do
	{
		UInt index = *batch;
		UInt cacheIndex = index & VertexCache::TAG_MASK;

		If(tagCache[cacheIndex] != index)
		{
			readInput(batch);
			program(batch, vertexCount);
			computeClipFlags();
			computeCullMask();

			writeCache(vertexCache, tagCache, batch);
		}

		Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
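		// The cache is direct-mapped: the low bits of the vertex index (masked by
		// VertexCache::TAG_MASK) select a slot, and the tag array records which
		// index currently occupies it. A rough equivalent in plain C++, assuming a
		// power-of-two cache size so that TAG_MASK == size - 1 (names hypothetical):
		//
		//   uint32_t slot = index & TAG_MASK;   // e.g. index 69, 64 slots -> slot 5
		//   if(tags[slot] != index)             // miss: shade a SIMD group, refill
		//   {
		//       shadeGroup(index);
		//       tags[slot] = index;
		//   }
		//   copyOut(vertices[slot]);            // hit or refilled: copy to output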
		// For points, vertexCount is 1 per primitive, so duplicate vertex for all 3 vertices of the primitive
		for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
		{
			writeVertex(vertex, cacheEntry);
			vertex += sizeof(Vertex);
		}

		batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
		vertexCount--;
	}
	Until(vertexCount == 0);

	Return();
}

void VertexRoutine::readInput(Pointer<UInt> &batch)
{
	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
	{
		if(spirvShader->inputs[i + 0].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 1].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 2].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 3].Type != Spirv::ATTRIBTYPE_UNUSED)
		{
			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void *) * (i / 4));
			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
			Int baseVertex = *Pointer<Int>(data + OFFSET(DrawData, baseVertex));
			UInt robustnessSize(0);

			if(state.robustBufferAccess)
			{
				robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
			}

			auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize, baseVertex);
			routine.inputs[i + 0] = value.x;
			routine.inputs[i + 1] = value.y;
			routine.inputs[i + 2] = value.z;
			routine.inputs[i + 3] = value.w;
		}
	}
}

void VertexRoutine::computeClipFlags()
{
	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
	if(it != spirvShader->outputBuiltins.end())
	{
		assert(it->second.SizeInComponents == 4);
		auto &pos = routine.getVariable(it->second.Id);
		auto posX = pos[it->second.FirstComponent + 0];
		auto posY = pos[it->second.FirstComponent + 1];
		auto posZ = pos[it->second.FirstComponent + 2];
		auto posW = pos[it->second.FirstComponent + 3];

		SIMD::Int maxX = CmpLT(posW, posX);
		SIMD::Int maxY = CmpLT(posW, posY);
		SIMD::Int minX = CmpNLE(-posW, posX);
		SIMD::Int minY = CmpNLE(-posW, posY);

		clipFlags = maxX & Clipper::CLIP_RIGHT;
		clipFlags |= maxY & Clipper::CLIP_TOP;
		clipFlags |= minX & Clipper::CLIP_LEFT;
		clipFlags |= minY & Clipper::CLIP_BOTTOM;
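		// In clip space a vertex is inside the view volume when -w <= x <= w (and
		// likewise for y), so each comparison above produces an all-ones lane mask
		// for vertices beyond one plane, which is then ANDed with that plane's
		// flag bit. For example, a lane with position (2.0, 0.0, 0.0, 1.0) has
		// x > w, so CmpLT(posW, posX) is ~0 for that lane and it picks up
		// CLIP_RIGHT; a lane with x < -w would instead satisfy CmpNLE(-posW, posX)
		// and pick up CLIP_LEFT.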
		if(state.depthClipEnable)
		{
			// If depthClipNegativeOneToOne is enabled, depth values are in [-1, 1] instead of [0, 1].
			SIMD::Int maxZ = CmpLT(posW, posZ);
			SIMD::Int minZ = CmpNLE(state.depthClipNegativeOneToOne ? -posW : SIMD::Float(0.0f), posZ);

			clipFlags |= maxZ & Clipper::CLIP_FAR;
			clipFlags |= minZ & Clipper::CLIP_NEAR;
		}

		SIMD::Float maxPos = As<SIMD::Float>(SIMD::Int(0x7F7FFFFF));
		SIMD::Int finiteX = CmpLE(Abs(posX), maxPos);
		SIMD::Int finiteY = CmpLE(Abs(posY), maxPos);
		SIMD::Int finiteZ = CmpLE(Abs(posZ), maxPos);

		SIMD::Int finiteXYZ = finiteX & finiteY & finiteZ;
		clipFlags |= finiteXYZ & Clipper::CLIP_FINITE;
	}
}

void VertexRoutine::computeCullMask()
{
	cullMask = Int(15);

	auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputCullDistances();
		for(uint32_t i = 0; i < count; i++)
		{
			const auto &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
			cullMask &= mask;
		}
	}
}

Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
                                   bool robustBufferAccess, UInt &robustnessSize, Int baseVertex)
{
	Vector4f v;

	// Because of the following rule in the Vulkan spec, we do not care if a very large negative
	// baseVertex would overflow all the way back into a valid region of the index buffer:
	// "Out-of-bounds buffer loads will return any of the following values :
	//  - Values from anywhere within the memory range(s) bound to the buffer (possibly including
	//    bytes of memory past the end of the buffer, up to the end of the bound range)."
	UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);

	Pointer<Byte> source0 = buffer + offsets.x;
	Pointer<Byte> source1 = buffer + offsets.y;
	Pointer<Byte> source2 = buffer + offsets.z;
	Pointer<Byte> source3 = buffer + offsets.w;

	vk::Format format(stream.format);

	UInt4 zero(0);
	if(robustBufferAccess)
	{
		// Prevent integer overflow on the addition below.
		offsets = Min(offsets, UInt4(robustnessSize));

		// "vertex input attributes are considered out of bounds if the offset of the attribute
		//  in the bound vertex buffer range plus the size of the attribute is greater than ..."
		UInt4 limits = offsets + UInt4(format.bytes());

		Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);

		// TODO(b/141124876): Optimize for wide-vector gather operations.
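		// Each lane whose attribute would extend past the bound range is
		// redirected to read from the local 'zero' variable instead of the vertex
		// buffer. For example, with robustnessSize = 64 and a 16-byte
		// R32G32B32A32_SFLOAT attribute at offset 56: limit = 56 + 16 = 72 > 64,
		// so that lane loads zeros, while in-bounds lanes keep their original
		// source pointers.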
		source0 = IfThenElse(limits.x > robustnessSize, zeroSource, source0);
		source1 = IfThenElse(limits.y > robustnessSize, zeroSource, source1);
		source2 = IfThenElse(limits.z > robustnessSize, zeroSource, source2);
		source3 = IfThenElse(limits.w > robustnessSize, zeroSource, source3);
	}

	int componentCount = format.componentCount();
	bool normalized = !format.isUnnormalizedInteger();
	bool isNativeFloatAttrib = (stream.attribType == Spirv::ATTRIBTYPE_FLOAT) || normalized;
	bool bgra = false;

	switch(stream.format)
	{
	case VK_FORMAT_R32_SFLOAT:
	case VK_FORMAT_R32G32_SFLOAT:
	case VK_FORMAT_R32G32B32_SFLOAT:
	case VK_FORMAT_R32G32B32A32_SFLOAT:
		{
			if(componentCount == 0)
			{
				// Null stream, all default components
			}
			else
			{
				if(componentCount == 1)
				{
					v.x.x = *Pointer<Float>(source0);
					v.x.y = *Pointer<Float>(source1);
					v.x.z = *Pointer<Float>(source2);
					v.x.w = *Pointer<Float>(source3);
				}
				else
				{
					v.x = *Pointer<Float4>(source0);
					v.y = *Pointer<Float4>(source1);
					v.z = *Pointer<Float4>(source2);
					v.w = *Pointer<Float4>(source3);

					transpose4xN(v.x, v.y, v.z, v.w, componentCount);
				}
			}
		}
		break;
	case VK_FORMAT_B8G8R8A8_UNORM:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_R8_UNORM:
	case VK_FORMAT_R8G8_UNORM:
	case VK_FORMAT_R8G8B8A8_UNORM:
	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
		v.x = Float4(*Pointer<Byte4>(source0));
		v.y = Float4(*Pointer<Byte4>(source1));
		v.z = Float4(*Pointer<Byte4>(source2));
		v.w = Float4(*Pointer<Byte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

		if(componentCount >= 1) v.x *= (1.0f / 0xFF);
		if(componentCount >= 2) v.y *= (1.0f / 0xFF);
		if(componentCount >= 3) v.z *= (1.0f / 0xFF);
		if(componentCount >= 4) v.w *= (1.0f / 0xFF);
		break;
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
		v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R8_SNORM:
	case VK_FORMAT_R8G8_SNORM:
	case VK_FORMAT_R8G8B8A8_SNORM:
	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
		v.x = Float4(*Pointer<SByte4>(source0));
		v.y = Float4(*Pointer<SByte4>(source1));
		v.z = Float4(*Pointer<SByte4>(source2));
		v.w = Float4(*Pointer<SByte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

		if(componentCount >= 1) v.x = Max(v.x * (1.0f / 0x7F), Float4(-1.0f));
		if(componentCount >= 2) v.y = Max(v.y * (1.0f / 0x7F), Float4(-1.0f));
		if(componentCount >= 3) v.z = Max(v.z * (1.0f / 0x7F), Float4(-1.0f));
		if(componentCount >= 4) v.w = Max(v.w * (1.0f / 0x7F), Float4(-1.0f));
		break;
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_R8G8_USCALED:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
		v.x = Float4(*Pointer<Byte4>(source0));
		v.y = Float4(*Pointer<Byte4>(source1));
		v.z = Float4(*Pointer<Byte4>(source2));
		v.w = Float4(*Pointer<Byte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R8_SSCALED:
	case VK_FORMAT_R8G8_SSCALED:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
		v.x = Float4(*Pointer<SByte4>(source0));
		v.y = Float4(*Pointer<SByte4>(source1));
		v.z = Float4(*Pointer<SByte4>(source2));
		v.w = Float4(*Pointer<SByte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
		v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
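	// The 16-bit cases below mirror the 8-bit ones, only with wider loads and the
	// matching scale factors. As a worked example of the normalization: a UNORM
	// byte of 255 becomes 255 * (1.0f / 0xFF) = 1.0f, while an SNORM byte of -128
	// becomes -128 * (1.0f / 0x7F) = -1.0079..., which the Max(..., -1.0f) clamp
	// folds back to -1.0f, per Vulkan's rule that both -128 and -127 map to -1.0.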
	case VK_FORMAT_R16_UNORM:
	case VK_FORMAT_R16G16_UNORM:
	case VK_FORMAT_R16G16B16A16_UNORM:
		v.x = Float4(*Pointer<UShort4>(source0));
		v.y = Float4(*Pointer<UShort4>(source1));
		v.z = Float4(*Pointer<UShort4>(source2));
		v.w = Float4(*Pointer<UShort4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

		if(componentCount >= 1) v.x *= (1.0f / 0xFFFF);
		if(componentCount >= 2) v.y *= (1.0f / 0xFFFF);
		if(componentCount >= 3) v.z *= (1.0f / 0xFFFF);
		if(componentCount >= 4) v.w *= (1.0f / 0xFFFF);
		break;
	case VK_FORMAT_R16_SNORM:
	case VK_FORMAT_R16G16_SNORM:
	case VK_FORMAT_R16G16B16A16_SNORM:
		v.x = Float4(*Pointer<Short4>(source0));
		v.y = Float4(*Pointer<Short4>(source1));
		v.z = Float4(*Pointer<Short4>(source2));
		v.w = Float4(*Pointer<Short4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

		if(componentCount >= 1) v.x = Max(v.x * (1.0f / 0x7FFF), Float4(-1.0f));
		if(componentCount >= 2) v.y = Max(v.y * (1.0f / 0x7FFF), Float4(-1.0f));
		if(componentCount >= 3) v.z = Max(v.z * (1.0f / 0x7FFF), Float4(-1.0f));
		if(componentCount >= 4) v.w = Max(v.w * (1.0f / 0x7FFF), Float4(-1.0f));
		break;
	case VK_FORMAT_R16_USCALED:
	case VK_FORMAT_R16G16_USCALED:
	case VK_FORMAT_R16G16B16A16_USCALED:
		v.x = Float4(*Pointer<UShort4>(source0));
		v.y = Float4(*Pointer<UShort4>(source1));
		v.z = Float4(*Pointer<UShort4>(source2));
		v.w = Float4(*Pointer<UShort4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_SSCALED:
	case VK_FORMAT_R16G16_SSCALED:
	case VK_FORMAT_R16G16B16A16_SSCALED:
		v.x = Float4(*Pointer<Short4>(source0));
		v.y = Float4(*Pointer<Short4>(source1));
		v.z = Float4(*Pointer<Short4>(source2));
		v.w = Float4(*Pointer<Short4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16B16A16_SINT:
		v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16G16B16A16_UINT:
		v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R32_SINT:
	case VK_FORMAT_R32G32_SINT:
	case VK_FORMAT_R32G32B32_SINT:
	case VK_FORMAT_R32G32B32A32_SINT:
		v.x = *Pointer<Float4>(source0);
		v.y = *Pointer<Float4>(source1);
		v.z = *Pointer<Float4>(source2);
		v.w = *Pointer<Float4>(source3);

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R32_UINT:
	case VK_FORMAT_R32G32_UINT:
	case VK_FORMAT_R32G32B32_UINT:
	case VK_FORMAT_R32G32B32A32_UINT:
		v.x = *Pointer<Float4>(source0);
		v.y = *Pointer<Float4>(source1);
		v.z = *Pointer<Float4>(source2);
		v.w = *Pointer<Float4>(source3);

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
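	// The 16-bit float cases below convert half-precision values through the
	// precomputed Constants::half2float table rather than doing the exponent and
	// mantissa manipulation inline: the raw 16-bit pattern indexes a float table
	// (hence the '* 4' byte scaling). For example, the half bit pattern 0x3C00
	// (1.0) reads the float at byte offset 0x3C00 * 4, yielding 1.0f.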
	case VK_FORMAT_R16_SFLOAT:
	case VK_FORMAT_R16G16_SFLOAT:
	case VK_FORMAT_R16G16B16A16_SFLOAT:
		{
			if(componentCount >= 1)
			{
				UShort x0 = *Pointer<UShort>(source0 + 0);
				UShort x1 = *Pointer<UShort>(source1 + 0);
				UShort x2 = *Pointer<UShort>(source2 + 0);
				UShort x3 = *Pointer<UShort>(source3 + 0);

				v.x.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x0) * 4);
				v.x.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x1) * 4);
				v.x.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x2) * 4);
				v.x.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x3) * 4);
			}

			if(componentCount >= 2)
			{
				UShort y0 = *Pointer<UShort>(source0 + 2);
				UShort y1 = *Pointer<UShort>(source1 + 2);
				UShort y2 = *Pointer<UShort>(source2 + 2);
				UShort y3 = *Pointer<UShort>(source3 + 2);

				v.y.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y0) * 4);
				v.y.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y1) * 4);
				v.y.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y2) * 4);
				v.y.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y3) * 4);
			}

			if(componentCount >= 3)
			{
				UShort z0 = *Pointer<UShort>(source0 + 4);
				UShort z1 = *Pointer<UShort>(source1 + 4);
				UShort z2 = *Pointer<UShort>(source2 + 4);
				UShort z3 = *Pointer<UShort>(source3 + 4);

				v.z.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z0) * 4);
				v.z.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z1) * 4);
				v.z.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z2) * 4);
				v.z.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z3) * 4);
			}

			if(componentCount >= 4)
			{
				UShort w0 = *Pointer<UShort>(source0 + 6);
				UShort w1 = *Pointer<UShort>(source1 + 6);
				UShort w2 = *Pointer<UShort>(source2 + 6);
				UShort w3 = *Pointer<UShort>(source3 + 6);

				v.w.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w0) * 4);
				v.w.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w1) * 4);
				v.w.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w2) * 4);
				v.w.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w3) * 4);
			}
		}
		break;
	case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);

			// Shift left then arithmetic-shift right to sign-extend each 10-bit field.
			v.x = Float4((src << 22) >> 22);
			v.y = Float4((src << 12) >> 22);
			v.z = Float4((src << 02) >> 22);
			v.w = Float4(src >> 30);

			v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.w = Max(v.w, Float4(-1.0f));
		}
		break;
	case VK_FORMAT_A2R10G10B10_SINT_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);

			v.x = As<Float4>((src << 22) >> 22);
			v.y = As<Float4>((src << 12) >> 22);
			v.z = As<Float4>((src << 02) >> 22);
			v.w = As<Float4>(src >> 30);
		}
		break;
	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);

			v.x = Float4(src & Int4(0x3FF));
			v.y = Float4((src >> 10) & Int4(0x3FF));
			v.z = Float4((src >> 20) & Int4(0x3FF));
			v.w = Float4((src >> 30) & Int4(0x3));

			v.x *= Float4(1.0f / 0x3FF);
			v.y *= Float4(1.0f / 0x3FF);
			v.z *= Float4(1.0f / 0x3FF);
			v.w *= Float4(1.0f / 0x3);
		}
		break;
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);

			v.x = As<Float4>(src & Int4(0x3FF));
			v.y = As<Float4>((src >> 10) & Int4(0x3FF));
			v.z = As<Float4>((src >> 20) & Int4(0x3FF));
			v.w = As<Float4>((src >> 30) & Int4(0x3));
		}
		break;
	default:
		UNSUPPORTED("stream.format %d", int(stream.format));
	}

	if(bgra)
	{
		// Swap red and blue
		Float4 t = v.x;
		v.x = v.z;
		v.z = t;
	}
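	// Vulkan expands attributes with fewer components than the shader consumes to
	// (0, 0, 0, 1). The w default has to match the attribute's type: for float and
	// normalized formats it is the bit pattern of 1.0f, but for pure integer
	// formats it must be integer 1, hence the As<Float4>(Int4(1)) below, which
	// stores the integer bits unchanged in the float-typed register.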
	if(componentCount < 1) v.x = Float4(0.0f);
	if(componentCount < 2) v.y = Float4(0.0f);
	if(componentCount < 3) v.z = Float4(0.0f);
	if(componentCount < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(1));

	return v;
}

void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
{
	ASSERT(SIMD::Width == 4);

	UInt index0 = batch[0];
	UInt index1 = batch[1];
	UInt index2 = batch[2];
	UInt index3 = batch[3];

	UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
	UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
	UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
	UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;

	// We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
	// Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
	tagCache[cacheIndex3] = index3;
	tagCache[cacheIndex2] = index2;
	tagCache[cacheIndex1] = index1;
	tagCache[cacheIndex0] = index0;

	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
	if(it != spirvShader->outputBuiltins.end())
	{
		assert(it->second.SizeInComponents == 4);
		auto &position = routine.getVariable(it->second.Id);

		SIMD::Float4 pos;
		pos.x = position[it->second.FirstComponent + 0];
		pos.y = position[it->second.FirstComponent + 1];
		pos.z = position[it->second.FirstComponent + 2];
		pos.w = position[it->second.FirstComponent + 3];

		// Projection and viewport transform.
		SIMD::Float w = As<SIMD::Float>(As<SIMD::Int>(pos.w) | (As<SIMD::Int>(CmpEQ(pos.w, 0.0f)) & As<SIMD::Int>(SIMD::Float(1.0f))));
		SIMD::Float rhw = 1.0f / w;

		SIMD::Float4 proj;
		proj.x = As<SIMD::Float>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, X0xF))) + pos.x * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, WxF)))));
		proj.y = As<SIMD::Float>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, Y0xF))) + pos.y * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, HxF)))));
		proj.z = pos.z * rhw;
		proj.w = rhw;

		Float4 pos_x = Extract128(pos.x, 0);
		Float4 pos_y = Extract128(pos.y, 0);
		Float4 pos_z = Extract128(pos.z, 0);
		Float4 pos_w = Extract128(pos.w, 0);

		transpose4x4(pos_x, pos_y, pos_z, pos_w);

		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos_w;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos_z;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos_y;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos_x;

		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 3);
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 2);
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 1);
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 0);

		Float4 proj_x = Extract128(proj.x, 0);
		Float4 proj_y = Extract128(proj.y, 0);
		Float4 proj_z = Extract128(proj.z, 0);
		Float4 proj_w = Extract128(proj.w, 0);

		transpose4x4(proj_x, proj_y, proj_z, proj_w);

		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj_w;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj_z;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj_y;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj_x;
	}
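	// A note on the transform above: lanes with w == 0 would produce an infinite
	// rhw, so the bitwise OR with the CmpEQ mask substitutes w = 1.0 first
	// (0.0f has an all-zero bit pattern, so OR-ing in the bits of 1.0f is an
	// exact replacement). The X0xF/WxF viewport constants are presumed to be
	// pre-scaled to the rasterizer's fixed subpixel precision, which is why the
	// result goes through RoundIntClamped to an integer screen coordinate.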
	it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
	if(it != spirvShader->outputBuiltins.end())
	{
		ASSERT(it->second.SizeInComponents == 1);
		auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];

		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, pointSize)) = Extract(psize, 3);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, pointSize)) = Extract(psize, 2);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, pointSize)) = Extract(psize, 1);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, pointSize)) = Extract(psize, 0);
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputClipDistances();
		for(unsigned int i = 0; i < count; i++)
		{
			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 3);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 2);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 1);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 0);
		}
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputCullDistances();
		for(unsigned int i = 0; i < count; i++)
		{
			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 3);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 2);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 1);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 0);
		}
	}

	// Expand each bit of cullMask into a per-vertex all-ones or all-zeros mask:
	// -((cullMask >> n) & 1) maps a set bit to 0xFFFFFFFF and a clear bit to 0.
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullMask)) = -((cullMask >> 3) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullMask)) = -((cullMask >> 2) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullMask)) = -((cullMask >> 1) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullMask)) = -((cullMask >> 0) & 1);

	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
	{
		if(spirvShader->outputs[i + 0].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 1].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 2].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 3].Type != Spirv::ATTRIBTYPE_UNUSED)
		{
			Vector4f v;
			v.x = Extract128(routine.outputs[i + 0], 0);
			v.y = Extract128(routine.outputs[i + 1], 0);
			v.z = Extract128(routine.outputs[i + 2], 0);
			v.w = Extract128(routine.outputs[i + 3], 0);

			transpose4x4(v.x, v.y, v.z, v.w);

			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, v[i]), 16) = v.w;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, v[i]), 16) = v.z;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, v[i]), 16) = v.y;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, v[i]), 16) = v.x;
		}
	}
}
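// The shader produces results in structure-of-arrays form (one register holding,
// say, the x component of four vertices), so writeCache runs each 4x4 block
// through transpose4x4 to get array-of-structures layout, letting a single
// 16-byte store write one complete attribute of one vertex into its cache entry.
// Writing entry 3 first and entry 0 last means that even if several indices
// alias the same cache slot, the slot ends up holding vertex 0, which generate()
// copies out next via writeVertex below.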
void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
{
	*Pointer<Int4>(vertex + OFFSET(Vertex, position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, position));
	*Pointer<Float>(vertex + OFFSET(Vertex, pointSize)) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, pointSize));
	*Pointer<Int>(vertex + OFFSET(Vertex, clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, clipFlags));
	*Pointer<Int>(vertex + OFFSET(Vertex, cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, cullMask));
	*Pointer<Int4>(vertex + OFFSET(Vertex, projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, projected));

	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
	{
		if(spirvShader->outputs[i].Type != Spirv::ATTRIBTYPE_UNUSED)
		{
			*Pointer<Float>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, v[i]), 4);
		}
	}

	for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
	{
		*Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
	}

	for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
	{
		*Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
	}
}

}  // namespace sw