// Copyright 2022 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef rr_SIMD_hpp
#define rr_SIMD_hpp

#include "Reactor.hpp"

#include <functional>
#include <vector>

namespace rr {

namespace scalar {
using Int = rr::Int;
using UInt = rr::UInt;
using Float = rr::Float;
template<class T>
using Pointer = rr::Pointer<T>;
}  // namespace scalar

namespace packed {
using Int4 = rr::Int4;
using UInt4 = rr::UInt4;
using Float4 = rr::Float4;
}  // namespace packed

namespace SIMD {

extern const int Width;

class Int;
class UInt;
class Float;
class Pointer;

class Int : public LValue<SIMD::Int>,
            public XYZW<SIMD::Int>  // TODO(b/214583550): Eliminate and replace with SwizzleQuad() and/or other intrinsics.
{
public:
    explicit Int(RValue<SIMD::Float> cast);

    Int();
    Int(int broadcast);
    Int(int x, int y, int z, int w);
    Int(std::vector<int> v);
    Int(std::function<scalar::Int(int)> LaneValueProducer);
    Int(RValue<SIMD::Int> rhs);
    Int(const Int &rhs);
    Int(const Reference<SIMD::Int> &rhs);
    Int(RValue<SIMD::UInt> rhs);
    Int(const UInt &rhs);
    Int(const Reference<SIMD::UInt> &rhs);
    Int(RValue<scalar::Int> rhs);
    Int(const scalar::Int &rhs);
    Int(const Reference<scalar::Int> &rhs);

    template<int T>
    Int(const SwizzleMask1<packed::Int4, T> &rhs);

    RValue<SIMD::Int> operator=(int broadcast);
    RValue<SIMD::Int> operator=(RValue<SIMD::Int> rhs);
    RValue<SIMD::Int> operator=(const Int &rhs);
    RValue<SIMD::Int> operator=(const Reference<SIMD::Int> &rhs);

    static Type *type();
    static int element_count() { return SIMD::Width; }
};

class UInt : public LValue<SIMD::UInt>,
             public XYZW<SIMD::UInt>  // TODO(b/214583550): Eliminate and replace with SwizzleQuad() and/or other intrinsics.
{
public:
    explicit UInt(RValue<SIMD::Float> cast);

    UInt();
    UInt(int broadcast);
    UInt(int x, int y, int z, int w);
    UInt(std::vector<int> v);
    UInt(std::function<scalar::UInt(int)> LaneValueProducer);
    UInt(RValue<SIMD::UInt> rhs);
    UInt(const UInt &rhs);
    UInt(const Reference<SIMD::UInt> &rhs);
    UInt(RValue<SIMD::Int> rhs);
    UInt(const Int &rhs);
    UInt(const Reference<SIMD::Int> &rhs);
    UInt(RValue<scalar::UInt> rhs);
    UInt(const scalar::UInt &rhs);
    UInt(const Reference<scalar::UInt> &rhs);

    RValue<SIMD::UInt> operator=(RValue<SIMD::UInt> rhs);
    RValue<SIMD::UInt> operator=(const UInt &rhs);
    RValue<SIMD::UInt> operator=(const Reference<SIMD::UInt> &rhs);

    static Type *type();
    static int element_count() { return SIMD::Width; }
};
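// Illustrative sketch (not part of the API above): SIMD::Width is only known
// at runtime, so code using these classes is written width-agnostically, for
// example through the broadcast and lane-producer constructors declared on
// SIMD::Int and SIMD::UInt. Assuming the producer receives the lane index, a
// vector of ascending lane indices could be built as:
//
//     SIMD::Int laneIndex([](int lane) { return scalar::Int(lane); });
//
// while SIMD::Int(0) broadcasts zero to every lane regardless of Width.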
class Float : public LValue<SIMD::Float>,
              public XYZW<SIMD::Float>  // TODO(b/214583550): Eliminate and replace with SwizzleQuad() and/or other intrinsics.
{
public:
    explicit Float(RValue<SIMD::Int> cast);
    explicit Float(RValue<SIMD::UInt> cast);

    Float();
    Float(float broadcast);
    Float(float x, float y, float z, float w);
    Float(std::vector<float> v);
    Float(std::function<scalar::Float(int)> LaneValueProducer);
    Float(RValue<SIMD::Float> rhs);
    Float(const Float &rhs);
    Float(const Reference<SIMD::Float> &rhs);
    Float(RValue<scalar::Float> rhs);
    Float(const scalar::Float &rhs);
    Float(const Reference<scalar::Float> &rhs);

    Float(RValue<packed::Float4> rhs);
    RValue<SIMD::Float> operator=(RValue<packed::Float4> rhs);

    template<int T>
    Float(const SwizzleMask1<packed::Float4, T> &rhs);

    RValue<SIMD::Float> operator=(float broadcast);
    RValue<SIMD::Float> operator=(RValue<SIMD::Float> rhs);
    RValue<SIMD::Float> operator=(const Float &rhs);
    RValue<SIMD::Float> operator=(const Reference<SIMD::Float> &rhs);
    RValue<SIMD::Float> operator=(RValue<scalar::Float> rhs);
    RValue<SIMD::Float> operator=(const scalar::Float &rhs);
    RValue<SIMD::Float> operator=(const Reference<scalar::Float> &rhs);

    static SIMD::Float infinity();

    static Type *type();
    static int element_count() { return SIMD::Width; }
};

class Pointer
{
public:
    Pointer(scalar::Pointer<Byte> base, scalar::Int limit);
    Pointer(scalar::Pointer<Byte> base, unsigned int limit);
    Pointer(scalar::Pointer<Byte> base, scalar::Int limit, SIMD::Int offset);
    Pointer(scalar::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);
    Pointer(std::vector<scalar::Pointer<Byte>> pointers);
    explicit Pointer(SIMD::UInt cast);                          // Cast from 32-bit integers to 32-bit pointers
    explicit Pointer(SIMD::UInt castLow, SIMD::UInt castHigh);  // Cast from pairs of 32-bit integers to 64-bit pointers

    Pointer &operator+=(SIMD::Int i);
    Pointer operator+(SIMD::Int i);
    Pointer &operator+=(int i);
    Pointer operator+(int i);

    SIMD::Int offsets() const;

    SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
    bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;

    Int limit() const;

    // Returns true if all offsets are compile-time static and sequential
    // (N+0*step, N+1*step, N+2*step, N+3*step)
    bool hasStaticSequentialOffsets(unsigned int step) const;

    // Returns true if all offsets are compile-time static and equal
    // (N, N, N, N)
    bool hasStaticEqualOffsets() const;

    template<typename T>
    inline T Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));

    template<typename T>
    inline void Store(T val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

    template<typename T>
    inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

    scalar::Pointer<Byte> getUniformPointer() const;
    scalar::Pointer<Byte> getPointerForLane(int lane) const;
    static Pointer IfThenElse(SIMD::Int condition, const Pointer &lhs, const Pointer &rhs);

    void castTo(SIMD::UInt &bits) const;                              // Cast from 32-bit pointers to 32-bit integers
    void castTo(SIMD::UInt &lowerBits, SIMD::UInt &upperBits) const;  // Cast from 64-bit pointers to pairs of 32-bit integers

#ifdef ENABLE_RR_PRINT
    std::vector<rr::Value *> getPrintValues() const;
#endif

private:
    // Base address for the pointer, common across all lanes.
    scalar::Pointer<Byte> base;
    // Per-lane address for dealing with non-uniform data.
    std::vector<scalar::Pointer<Byte>> pointers;

public:
    // Upper (non-inclusive) limit for offsets from base.
    scalar::Int dynamicLimit;  // If hasDynamicLimit is false, dynamicLimit is zero.
    unsigned int staticLimit = 0;

    // Per-lane offsets from base.
    SIMD::Int dynamicOffsets;  // If hasDynamicOffsets is false, all dynamicOffsets are zero.
    std::vector<int> staticOffsets;

    bool hasDynamicLimit = false;    // True if dynamicLimit is non-zero.
    bool hasDynamicOffsets = false;  // True if any dynamicOffsets are non-zero.
    bool isBasePlusOffset = false;   // True if this uses base+offset. False if this is a collection of Pointers.
};
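// Illustrative sketch (not part of the API above): a typical use of
// SIMD::Pointer is a bounds-checked, masked load/store through a byte base
// pointer. The names buffer, bufferSize, elementIndex and activeLanes below
// are hypothetical:
//
//     SIMD::Pointer ptr(buffer /* scalar::Pointer<Byte> */, bufferSize);
//     ptr += elementIndex * SIMD::Int(4);  // 4 == sizeof(float)
//     SIMD::Float x = ptr.Load<SIMD::Float>(OutOfBoundsBehavior::Nullify, activeLanes);
//     ptr.Store(x * SIMD::Float(2.0f), OutOfBoundsBehavior::Nullify, activeLanes);
//
// Lanes whose mask bit is zero are not written, and with Nullify robustness
// out-of-bounds lanes read back as zero.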
}  // namespace SIMD

RValue<SIMD::Int> operator+(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator-(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator*(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator/(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator%(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator&(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator|(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator^(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator<<(RValue<SIMD::Int> lhs, unsigned char rhs);
RValue<SIMD::Int> operator>>(RValue<SIMD::Int> lhs, unsigned char rhs);
RValue<SIMD::Int> operator<<(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator>>(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator+=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator-=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator*=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
//	RValue<SIMD::Int> operator/=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
//	RValue<SIMD::Int> operator%=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator&=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator|=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator^=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator<<=(SIMD::Int &lhs, unsigned char rhs);
RValue<SIMD::Int> operator>>=(SIMD::Int &lhs, unsigned char rhs);
RValue<SIMD::Int> operator+(RValue<SIMD::Int> val);
RValue<SIMD::Int> operator-(RValue<SIMD::Int> val);
RValue<SIMD::Int> operator~(RValue<SIMD::Int> val);
//	RValue<SIMD::Int> operator++(SIMD::Int &val, int);   // Post-increment
//	const Int &operator++(SIMD::Int &val);   // Pre-increment
//	RValue<SIMD::Int> operator--(SIMD::Int &val, int);   // Post-decrement
//	const Int &operator--(SIMD::Int &val);   // Pre-decrement
//	RValue<Bool> operator<(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
//	RValue<Bool> operator<=(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
//	RValue<Bool> operator>(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
//	RValue<Bool> operator>=(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
//	RValue<Bool> operator!=(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
//	RValue<Bool> operator==(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);

RValue<SIMD::Int> CmpEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> CmpLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> CmpLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> CmpNLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> CmpNLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
inline RValue<SIMD::Int> CmpGT(RValue<SIMD::Int> x, RValue<SIMD::Int> y) { return CmpNLE(x, y); }
inline RValue<SIMD::Int> CmpGE(RValue<SIMD::Int> x, RValue<SIMD::Int> y) { return CmpNLT(x, y); }
RValue<SIMD::Int> Abs(RValue<SIMD::Int> x);
RValue<SIMD::Int> Max(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> Min(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
// Convert to nearest integer. If a converted value is outside of the integer
// range, the returned result is undefined.
RValue<SIMD::Int> RoundInt(RValue<SIMD::Float> cast);
// Rounds to the nearest integer, but clamps very large values to an
// implementation-dependent range.
// Specifically, on x86, values larger than 2147483583.0 are converted to
// 2147483583 (0x7FFFFFBF) instead of producing 0x80000000.
RValue<SIMD::Int> RoundIntClamped(RValue<SIMD::Float> cast);
RValue<scalar::Int> Extract(RValue<SIMD::Int> val, int i);
RValue<SIMD::Int> Insert(RValue<SIMD::Int> val, RValue<scalar::Int> element, int i);
RValue<packed::Int4> Extract128(RValue<SIMD::Int> val, int i);
RValue<SIMD::Int> Insert128(RValue<SIMD::Int> val, RValue<packed::Int4> element, int i);

RValue<SIMD::UInt> operator+(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator-(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator*(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator/(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator%(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator&(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator|(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator^(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator<<(RValue<SIMD::UInt> lhs, unsigned char rhs);
RValue<SIMD::UInt> operator>>(RValue<SIMD::UInt> lhs, unsigned char rhs);
RValue<SIMD::UInt> operator<<(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator>>(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator+=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator-=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator*=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
//	RValue<SIMD::UInt> operator/=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
//	RValue<SIMD::UInt> operator%=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator&=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator|=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator^=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator<<=(SIMD::UInt &lhs, unsigned char rhs);
RValue<SIMD::UInt> operator>>=(SIMD::UInt &lhs, unsigned char rhs);
RValue<SIMD::UInt> operator+(RValue<SIMD::UInt> val);
RValue<SIMD::UInt> operator-(RValue<SIMD::UInt> val);
RValue<SIMD::UInt> operator~(RValue<SIMD::UInt> val);
//	RValue<SIMD::UInt> operator++(SIMD::UInt &val, int);   // Post-increment
//	const UInt &operator++(SIMD::UInt &val);   // Pre-increment
//	RValue<SIMD::UInt> operator--(SIMD::UInt &val, int);   // Post-decrement
//	const UInt &operator--(SIMD::UInt &val);   // Pre-decrement
//	RValue<Bool> operator<(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
//	RValue<Bool> operator<=(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
//	RValue<Bool> operator>(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
//	RValue<Bool> operator>=(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
//	RValue<Bool> operator!=(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
//	RValue<Bool> operator==(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);

RValue<SIMD::UInt> CmpEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> CmpLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> CmpLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> CmpNEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> CmpNLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> CmpNLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
inline RValue<SIMD::UInt> CmpGT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y) { return CmpNLE(x, y); }
inline RValue<SIMD::UInt> CmpGE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y) { return CmpNLT(x, y); }
RValue<SIMD::UInt> Max(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> Min(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<scalar::UInt> Extract(RValue<SIMD::UInt> val, int i);
RValue<SIMD::UInt> Insert(RValue<SIMD::UInt> val, RValue<scalar::UInt> element, int i);
RValue<packed::UInt4> Extract128(RValue<SIMD::UInt> val, int i);
RValue<SIMD::UInt> Insert128(RValue<SIMD::UInt> val, RValue<packed::UInt4> element, int i);
//	RValue<SIMD::UInt> RoundInt(RValue<SIMD::Float> cast);

RValue<SIMD::Float> operator+(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator-(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator*(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator/(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator%(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator+=(SIMD::Float &lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator-=(SIMD::Float &lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator*=(SIMD::Float &lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator/=(SIMD::Float &lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator%=(SIMD::Float &lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator+(RValue<SIMD::Float> val);
RValue<SIMD::Float> operator-(RValue<SIMD::Float> val);

// Computes `x * y + z`, which may be fused into one operation to produce a higher-precision result.
RValue<SIMD::Float> MulAdd(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z);
// Computes a fused `x * y + z` operation. Caps::fmaIsFast indicates whether it emits an FMA instruction.
RValue<SIMD::Float> FMA(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z);

RValue<SIMD::Float> Abs(RValue<SIMD::Float> x);
RValue<SIMD::Float> Max(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Float> Min(RValue<SIMD::Float> x, RValue<SIMD::Float> y);

RValue<SIMD::Float> Rcp(RValue<SIMD::Float> x, bool relaxedPrecision, bool exactAtPow2 = false);
RValue<SIMD::Float> RcpSqrt(RValue<SIMD::Float> x, bool relaxedPrecision);
RValue<SIMD::Float> Sqrt(RValue<SIMD::Float> x);
RValue<SIMD::Float> Insert(RValue<SIMD::Float> val, RValue<scalar::Float> element, int i);
RValue<scalar::Float> Extract(RValue<SIMD::Float> x, int i);
RValue<packed::Float4> Extract128(RValue<SIMD::Float> val, int i);
RValue<SIMD::Float> Insert128(RValue<SIMD::Float> val, RValue<packed::Float4> element, int i);

// Ordered comparison functions
RValue<SIMD::Int> CmpEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
inline RValue<SIMD::Int> CmpGT(RValue<SIMD::Float> x, RValue<SIMD::Float> y) { return CmpNLE(x, y); }
inline RValue<SIMD::Int> CmpGE(RValue<SIMD::Float> x, RValue<SIMD::Float> y) { return CmpNLT(x, y); }

// Unordered comparison functions
RValue<SIMD::Int> CmpUEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpULT(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpULE(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpUNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpUNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpUNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
inline RValue<SIMD::Int> CmpUGT(RValue<SIMD::Float> x, RValue<SIMD::Float> y) { return CmpUNLE(x, y); }
inline RValue<SIMD::Int> CmpUGE(RValue<SIMD::Float> x, RValue<SIMD::Float> y) { return CmpUNLT(x, y); }

RValue<SIMD::Int> IsInf(RValue<SIMD::Float> x);
RValue<SIMD::Int> IsNan(RValue<SIMD::Float> x);
RValue<SIMD::Float> Round(RValue<SIMD::Float> x);
RValue<SIMD::Float> Trunc(RValue<SIMD::Float> x);
RValue<SIMD::Float> Frac(RValue<SIMD::Float> x);
RValue<SIMD::Float> Floor(RValue<SIMD::Float> x);
RValue<SIMD::Float> Ceil(RValue<SIMD::Float> x);

// Trigonometric functions
RValue<SIMD::Float> Sin(RValue<SIMD::Float> x);
RValue<SIMD::Float> Cos(RValue<SIMD::Float> x);
RValue<SIMD::Float> Tan(RValue<SIMD::Float> x);
RValue<SIMD::Float> Asin(RValue<SIMD::Float> x);
RValue<SIMD::Float> Acos(RValue<SIMD::Float> x);
RValue<SIMD::Float> Atan(RValue<SIMD::Float> x);
RValue<SIMD::Float> Sinh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Cosh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Tanh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Asinh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Acosh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Atanh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Atan2(RValue<SIMD::Float> x, RValue<SIMD::Float> y);

// Exponential functions
RValue<SIMD::Float> Pow(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Float> Exp(RValue<SIMD::Float> x);
RValue<SIMD::Float> Log(RValue<SIMD::Float> x);
RValue<SIMD::Float> Exp2(RValue<SIMD::Float> x);
RValue<SIMD::Float> Log2(RValue<SIMD::Float> x);

RValue<scalar::Int> SignMask(RValue<SIMD::Int> x);
RValue<SIMD::UInt> Ctlz(RValue<SIMD::UInt> x, bool isZeroUndef);
RValue<SIMD::UInt> Cttz(RValue<SIMD::UInt> x, bool isZeroUndef);
RValue<SIMD::Int> MulHigh(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::UInt> MulHigh(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);

RValue<Bool> AnyTrue(const RValue<SIMD::Int> &bools);
RValue<Bool> AnyFalse(const RValue<SIMD::Int> &bools);
RValue<Bool> Divergent(const RValue<SIMD::Int> &ints);

RValue<SIMD::Int> Swizzle(RValue<SIMD::Int> x, uint16_t select);
RValue<SIMD::UInt> Swizzle(RValue<SIMD::UInt> x, uint16_t select);
RValue<SIMD::Float> Swizzle(RValue<SIMD::Float> x, uint16_t select);

RValue<SIMD::Int> Shuffle(RValue<SIMD::Int> x, RValue<SIMD::Int> y, uint16_t select);
RValue<SIMD::UInt> Shuffle(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y, uint16_t select);
RValue<SIMD::Float> Shuffle(RValue<SIMD::Float> x, RValue<SIMD::Float> y, uint16_t select);

RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes = false);
RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes = false);
void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment);
void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment);

template<>
inline RValue<SIMD::Int>::RValue(int i)
    : val(broadcast(i, SIMD::Int::type()))
{
    RR_DEBUG_INFO_EMIT_VAR(val);
}

template<>
inline RValue<SIMD::UInt>::RValue(unsigned int i)
    : val(broadcast(int(i), SIMD::UInt::type()))
{
    RR_DEBUG_INFO_EMIT_VAR(val);
}

template<>
inline RValue<SIMD::Float>::RValue(float f)
    : val(broadcast(f, SIMD::Float::type()))
{
    RR_DEBUG_INFO_EMIT_VAR(val);
}

template<int T>
SIMD::Int::Int(const SwizzleMask1<packed::Int4, T> &rhs)
    : XYZW(this)
{
    *this = rhs.operator RValue<scalar::Int>();
}

template<int T>
SIMD::Float::Float(const SwizzleMask1<packed::Float4, T> &rhs)
    : XYZW(this)
{
    *this = rhs.operator RValue<scalar::Float>();
}
template<typename T>
inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
    using EL = typename Scalar<T>::Type;

    if(!isBasePlusOffset)
    {
        T out = T(0);
        for(int i = 0; i < SIMD::Width; i++)
        {
            If(Extract(mask, i) != 0)
            {
                auto el = rr::Load(scalar::Pointer<EL>(pointers[i]), alignment, atomic, order);
                out = Insert(out, el, i);
            }
        }
        return out;
    }

    if(isStaticallyInBounds(sizeof(float), robustness))
    {
        // All elements are statically known to be in-bounds.
        // We can avoid costly conditional on masks.

        if(hasStaticSequentialOffsets(sizeof(float)))
        {
            // Offsets are sequential. Perform regular load.
            return rr::Load(scalar::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
        }

        if(hasStaticEqualOffsets())
        {
            // Load one, replicate.
            return T(*scalar::Pointer<EL>(base + staticOffsets[0], alignment));
        }
    }
    else
    {
        switch(robustness)
        {
        case OutOfBoundsBehavior::Nullify:
        case OutOfBoundsBehavior::RobustBufferAccess:
        case OutOfBoundsBehavior::UndefinedValue:
            mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
            break;
        case OutOfBoundsBehavior::UndefinedBehavior:
            // Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
            break;
        }
    }

    auto offs = offsets();

    if(!atomic && order == std::memory_order_relaxed)
    {
        if(hasStaticEqualOffsets())
        {
            // Load one, replicate.
            // Be careful of the case where the post-bounds-check mask
            // is 0, in which case we must not load.
            T out = T(0);
            If(AnyTrue(mask))
            {
                EL el = *scalar::Pointer<EL>(base + staticOffsets[0], alignment);
                out = T(el);
            }
            return out;
        }

        bool zeroMaskedLanes = true;
        switch(robustness)
        {
        case OutOfBoundsBehavior::Nullify:
        case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
            zeroMaskedLanes = true;
            break;
        case OutOfBoundsBehavior::UndefinedValue:
        case OutOfBoundsBehavior::UndefinedBehavior:
            zeroMaskedLanes = false;
            break;
        }

        // TODO(b/195446858): Optimize static sequential offsets case by using masked load.

        return Gather(scalar::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
    }
    else
    {
        T out;
        auto anyLanesDisabled = AnyFalse(mask);
        If(hasStaticEqualOffsets() && !anyLanesDisabled)
        {
            // Load one, replicate.
            auto offset = Extract(offs, 0);
            out = T(rr::Load(scalar::Pointer<EL>(&base[offset]), alignment, atomic, order));
        }
        Else If(hasStaticSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
        {
            // Load all elements in a single SIMD instruction.
            auto offset = Extract(offs, 0);
            out = rr::Load(scalar::Pointer<T>(&base[offset]), alignment, atomic, order);
        }
        Else
        {
            // Divergent offsets or masked lanes.
            out = T(0);
            for(int i = 0; i < SIMD::Width; i++)
            {
                If(Extract(mask, i) != 0)
                {
                    auto offset = Extract(offs, i);
                    auto el = rr::Load(scalar::Pointer<EL>(&base[offset]), alignment, atomic, order);
                    out = Insert(out, el, i);
                }
            }
        }
        return out;
    }
}

template<>
inline SIMD::Pointer SIMD::Pointer::Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
    std::vector<scalar::Pointer<Byte>> pointers(SIMD::Width);

    for(int i = 0; i < SIMD::Width; i++)
    {
        If(Extract(mask, i) != 0)
        {
            pointers[i] = rr::Load(scalar::Pointer<scalar::Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
        }
    }

    return SIMD::Pointer(pointers);
}

template<typename T>
inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
    using EL = typename Scalar<T>::Type;
    constexpr size_t alignment = sizeof(float);

    if(!isBasePlusOffset)
    {
        for(int i = 0; i < SIMD::Width; i++)
        {
            If(Extract(mask, i) != 0)
            {
                rr::Store(Extract(val, i), scalar::Pointer<EL>(pointers[i]), alignment, atomic, order);
            }
        }
        return;
    }

    auto offs = offsets();
    switch(robustness)
    {
    case OutOfBoundsBehavior::Nullify:
    case OutOfBoundsBehavior::RobustBufferAccess:       // TODO: Allows writing anywhere within bounds. Could be faster than masking.
    case OutOfBoundsBehavior::UndefinedValue:           // Should not be used for store operations. Treat as robust buffer access.
        mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
        break;
    case OutOfBoundsBehavior::UndefinedBehavior:
        // Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
        break;
    }

    if(!atomic && order == std::memory_order_relaxed)
    {
        if(hasStaticEqualOffsets())
        {
            If(AnyTrue(mask))
            {
                assert(SIMD::Width == 4);

                // All equal. One of these writes will win -- elect the winning lane.
                auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
                auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
                auto maskedVal = As<SIMD::Int>(val) & elect;
                auto scalarVal = Extract(maskedVal, 0) |
                                 Extract(maskedVal, 1) |
                                 Extract(maskedVal, 2) |
                                 Extract(maskedVal, 3);
                *scalar::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
            }
        }
        else if(hasStaticSequentialOffsets(sizeof(float)) &&
                isStaticallyInBounds(sizeof(float), robustness))
        {
            // TODO(b/195446858): Optimize using masked store.
            // Pointer has no elements OOB, and the store is not atomic.
            // Perform a read-modify-write.
            auto p = scalar::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
            auto prev = *p;
            *p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
        }
        else
        {
            Scatter(scalar::Pointer<EL>(base), val, offs, mask, alignment);
        }
    }
    else
    {
        auto anyLanesDisabled = AnyFalse(mask);
        If(hasStaticSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
        {
            // Store all elements in a single SIMD instruction.
            auto offset = Extract(offs, 0);
            rr::Store(val, scalar::Pointer<T>(&base[offset]), alignment, atomic, order);
        }
        Else
        {
            // Divergent offsets or masked lanes.
            for(int i = 0; i < SIMD::Width; i++)
            {
                If(Extract(mask, i) != 0)
                {
                    auto offset = Extract(offs, i);
                    rr::Store(Extract(val, i), scalar::Pointer<EL>(&base[offset]), alignment, atomic, order);
                }
            }
        }
    }
}

template<>
inline void SIMD::Pointer::Store(SIMD::Pointer val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
    constexpr size_t alignment = sizeof(void *);

    for(int i = 0; i < SIMD::Width; i++)
    {
        If(Extract(mask, i) != 0)
        {
            rr::Store(val.getPointerForLane(i), scalar::Pointer<scalar::Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
        }
    }
}

template<typename T>
inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
    Store(T(val), robustness, mask, atomic, order);
}

}  // namespace rr

#endif  // rr_SIMD_hpp