diff --git a/Engine/Graphics/3DRenderer.cpp b/Engine/Graphics/3DRenderer.cpp
index 0f50440..e285418 100644
--- a/Engine/Graphics/3DRenderer.cpp
+++ b/Engine/Graphics/3DRenderer.cpp
@@ -25,7 +25,7 @@
 // * https://en.wikipedia.org/wiki/Hidden-surface_determination#Occlusion_culling
 // * https://en.wikipedia.org/wiki/Bounding_volume_hierarchy
 
-static bool VertexIsInsideClipSpace(M3D_F4& V);
+static bool VertexClipTest(M3D_F4& V, sf::Vector2f& RTsize, float gb_factor);
 
 Graphic3DRenderer::Graphic3DRenderer() {
     mRTSize = {1280.f, 324.f};
@@ -140,28 +140,27 @@ void Graphic3DRenderer::Draw(sf::RenderTexture& context) {
                 break;
 
             // Triangle clipping
-            //TODO: implement complete Cohen-Sutherland algo or similar
-            if (VertexIsInsideClipSpace(projVertices[indicePtr[i]]) &&
-                VertexIsInsideClipSpace(projVertices[indicePtr[i+1]]) &&
-                VertexIsInsideClipSpace(projVertices[indicePtr[i+2]]))
+            //TODO: scissor/clipping depending on how many vertices are outside/inside the clip space, implement complete Cohen-Sutherland algo or similar
+            if (VertexClipTest(projVertices[indicePtr[i]], mRTSize, 2.5f) &&
+                VertexClipTest(projVertices[indicePtr[i+1]], mRTSize, 2.5f) &&
+                VertexClipTest(projVertices[indicePtr[i+2]], mRTSize, 2.5f))
             {
                 M3D_VECTOR V1 = M3D_V4LoadF4(&projVertices[indicePtr[i]]);
                 M3D_VECTOR V2 = M3D_V4LoadF4(&projVertices[indicePtr[i+1]]);
                 M3D_VECTOR V3 = M3D_V4LoadF4(&projVertices[indicePtr[i+2]]);
 
+                // Do the perspective divide
+                V1 = M3D_V4Divide(V1, M3D_V4SplatW(V1));
+                V2 = M3D_V4Divide(V2, M3D_V4SplatW(V2));
+                V3 = M3D_V4Divide(V3, M3D_V4SplatW(V3));
+
+                V1 = M3D_V3TransformNDCToViewport(V1, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
+                V2 = M3D_V3TransformNDCToViewport(V2, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
+                V3 = M3D_V3TransformNDCToViewport(V3, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
+
                 // Face culling
-                M3D_VECTOR faceNormal = M3D_TNormal(V1,V2,V3);
-                if (M3D_V4GetX(M3D_V3Dot(V1, faceNormal)) >= 0) {
-                    // Do the perspective divide
-                    V1 = M3D_V4Divide(V1, M3D_V4SplatW(V1));
-                    V2 = M3D_V4Divide(V2, M3D_V4SplatW(V2));
-                    V3 = M3D_V4Divide(V3, M3D_V4SplatW(V3));
-
-                    V1 = M3D_V3TransformNDCToViewport(V1, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
-                    V2 = M3D_V3TransformNDCToViewport(V2, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
-                    V3 = M3D_V3TransformNDCToViewport(V3, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
-
+                if (M3D_V4GetX(M3D_TNormal(V1,V2,V3))*0.5f <= 0) {
                     if (objInFrustrum == DISJOINT) {
                         v_tri[0].color = sf::Color::Red;
                         v_tri[1].color = sf::Color::Red;
@@ -205,9 +204,9 @@ void Graphic3DRenderer::UpdateInternalTestObjects() {
     mRenderList[3]->SetRotation(0.f, thetaAngle, 0.f);
 }
 
-inline static bool VertexIsInsideClipSpace(M3D_F4& V) {
-    return (V.x > -V.w && V.x < V.w &&
-            V.y > -V.w && V.y < V.w &&
-            V.z > 0 && V.z < V.w
+inline static bool VertexClipTest(M3D_F4& V, sf::Vector2f& RTsize, float gb_factor) {
+    // Guard bands are usually 2-3x the viewport size for the clipping test
+    return (V.x > -RTsize.x*gb_factor*V.w && V.x < RTsize.x*gb_factor*V.w &&
+            V.y > -RTsize.y*gb_factor*V.w && V.y < RTsize.y*gb_factor*V.w
     );
 }
\ No newline at end of file
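The reworked Draw() path above now tests each vertex against a guard band while still in clip space, then divides by w and maps to the viewport, and only afterwards does the face culling on the projected positions. For reference (not part of the patch), here is a minimal scalar sketch of the first two steps using plain structs instead of the engine's M3D_* types; the helper names are illustrative, the guard-band bound is written in the common |x| < gb_factor*w form whereas VertexClipTest above also folds the render-target size into the bound, and the exact conventions of M3D_V3TransformNDCToViewport (y direction, depth-range handling) are assumed rather than checked:

struct Vec4 { float x, y, z, w; }; // clip-space vertex, before the divide by w

// Accept vertices inside an enlarged frustum so triangles that only partly leave
// the screen are still rasterized; gb_factor of roughly 2-3 widens the usual |x| < w bound.
bool InsideGuardBand(const Vec4& v, float gb_factor) {
    return v.x > -gb_factor * v.w && v.x < gb_factor * v.w &&
           v.y > -gb_factor * v.w && v.y < gb_factor * v.w;
}

// Perspective divide, then a typical NDC-to-viewport mapping with a top-left
// origin (y flipped) and a [zmin, zmax] depth range.
Vec4 ToViewport(Vec4 v, float width, float height, float zmin, float zmax) {
    v.x /= v.w; v.y /= v.w; v.z /= v.w;               // perspective divide
    Vec4 s;
    s.x = (v.x * 0.5f + 0.5f) * width;                // [-1,1] -> [0,width]
    s.y = (1.f - (v.y * 0.5f + 0.5f)) * height;       // [-1,1] -> [0,height], y down
    s.z = zmin + (v.z * 0.5f + 0.5f) * (zmax - zmin); // [-1,1] -> [zmin,zmax]
    s.w = v.w;
    return s;
}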
diff --git a/Engine/Utils/3DMaths.hpp b/Engine/Utils/3DMaths.hpp
index 73abddf..51eb4c8 100644
--- a/Engine/Utils/3DMaths.hpp
+++ b/Engine/Utils/3DMaths.hpp
@@ -419,6 +419,11 @@
 M3D_VECTOR M3D_V3Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
 M3D_VECTOR M3D_V3LengthSq(M3D_VECTOR V) noexcept;
 M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept;
 M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept;
+M3D_VECTOR M3D_V2Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
+M3D_VECTOR M3D_V2Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
+M3D_VECTOR M3D_V2LengthSq(M3D_VECTOR V) noexcept;
+M3D_VECTOR M3D_V2Length(M3D_VECTOR V) noexcept;
+M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept;
 
 #ifndef DISABLE_INTRINSICS
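The five declarations added above mirror the existing V3/V4 helpers but only consider the x and y lanes, which is what the new screen-space face-culling path needs: the 2D cross product of two projected edges is twice the triangle's signed area, and its sign gives the winding. A usage sketch (not part of the patch; TriangleIsBackFacing is an illustrative name, and which sign counts as back-facing depends on the winding convention and the y-down viewport):

// Assumes 3DMaths.hpp is included; M3D_V4Subtract and M3D_V4GetX are the
// existing helpers already used elsewhere in this patch.
inline bool TriangleIsBackFacing(M3D_VECTOR p1, M3D_VECTOR p2, M3D_VECTOR p3) {
    M3D_VECTOR e1 = M3D_V4Subtract(p2, p1); // edge p1 -> p2 (z/w lanes are ignored by the V2 ops)
    M3D_VECTOR e2 = M3D_V4Subtract(p3, p1); // edge p1 -> p3
    return M3D_V4GetX(M3D_V2Cross(e1, e2)) <= 0.f; // M3D_V2Cross splats e1.x*e2.y - e1.y*e2.x
}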
diff --git a/Engine/Utils/3DMaths_vec.inl b/Engine/Utils/3DMaths_vec.inl
index bd259de..ba107a7 100644
--- a/Engine/Utils/3DMaths_vec.inl
+++ b/Engine/Utils/3DMaths_vec.inl
@@ -65,16 +65,16 @@ inline M3D_VECTOR M3D_V4LoadF3(const M3D_F3* src) noexcept {
     V.v4f[2] = src->z;
     V.v4f[3] = 0.f;
     return V;
-/*
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
     __m128 z = _mm_load_ss(&src->z);
     return _mm_insert_ps(xy, z, 0x20);
-*/
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
     __m128 z = _mm_load_ss(&src->z);
     return _mm_movelh_ps(xy, z);
+*/
 #endif
 }
 
@@ -97,16 +97,16 @@ inline void M3D_V4StoreF3(M3D_F3* dst, M3D_VECTOR V) noexcept {
     dst->x = V.v4f[0];
     dst->y = V.v4f[1];
     dst->z = V.v4f[2];
-/*
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     *reinterpret_cast<int*>(&dst->x) = _mm_extract_ps(V, 0);
     *reinterpret_cast<int*>(&dst->y) = _mm_extract_ps(V, 1);
     *reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
-*/
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
     __m128 z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
     _mm_store_ss(&dst->z, z);
+*/
 #endif
 }
 
@@ -115,15 +115,15 @@ inline void M3D_V4StoreF3A(M3D_F3A* dst, M3D_VECTOR V) noexcept {
     dst->x = V.v4f[0];
     dst->y = V.v4f[1];
     dst->z = V.v4f[2];
-/*
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
     *reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
-*/
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
     __m128 z = _mm_movehl_ps(V, V);
     _mm_store_ss(&dst->z, z);
+*/
 #endif
 }
 
@@ -434,11 +434,12 @@ inline M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept {
         V.v4f[3]
     }}};
     return U.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vResult = _mm_set_ss(y);
     vResult = _mm_insert_ps(V, vResult, 0x10);
     return vResult;
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     // Swap y and x
     M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
     // Convert input to vector
@@ -448,6 +449,7 @@ inline M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept {
     // Swap y and x again
     vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
     return vResult;
+*/
 #endif
 }
 
@@ -460,11 +462,12 @@ inline M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept {
         V.v4f[3]
     }}};
     return U.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vResult = _mm_set_ss(z);
     vResult = _mm_insert_ps(V, vResult, 0x20);
     return vResult;
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     // Swap z and x
     M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
     // Convert input to vector
@@ -474,6 +477,7 @@ inline M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept {
     // Swap z and x again
     vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
     return vResult;
+*/
 #endif
 }
 
@@ -486,11 +490,12 @@ inline M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept {
         w
     }}};
     return U.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vResult = _mm_set_ss(w);
     vResult = _mm_insert_ps(V, vResult, 0x30);
     return vResult;
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     // Swap w and x
     M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
     // Convert input to vector
@@ -500,6 +505,7 @@ inline M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept {
     // Swap w and x again
     vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
     return vResult;
+*/
 #endif
 }
 
@@ -693,9 +699,10 @@ inline M3D_VECTOR M3D_V4Round(M3D_VECTOR V) noexcept {
         M3D_Internal::round_to_nearest(V.v4f[3])
     } } };
     return Result.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     __m128 sign = _mm_and_ps(V, M3D_MNegativeZero);
     __m128 sMagic = _mm_or_ps(M3D_MNoFraction, sign);
     __m128 R1 = _mm_add_ps(V, sMagic);
@@ -706,6 +713,7 @@ inline M3D_VECTOR M3D_V4Round(M3D_VECTOR V) noexcept {
     R1 = _mm_and_ps(R1, mask);
     M3D_VECTOR vResult = _mm_xor_ps(R1, R2);
     return vResult;
+*/
 #endif
 }
 
@@ -827,8 +835,9 @@ inline M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
     Result.f[2] =
    Result.f[3] = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1] + V1.v4f[2] * V2.v4f[2] + V1.v4f[3] * V2.v4f[3];
     return Result.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     return _mm_dp_ps(V1, V2, 0xff);
+/*
 #elif defined(SSE3_INTRINSICS)
     M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
     vTemp = _mm_hadd_ps(vTemp, vTemp);
@@ -841,6 +850,7 @@ inline M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
     vTemp = _mm_shuffle_ps(vTemp, vTemp2, _MM_SHUFFLE(0, 3, 0, 0)); // Copy W to the Z position
     vTemp = _mm_add_ps(vTemp, vTemp2); // Add Z and W together
     return M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(2, 2, 2, 2)); // Splat Z and return
+*/
 #endif
 }
 
@@ -856,16 +866,17 @@ inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept {
     Result = M3D_V4Sqrt(Result);
     return Result;
 
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0xff);
     return _mm_sqrt_ps(vTemp);
+/*
 #elif defined(SSE3_INTRINSICS)
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
     vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
     vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
     vLengthSq = _mm_sqrt_ps(vLengthSq);
     return vLengthSq;
-#else
+#elif defined(SSE_INTRINSICS)
     // Perform the dot product on x,y,z and w
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
     // vTemp has z and w
@@ -883,6 +894,7 @@ inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept {
     // Get the length
     vLengthSq = _mm_sqrt_ps(vLengthSq);
     return vLengthSq;
+*/
 #endif
 }
 
@@ -1066,14 +1078,15 @@ inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
     vResult.f[2] =
     vResult.f[3] = fValue;
     return vResult.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     return _mm_dp_ps(V1, V2, 0x7f);
+/*
 #elif defined(SSE3_INTRINSICS)
     M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
     vTemp = _mm_and_ps(vTemp, g_XMMask3);
     vTemp = _mm_hadd_ps(vTemp, vTemp);
     return _mm_hadd_ps(vTemp, vTemp);
-#else
+#elif defined(SSE_INTRINSICS)
     // Perform the dot product
     M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
     // x=Dot.v4f[1], y=Dot.v4f[2]
@@ -1086,6 +1099,7 @@ inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
     vDot = _mm_add_ss(vDot, vTemp);
     // Splat x
     return M3D_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
+*/
 #endif
 }
 
@@ -1130,9 +1144,10 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
     Result = M3D_V4Sqrt(Result);
     return Result;
 
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
     return _mm_sqrt_ps(vTemp);
+/*
 #elif defined(SSE3_INTRINSICS)
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
     vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
@@ -1140,7 +1155,7 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
     vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
     vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
     vLengthSq = _mm_sqrt_ps(vLengthSq);
     return vLengthSq;
-#else
+#elif defined(SSE_INTRINSICS)
     // Perform the dot product on x,y and z
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
     // vTemp has z and y
@@ -1156,6 +1171,7 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
     // Get the length
     vLengthSq = _mm_sqrt_ps(vLengthSq);
     return vLengthSq;
+*/
 #endif
 }
 
@@ -1174,8 +1190,7 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
     vResult.v4f[2] = V.v4f[2] * fLength;
     vResult.v4f[3] = V.v4f[3] * fLength;
     return vResult;
-
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
     // Prepare for the division
     M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
@@ -1185,16 +1200,17 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
     vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
     // Failsafe on zero (Or epsilon) length planes
     // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
     // Divide to perform the normalization
     vResult = _mm_div_ps(V, vResult);
     // Any that are infinity, set to zero
     vResult = _mm_and_ps(vResult, vZeroMask);
     // Select qnan or result based on infinite length
-    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
     M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
     vResult = _mm_or_ps(vTemp1, vTemp2);
     return vResult;
+/*
 #elif defined(SSE3_INTRINSICS)
     // Perform the dot product on x,y and z only
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
@@ -1209,17 +1225,17 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
     vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
     // Failsafe on zero (Or epsilon) length planes
     // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
     // Divide to perform the normalization
     vResult = _mm_div_ps(V, vResult);
     // Any that are infinity, set to zero
     vResult = _mm_and_ps(vResult, vZeroMask);
     // Select qnan or result based on infinite length
-    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
     M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
     vResult = _mm_or_ps(vTemp1, vTemp2);
     return vResult;
-#else
+#elif defined(SSE_INTRINSICS)
     // Perform the dot product on x,y and z only
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
     M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
@@ -1245,6 +1261,182 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
     M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
     vResult = _mm_or_ps(vTemp1, vTemp2);
     return vResult;
+*/
+#endif
+}
+
+inline M3D_VECTOR M3D_V2Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
+#ifdef DISABLE_INTRINSICS
+    M3D_V4F32 Result;
+    Result.f[0] =
+    Result.f[1] =
+    Result.f[2] =
+    Result.f[3] = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1];
+    return Result.v;
+#else // SSE4_INTRINSICS
+    return _mm_dp_ps(V1, V2, 0x3f);
+/*
+#elif defined(SSE3_INTRINSICS)
+    M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
+    vDot = _mm_hadd_ps(vDot, vDot);
+    vDot = _mm_moveldup_ps(vDot);
+    return vDot;
+#elif defined(SSE_INTRINSICS)
+    // Perform the dot product on x and y
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V1, V2);
+    // vTemp has y splatted
+    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    return vLengthSq;
+*/
+#endif
+}
+
+inline M3D_VECTOR M3D_V2Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
+    // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ]
+#ifdef DISABLE_INTRINSICS
+    float fCross = (V1.v4f[0] * V2.v4f[1]) - (V1.v4f[1] * V2.v4f[0]);
+    M3D_V4F32 vResult;
+    vResult.f[0] =
+    vResult.f[1] =
+    vResult.f[2] =
+    vResult.f[3] = fCross;
+    return vResult.v;
+#else
+    // Swap x and y
+    M3D_VECTOR vResult = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 1, 0, 1));
+    // Perform the muls
+    vResult = _mm_mul_ps(vResult, V1);
+    // Splat y
+    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(1, 1, 1, 1));
+    // Sub the values
+    vResult = _mm_sub_ss(vResult, vTemp);
+    // Splat the cross product
+    vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 0, 0, 0));
+    return vResult;
+#endif
+}
+
+inline M3D_VECTOR M3D_V2LengthSq(M3D_VECTOR V) noexcept {
+    return M3D_V2Dot(V, V);
+}
+
+inline M3D_VECTOR M3D_V2Length(M3D_VECTOR V) noexcept {
+#ifdef DISABLE_INTRINSICS
+    M3D_VECTOR Result;
+    Result = M3D_V2LengthSq(V);
+    Result = M3D_V4Sqrt(Result);
+    return Result;
+#else // SSE4_INTRINSICS
+    M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0x3f);
+    return _mm_sqrt_ps(vTemp);
+/*
+#elif defined(SSE3_INTRINSICS)
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    M3D_VECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_sqrt_ss(vTemp);
+    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    return vLengthSq;
+#elif defined(SSE_INTRINSICS)
+    // Perform the dot product on x and y
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    // vTemp has y splatted
+    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    vLengthSq = _mm_sqrt_ps(vLengthSq);
+    return vLengthSq;
+*/
+#endif
+}
+
+inline M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept {
+#ifdef DISABLE_INTRINSICS
+    M3D_VECTOR vResult = M3D_V2Length(V);
+    float fLength = vResult.v4f[0];
+
+    // Prevent divide by zero
+    if (fLength > 0) {
+        fLength = 1.0f / fLength;
+    }
+
+    vResult.v4f[0] = V.v4f[0] * fLength;
+    vResult.v4f[1] = V.v4f[1] * fLength;
+    vResult.v4f[2] = V.v4f[2] * fLength;
+    vResult.v4f[3] = V.v4f[3] * fLength;
+    return vResult;
+#else // SSE4_INTRINSICS
+    M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
+    // Prepare for the division
+    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    M3D_VECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
+    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+/*
+#elif defined(SSE3_INTRINSICS)
+    // Perform the dot product on x and y only
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_moveldup_ps(vLengthSq);
+    // Prepare for the division
+    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    M3D_VECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
+    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+#elif defined(SSE_INTRINSICS)
+    // Perform the dot product on x and y only
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    M3D_VECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    // Prepare for the division
+    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    M3D_VECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
+    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+*/
 #endif
 }
 
@@ -1317,7 +1509,7 @@ inline M3D_VECTOR M3D_TNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexc
     M3D_VECTOR L1 = M3D_V4Subtract(P2, P1);
     M3D_VECTOR L2 = M3D_V4Subtract(P3, P1);
 
-    return M3D_V3Normalize(M3D_V3Cross(L2, L1));
+    return M3D_V2Normalize(M3D_V2Cross(L2, L1));
 }
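With the SSE3/SSE fallbacks commented out above, every active #else branch now assumes SSE4.1 (_mm_dp_ps, _mm_insert_ps, _mm_extract_ps, _mm_round_ps), and M3D_TNormal reduces to the normalized 2D cross product of two edges, so the value Draw() reads back through M3D_V4GetX is effectively just the sign of the screen-space winding rather than a 3D surface normal. The _mm_dp_ps immediate packs two nibbles: the high four bits select which input lanes enter the dot product and the low four select which output lanes receive it, so 0xff, 0x7f and 0x3f produce the 4-, 3- and 2-component dots used in this file. A small standalone check of that encoding (not engine code; requires SSE4.1, e.g. compile with -msse4.1):

#include <smmintrin.h> // SSE4.1: _mm_dp_ps
#include <cstdio>

int main() {
    __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f); // lanes: x=1, y=2, z=3, w=4
    __m128 b = _mm_set_ps(8.f, 7.f, 6.f, 5.f); // lanes: x=5, y=6, z=7, w=8

    float d4[4], d3[4], d2[4];
    _mm_storeu_ps(d4, _mm_dp_ps(a, b, 0xff)); // x,y,z,w terms -> 70 in every lane
    _mm_storeu_ps(d3, _mm_dp_ps(a, b, 0x7f)); // x,y,z terms   -> 38 in every lane
    _mm_storeu_ps(d2, _mm_dp_ps(a, b, 0x3f)); // x,y terms     -> 17 in every lane

    std::printf("4D %.0f  3D %.0f  2D %.0f\n", d4[0], d3[0], d2[0]);
    return 0;
}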