Improve clipping and fix culling normal computation

2024-11-03 14:12:36 +01:00 · 2024-11-03 14:12:36 +01:00 · debc5b219b
commit debc5b219b
parent f1a1c2199f
3 changed files with 251 additions and 55 deletions
--- a/Engine/Graphics/3DRenderer.cpp
+++ b/Engine/Graphics/3DRenderer.cpp
@ -25,7 +25,7 @@
 //   * https://en.wikipedia.org/wiki/Hidden-surface_determination#Occlusion_culling
 //   * https://en.wikipedia.org/wiki/Bounding_volume_hierarchy

-static bool VertexIsInsideClipSpace(M3D_F4& V);
+static bool VertexClipTest(M3D_F4& V, sf::Vector2f& RTsize, float gb_factor);

 Graphic3DRenderer::Graphic3DRenderer() {
    mRTSize = {1280.f, 324.f};
@ -140,28 +140,27 @@ void Graphic3DRenderer::Draw(sf::RenderTexture& context) {
                        break;

                    // Triangle clipping
-                    //TODO: implement complete Cohen-Sutherland algo or similar
-                    if (VertexIsInsideClipSpace(projVertices[indicePtr[i]]) &&
-                        VertexIsInsideClipSpace(projVertices[indicePtr[i+1]]) &&
-                        VertexIsInsideClipSpace(projVertices[indicePtr[i+2]])) 
+                    //TODO: scissor/clipping depending on how many vertices are outside/inside the clip space; implement a complete Cohen-Sutherland algo or similar
+                    if (VertexClipTest(projVertices[indicePtr[i]], mRTSize, 2.5f) &&
+                        VertexClipTest(projVertices[indicePtr[i+1]], mRTSize, 2.5f) &&
+                        VertexClipTest(projVertices[indicePtr[i+2]], mRTSize, 2.5f)) 
                    {
                        
                        M3D_VECTOR V1 = M3D_V4LoadF4(&projVertices[indicePtr[i]]);
                        M3D_VECTOR V2 = M3D_V4LoadF4(&projVertices[indicePtr[i+1]]);
                        M3D_VECTOR V3 = M3D_V4LoadF4(&projVertices[indicePtr[i+2]]);

+                        // Do the perspective divide
+                        V1 = M3D_V4Divide(V1, M3D_V4SplatW(V1));
+                        V2 = M3D_V4Divide(V2, M3D_V4SplatW(V2));
+                        V3 = M3D_V4Divide(V3, M3D_V4SplatW(V3));
+
+                        V1 = M3D_V3TransformNDCToViewport(V1, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
+                        V2 = M3D_V3TransformNDCToViewport(V2, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
+                        V3 = M3D_V3TransformNDCToViewport(V3, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
+
                        // Face culling
-                        M3D_VECTOR faceNormal = M3D_TNormal(V1,V2,V3);
-                        if (M3D_V4GetX(M3D_V3Dot(V1, faceNormal)) >= 0) {
-                            // Do the perspective divide
-                            V1 = M3D_V4Divide(V1, M3D_V4SplatW(V1));
-                            V2 = M3D_V4Divide(V2, M3D_V4SplatW(V2));
-                            V3 = M3D_V4Divide(V3, M3D_V4SplatW(V3));
-
-                            V1 = M3D_V3TransformNDCToViewport(V1, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
-                            V2 = M3D_V3TransformNDCToViewport(V2, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
-                            V3 = M3D_V3TransformNDCToViewport(V3, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
-
+                        if (M3D_V4GetX(M3D_TNormal(V1,V2,V3))*0.5f <= 0) {
                            if (objInFrustrum == DISJOINT) {
                                v_tri[0].color = sf::Color::Red;
                                v_tri[1].color = sf::Color::Red;
@ -205,9 +204,9 @@ void Graphic3DRenderer::UpdateInternalTestObjects() {
    mRenderList[3]->SetRotation(0.f, thetaAngle, 0.f);
 }

-inline static bool VertexIsInsideClipSpace(M3D_F4& V) {
-    return (V.x > -V.w && V.x < V.w &&
-            V.y > -V.w && V.y < V.w &&
-            V.z > 0 && V.z < V.w
+// Guard-band clip test on a clip-space vertex (i.e. before the perspective divide).
+// Returns true when |V.x| < RTsize.x*gb_factor*V.w and |V.y| < RTsize.y*gb_factor*V.w.
+// Both limits are proportional to V.w, so with positive RTsize and gb_factor a vertex
+// with V.w <= 0 can never pass. Depth (V.z) is not tested here.
+// NOTE(review): the previous test bounded x and y by +/-V.w directly; scaling the bound by
+// the render-target size in pixels gives a much wider band than "2-3x the viewport" -- confirm
+// this is intended.
+inline static bool VertexClipTest(M3D_F4& V, sf::Vector2f& RTsize, float gb_factor) {
+    // A guard band is usually 2-3x the viewport size for the clipping test
+    const float limitX = RTsize.x*gb_factor*V.w;   // horizontal bound, symmetric around 0
+    const float limitY = RTsize.y*gb_factor*V.w;   // vertical bound, symmetric around 0
+    return (V.x > -limitX && V.x < limitX &&
+            V.y > -limitY && V.y < limitY
         );
 }
--- a/Engine/Utils/3DMaths.hpp
+++ b/Engine/Utils/3DMaths.hpp
@ -419,6 +419,11 @@ M3D_VECTOR M3D_V3Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
 M3D_VECTOR M3D_V3LengthSq(M3D_VECTOR V) noexcept;
 M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept;
 M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept;
+M3D_VECTOR M3D_V2Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;    // dot product of the x/y parts, same value in every component
+M3D_VECTOR M3D_V2Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;  // scalar 2D cross product V1.x*V2.y - V1.y*V2.x, same value in every component
+M3D_VECTOR M3D_V2LengthSq(M3D_VECTOR V) noexcept;               // squared length of the x/y part (M3D_V2Dot(V, V))
+M3D_VECTOR M3D_V2Length(M3D_VECTOR V) noexcept;                 // length of the x/y part (square root of M3D_V2LengthSq)
+M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept;              // V scaled by 1 / (length of its x/y part); z and w are scaled too


 #ifndef DISABLE_INTRINSICS
--- a/Engine/Utils/3DMaths_vec.inl
+++ b/Engine/Utils/3DMaths_vec.inl
@ -65,16 +65,16 @@ inline M3D_VECTOR M3D_V4LoadF3(const M3D_F3* src) noexcept {
    V.v4f[2] = src->z;
    V.v4f[3] = 0.f;
    return V;
-/*
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
    __m128 z = _mm_load_ss(&src->z);
    return _mm_insert_ps(xy, z, 0x20);
-*/
-#else
+/*
+#elif defined(SSE_INTRINSICS)
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
    __m128 z = _mm_load_ss(&src->z);
    return _mm_movelh_ps(xy, z);
+*/
 #endif
 }

@ -97,16 +97,16 @@ inline void M3D_V4StoreF3(M3D_F3* dst, M3D_VECTOR V) noexcept {
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
-/*
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    *reinterpret_cast<int*>(&dst->x) = _mm_extract_ps(V, 0);
    *reinterpret_cast<int*>(&dst->y) = _mm_extract_ps(V, 1);
    *reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
-*/
-#else
+/*
+#elif defined(SSE_INTRINSICS)
    _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
    __m128 z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    _mm_store_ss(&dst->z, z);
+*/
 #endif
 }

@ -115,15 +115,15 @@ inline void M3D_V4StoreF3A(M3D_F3A* dst, M3D_VECTOR V) noexcept {
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
-/*
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
    *reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
-*/
-#else
+/*
+#elif defined(SSE_INTRINSICS)
    _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
    __m128 z = _mm_movehl_ps(V, V);
    _mm_store_ss(&dst->z, z);
+*/
 #endif
 }

@ -434,11 +434,12 @@ inline M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept {
            V.v4f[3]
        }}};
    return U.v;
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    M3D_VECTOR vResult = _mm_set_ss(y);
    vResult = _mm_insert_ps(V, vResult, 0x10);
    return vResult;
-#else
+/*
+#elif defined(SSE_INTRINSICS)
    // Swap y and x
    M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
    // Convert input to vector
@ -448,6 +449,7 @@ inline M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept {
    // Swap y and x again
    vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
    return vResult;
+*/
 #endif
 }

@ -460,11 +462,12 @@ inline M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept {
            V.v4f[3]
        }}};
    return U.v;
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    M3D_VECTOR vResult = _mm_set_ss(z);
    vResult = _mm_insert_ps(V, vResult, 0x20);
    return vResult;
-#else
+/*
+#elif defined(SSE_INTRINSICS)
    // Swap z and x
    M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
    // Convert input to vector
@ -474,6 +477,7 @@ inline M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept {
    // Swap z and x again
    vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
    return vResult;
+*/
 #endif
 }

@ -486,11 +490,12 @@ inline M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept {
            w
        }}};
    return U.v;
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    M3D_VECTOR vResult = _mm_set_ss(w);
    vResult = _mm_insert_ps(V, vResult, 0x30);
    return vResult;
-#else
+/*
+#elif defined(SSE_INTRINSICS)
    // Swap w and x
    M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
    // Convert input to vector
@ -500,6 +505,7 @@ inline M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept {
    // Swap w and x again
    vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
    return vResult;
+*/
 #endif
 }

@ -693,9 +699,10 @@ inline M3D_VECTOR M3D_V4Round(M3D_VECTOR V) noexcept {
            M3D_Internal::round_to_nearest(V.v4f[3])
        } } };
    return Result.v;
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-#else
+/*
+#elif defined(SSE_INTRINSICS)
    __m128 sign = _mm_and_ps(V, M3D_MNegativeZero);
    __m128 sMagic = _mm_or_ps(M3D_MNoFraction, sign);
    __m128 R1 = _mm_add_ps(V, sMagic);
@ -706,6 +713,7 @@ inline M3D_VECTOR M3D_V4Round(M3D_VECTOR V) noexcept {
    R1 = _mm_and_ps(R1, mask);
    M3D_VECTOR vResult = _mm_xor_ps(R1, R2);
    return vResult;
+*/
 #endif
 }

@ -827,8 +835,9 @@ inline M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
        Result.f[2] =
        Result.f[3] = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1] + V1.v4f[2] * V2.v4f[2] + V1.v4f[3] * V2.v4f[3];
    return Result.v;
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    return _mm_dp_ps(V1, V2, 0xff);
+/*
 #elif defined(SSE3_INTRINSICS)
    M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
    vTemp = _mm_hadd_ps(vTemp, vTemp);
@ -841,6 +850,7 @@ inline M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
    vTemp = _mm_shuffle_ps(vTemp, vTemp2, _MM_SHUFFLE(0, 3, 0, 0));  // Copy W to the Z position
    vTemp = _mm_add_ps(vTemp, vTemp2);           // Add Z and W together
    return M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(2, 2, 2, 2));    // Splat Z and return
+*/
 #endif
 }

@ -856,16 +866,17 @@ inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept {
    Result = M3D_V4Sqrt(Result);

    return Result;
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0xff);
    return _mm_sqrt_ps(vTemp);
+/*
 #elif defined(SSE3_INTRINSICS)
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
-#else
+#elif defined(SSE_INTRINSICS)
    // Perform the dot product on x,y,z and w
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    // vTemp has z and w
@ -883,6 +894,7 @@ inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept {
    // Get the length
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
+*/
 #endif
 }

@ -1066,14 +1078,15 @@ inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
        vResult.f[2] =
        vResult.f[3] = fValue;
    return vResult.v;
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    return _mm_dp_ps(V1, V2, 0x7f);
+/*
 #elif defined(SSE3_INTRINSICS)
    M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
    vTemp = _mm_and_ps(vTemp, g_XMMask3);
    vTemp = _mm_hadd_ps(vTemp, vTemp);
    return _mm_hadd_ps(vTemp, vTemp);
-#else
+#elif defined(SSE_INTRINSICS)
    // Perform the dot product
    M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
    // x=Dot.v4f[1], y=Dot.v4f[2]
@ -1086,6 +1099,7 @@ inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
    vDot = _mm_add_ss(vDot, vTemp);
    // Splat x
    return M3D_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
+*/
 #endif
 }

@ -1130,9 +1144,10 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
    Result = M3D_V4Sqrt(Result);

    return Result;
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
    return _mm_sqrt_ps(vTemp);
+/*
 #elif defined(SSE3_INTRINSICS)
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
@ -1140,7 +1155,7 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
-#else
+#elif defined(SSE_INTRINSICS)
    // Perform the dot product on x,y and z
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    // vTemp has z and y
@ -1156,6 +1171,7 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
    // Get the length
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
+*/
 #endif
 }

@ -1174,8 +1190,7 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
    vResult.v4f[2] = V.v4f[2] * fLength;
    vResult.v4f[3] = V.v4f[3] * fLength;
    return vResult;
-
-#elif defined(SSE4_INTRINSICS)
+#else  // SSE4_INTRINSICS
    M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
    // Prepare for the division
    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
@ -1185,16 +1200,17 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
-    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
+/*
 #elif defined(SSE3_INTRINSICS)
    // Perform the dot product on x,y and z only
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
@ -1209,17 +1225,17 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
-    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
-#else
+#elif defined(SSE_INTRINSICS)
    // Perform the dot product on x,y and z only
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
@ -1245,6 +1261,182 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
+*/
+#endif
+}
+
+// 2D dot product: V1.x*V2.x + V1.y*V2.y, written to every component of the result.
+inline M3D_VECTOR M3D_V2Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
+#ifdef DISABLE_INTRINSICS
+    // Scalar path: compute the sum once, then broadcast it
+    const float fDot = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1];
+    M3D_V4F32 vSplat;
+    vSplat.f[0] = fDot;
+    vSplat.f[1] = fDot;
+    vSplat.f[2] = fDot;
+    vSplat.f[3] = fDot;
+    return vSplat.v;
+#else  // SSE4_INTRINSICS
+    // Mask 0x3f: multiply only the x and y lanes (high nibble 0x3) and
+    // store the sum in all four lanes (low nibble 0xf)
+    M3D_VECTOR vDot = _mm_dp_ps(V1, V2, 0x3f);
+    return vDot;
+/*
+#elif defined(SSE3_INTRINSICS)
+    M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
+    vDot = _mm_hadd_ps(vDot, vDot);
+    vDot = _mm_moveldup_ps(vDot);
+    return vDot;
+#elif defined(SSE_INTRINSICS)
+    // Perform the dot product on x and y
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V1, V2);
+    // vTemp has y splatted
+    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    return vLengthSq;
+*/
+#endif
+}
+
+inline M3D_VECTOR M3D_V2Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {  // scalar 2D cross product of the x/y parts of V1 and V2
+    // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] -- the same scalar is replicated across the result (all four components in the scalar path)
+#ifdef DISABLE_INTRINSICS
+    float fCross = (V1.v4f[0] * V2.v4f[1]) - (V1.v4f[1] * V2.v4f[0]);
+    M3D_V4F32 vResult;
+    vResult.f[0] =
+        vResult.f[1] =
+        vResult.f[2] =
+        vResult.f[3] = fCross;  // chained assignment: every component receives fCross
+    return vResult.v;
+#else
+    // Swap x and y of V2 (assumes M3D_PERMUTE_PS is a lane shuffle of its first argument -- TODO confirm)
+    M3D_VECTOR vResult = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 1, 0, 1));
+    // Perform the muls: lane 0 = V1.x*V2.y, lane 1 = V1.y*V2.x
+    vResult = _mm_mul_ps(vResult, V1);
+    // Splat y (the V1.y*V2.x product)
+    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(1, 1, 1, 1));
+    // Sub the values (lowest lane only): V1.x*V2.y - V1.y*V2.x
+    vResult = _mm_sub_ss(vResult, vTemp);
+    // Splat the cross product
+    vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 0, 0, 0));
+    return vResult;
+#endif
+}
+
+// Squared length of the 2D (x, y) part of V, i.e. the 2D dot product of V with itself.
+inline M3D_VECTOR M3D_V2LengthSq(M3D_VECTOR V) noexcept {
+    M3D_VECTOR vSelfDot = M3D_V2Dot(V, V);
+    return vSelfDot;
+}
+
+// Length of the 2D (x, y) part of V: square root of the 2D squared length.
+inline M3D_VECTOR M3D_V2Length(M3D_VECTOR V) noexcept {
+#ifdef DISABLE_INTRINSICS
+    // Scalar path: reuse the squared-length helper, then take the square root
+    M3D_VECTOR vLengthSq = M3D_V2LengthSq(V);
+    return M3D_V4Sqrt(vLengthSq);
+#else  // SSE4_INTRINSICS
+    // Mask 0x3f: sum the x and y products only and store the sum in all four lanes
+    return _mm_sqrt_ps(_mm_dp_ps(V, V, 0x3f));
+/*
+#elif defined(SSE3_INTRINSICS)
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    M3D_VECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_sqrt_ss(vTemp);
+    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    return vLengthSq;
+#elif defined(SSE_INTRINSICS)
+    // Perform the dot product on x and y
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    // vTemp has y splatted
+    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    vLengthSq = _mm_sqrt_ps(vLengthSq);
+    return vLengthSq;
+*/
+#endif
+}
+
+inline M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept {  // scale V by 1 / (length of its x/y part); all four components are scaled
+#ifdef DISABLE_INTRINSICS
+    M3D_VECTOR vResult = M3D_V2Length(V);
+    float fLength = vResult.v4f[0];
+
+    // Prevent divide by zero: when the length is not > 0, fLength is left as is and used directly as the scale
+    if (fLength > 0) {
+        fLength = 1.0f / fLength;
+    }
+
+    vResult.v4f[0] = V.v4f[0] * fLength;
+    vResult.v4f[1] = V.v4f[1] * fLength;
+    vResult.v4f[2] = V.v4f[2] * fLength;  // z and w are scaled by the same 2D factor
+    vResult.v4f[3] = V.v4f[3] * fLength;
+    return vResult;
+#else  // SSE4_INTRINSICS
+    M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);  // 0x3f: sum the x and y products only, store the sum in all four lanes
+    // Prepare for the division
+    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    M3D_VECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length vectors
+    // Build a mask that is clear in lanes where the squared length equals M3D_MInfinity
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
+    // Divide to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Zero-length input: vZeroMask is all zeros, so the result is cleared
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
+    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+/*
+#elif defined(SSE3_INTRINSICS)
+    // Perform the dot product on x and y only
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_moveldup_ps(vLengthSq);
+    // Prepare for the division
+    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    M3D_VECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length vectors
+    // Build a mask that is clear in lanes where the squared length equals M3D_MInfinity
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
+    // Divide to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Zero-length input: vZeroMask is all zeros, so the result is cleared
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
+    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+#elif defined(SSE_INTRINSICS)
+    // Perform the dot product on x and y only
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    // Prepare for the division
+    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    M3D_VECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length vectors
+    // Build a mask that is clear in lanes where the squared length equals M3D_MInfinity
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
+    // Divide to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Zero-length input: vZeroMask is all zeros, so the result is cleared
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
+    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+*/
 #endif
 }

@ -1317,7 +1509,7 @@ inline M3D_VECTOR M3D_TNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexc
    M3D_VECTOR L1 = M3D_V4Subtract(P2, P1);
    M3D_VECTOR L2 = M3D_V4Subtract(P3, P1);

-    return M3D_V3Normalize(M3D_V3Cross(L2, L1));
+    return M3D_V2Normalize(M3D_V2Cross(L2, L1));
 }