diff --git a/Engine/Graphics/3DRenderer.cpp b/Engine/Graphics/3DRenderer.cpp
index 0f50440..e285418 100644
--- a/Engine/Graphics/3DRenderer.cpp
+++ b/Engine/Graphics/3DRenderer.cpp
@@ -25,7 +25,7 @@
 // * https://en.wikipedia.org/wiki/Hidden-surface_determination#Occlusion_culling
 // * https://en.wikipedia.org/wiki/Bounding_volume_hierarchy
 
-static bool VertexIsInsideClipSpace(M3D_F4& V);
+static bool VertexClipTest(M3D_F4& V, sf::Vector2f& RTsize, float gb_factor);
 
 Graphic3DRenderer::Graphic3DRenderer() {
     mRTSize = {1280.f, 324.f};
@@ -140,28 +140,27 @@ void Graphic3DRenderer::Draw(sf::RenderTexture& context) {
                 break;
 
             // Triangle clipping
-            //TODO: implement complete Cohen-Sutherland algo or similar
-            if (VertexIsInsideClipSpace(projVertices[indicePtr[i]]) &&
-                VertexIsInsideClipSpace(projVertices[indicePtr[i+1]]) &&
-                VertexIsInsideClipSpace(projVertices[indicePtr[i+2]]))
+            //TODO: scissor/clipping depending on how many vertices are outside/inside the clip space, implement complete Cohen-Sutherland algo or similar
+            if (VertexClipTest(projVertices[indicePtr[i]], mRTSize, 2.5f) &&
+                VertexClipTest(projVertices[indicePtr[i+1]], mRTSize, 2.5f) &&
+                VertexClipTest(projVertices[indicePtr[i+2]], mRTSize, 2.5f))
             {
                 M3D_VECTOR V1 = M3D_V4LoadF4(&projVertices[indicePtr[i]]);
                 M3D_VECTOR V2 = M3D_V4LoadF4(&projVertices[indicePtr[i+1]]);
                 M3D_VECTOR V3 = M3D_V4LoadF4(&projVertices[indicePtr[i+2]]);
 
+                // Do the perspective divide
+                V1 = M3D_V4Divide(V1, M3D_V4SplatW(V1));
+                V2 = M3D_V4Divide(V2, M3D_V4SplatW(V2));
+                V3 = M3D_V4Divide(V3, M3D_V4SplatW(V3));
+
+                V1 = M3D_V3TransformNDCToViewport(V1, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
+                V2 = M3D_V3TransformNDCToViewport(V2, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
+                V3 = M3D_V3TransformNDCToViewport(V3, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
+
                 // Face culling
-                M3D_VECTOR faceNormal = M3D_TNormal(V1,V2,V3);
-                if (M3D_V4GetX(M3D_V3Dot(V1, faceNormal)) >= 0) {
-                    // Do the perspective divide
-                    V1 = M3D_V4Divide(V1, M3D_V4SplatW(V1));
-                    V2 = M3D_V4Divide(V2, M3D_V4SplatW(V2));
-                    V3 = M3D_V4Divide(V3, M3D_V4SplatW(V3));
-
-                    V1 = M3D_V3TransformNDCToViewport(V1, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
-                    V2 = M3D_V3TransformNDCToViewport(V2, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
-                    V3 = M3D_V3TransformNDCToViewport(V3, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
-
+                if (M3D_V4GetX(M3D_TNormal(V1,V2,V3))*0.5f <= 0) {
                     if (objInFrustrum == DISJOINT) {
                         v_tri[0].color = sf::Color::Red;
                         v_tri[1].color = sf::Color::Red;
@@ -205,9 +204,9 @@ void Graphic3DRenderer::UpdateInternalTestObjects() {
     mRenderList[3]->SetRotation(0.f, thetaAngle, 0.f);
 }
 
-inline static bool VertexIsInsideClipSpace(M3D_F4& V) {
-    return (V.x > -V.w && V.x < V.w &&
-            V.y > -V.w && V.y < V.w &&
-            V.z > 0 && V.z < V.w
+inline static bool VertexClipTest(M3D_F4& V, sf::Vector2f& RTsize, float gb_factor) {
+    // Guard bands are usually 2-3x the viewport size for the clipping test
+    return (V.x > -RTsize.x*gb_factor*V.w && V.x < RTsize.x*gb_factor*V.w &&
+            V.y > -RTsize.y*gb_factor*V.w && V.y < RTsize.y*gb_factor*V.w
     );
 }
\ No newline at end of file
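The reworked Draw() path above now tests each vertex against a guard band while still in clip space, then divides by w and maps to the viewport, and only afterwards does the face culling on the projected positions. For reference (not part of the patch), here is a minimal scalar sketch of the first two steps using plain structs instead of the engine's M3D_* types; the helper names are illustrative, the guard-band bound is written in the common |x| < gb_factor*w form whereas VertexClipTest above also folds the render-target size into the bound, and the exact conventions of M3D_V3TransformNDCToViewport (y direction, depth-range handling) are assumed rather than checked:

struct Vec4 { float x, y, z, w; }; // clip-space vertex, before the divide by w

// Accept vertices inside an enlarged frustum so triangles that only partly leave
// the screen are still rasterized; gb_factor of roughly 2-3 widens the usual |x| < w bound.
bool InsideGuardBand(const Vec4& v, float gb_factor) {
    return v.x > -gb_factor * v.w && v.x < gb_factor * v.w &&
           v.y > -gb_factor * v.w && v.y < gb_factor * v.w;
}

// Perspective divide, then a typical NDC-to-viewport mapping with a top-left
// origin (y flipped) and a [zmin, zmax] depth range.
Vec4 ToViewport(Vec4 v, float width, float height, float zmin, float zmax) {
    v.x /= v.w; v.y /= v.w; v.z /= v.w;               // perspective divide
    Vec4 s;
    s.x = (v.x * 0.5f + 0.5f) * width;                // [-1,1] -> [0,width]
    s.y = (1.f - (v.y * 0.5f + 0.5f)) * height;       // [-1,1] -> [0,height], y down
    s.z = zmin + (v.z * 0.5f + 0.5f) * (zmax - zmin); // [-1,1] -> [zmin,zmax]
    s.w = v.w;
    return s;
}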
diff --git a/Engine/Utils/3DMaths.hpp b/Engine/Utils/3DMaths.hpp
index 73abddf..51eb4c8 100644
--- a/Engine/Utils/3DMaths.hpp
+++ b/Engine/Utils/3DMaths.hpp
@@ -419,6 +419,11 @@
 M3D_VECTOR M3D_V3Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
 M3D_VECTOR M3D_V3LengthSq(M3D_VECTOR V) noexcept;
 M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept;
 M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept;
+M3D_VECTOR M3D_V2Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
+M3D_VECTOR M3D_V2Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
+M3D_VECTOR M3D_V2LengthSq(M3D_VECTOR V) noexcept;
+M3D_VECTOR M3D_V2Length(M3D_VECTOR V) noexcept;
+M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept;
 
 #ifndef DISABLE_INTRINSICS
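The five declarations added above mirror the existing V3/V4 helpers but only consider the x and y lanes, which is what the new screen-space face-culling path needs: the 2D cross product of two projected edges is twice the triangle's signed area, and its sign gives the winding. A usage sketch (not part of the patch; TriangleIsBackFacing is an illustrative name, and which sign counts as back-facing depends on the winding convention and the y-down viewport):

// Assumes 3DMaths.hpp is included; M3D_V4Subtract and M3D_V4GetX are the
// existing helpers already used elsewhere in this patch.
inline bool TriangleIsBackFacing(M3D_VECTOR p1, M3D_VECTOR p2, M3D_VECTOR p3) {
    M3D_VECTOR e1 = M3D_V4Subtract(p2, p1); // edge p1 -> p2 (z/w lanes are ignored by the V2 ops)
    M3D_VECTOR e2 = M3D_V4Subtract(p3, p1); // edge p1 -> p3
    return M3D_V4GetX(M3D_V2Cross(e1, e2)) <= 0.f; // M3D_V2Cross splats e1.x*e2.y - e1.y*e2.x
}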
diff --git a/Engine/Utils/3DMaths_vec.inl b/Engine/Utils/3DMaths_vec.inl
index bd259de..ba107a7 100644
--- a/Engine/Utils/3DMaths_vec.inl
+++ b/Engine/Utils/3DMaths_vec.inl
@@ -65,16 +65,16 @@ inline M3D_VECTOR M3D_V4LoadF3(const M3D_F3* src) noexcept {
     V.v4f[2] = src->z;
     V.v4f[3] = 0.f;
     return V;
-/*
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
     __m128 z = _mm_load_ss(&src->z);
     return _mm_insert_ps(xy, z, 0x20);
-*/
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
     __m128 z = _mm_load_ss(&src->z);
     return _mm_movelh_ps(xy, z);
+*/
 #endif
 }
 
@@ -97,16 +97,16 @@ inline void M3D_V4StoreF3(M3D_F3* dst, M3D_VECTOR V) noexcept {
     dst->x = V.v4f[0];
     dst->y = V.v4f[1];
     dst->z = V.v4f[2];
-/*
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     *reinterpret_cast<int*>(&dst->x) = _mm_extract_ps(V, 0);
     *reinterpret_cast<int*>(&dst->y) = _mm_extract_ps(V, 1);
     *reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
-*/
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
     __m128 z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
     _mm_store_ss(&dst->z, z);
+*/
 #endif
 }
 
@@ -115,15 +115,15 @@ inline void M3D_V4StoreF3A(M3D_F3A* dst, M3D_VECTOR V) noexcept {
     dst->x = V.v4f[0];
     dst->y = V.v4f[1];
     dst->z = V.v4f[2];
-/*
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
     *reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
-*/
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
     __m128 z = _mm_movehl_ps(V, V);
     _mm_store_ss(&dst->z, z);
+*/
 #endif
 }
 
@@ -434,11 +434,12 @@ inline M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept {
         V.v4f[3]
     }}};
     return U.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vResult = _mm_set_ss(y);
     vResult = _mm_insert_ps(V, vResult, 0x10);
     return vResult;
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     // Swap y and x
     M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
     // Convert input to vector
@@ -448,6 +449,7 @@ inline M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept {
     // Swap y and x again
     vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
     return vResult;
+*/
 #endif
 }
 
@@ -460,11 +462,12 @@ inline M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept {
         V.v4f[3]
     }}};
     return U.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vResult = _mm_set_ss(z);
     vResult = _mm_insert_ps(V, vResult, 0x20);
     return vResult;
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     // Swap z and x
     M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
     // Convert input to vector
@@ -474,6 +477,7 @@ inline M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept {
     // Swap z and x again
     vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
     return vResult;
+*/
 #endif
 }
 
@@ -486,11 +490,12 @@ inline M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept {
         w
     }}};
     return U.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vResult = _mm_set_ss(w);
     vResult = _mm_insert_ps(V, vResult, 0x30);
     return vResult;
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     // Swap w and x
     M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
     // Convert input to vector
@@ -500,6 +505,7 @@ inline M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept {
     // Swap w and x again
     vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
     return vResult;
+*/
 #endif
 }
 
@@ -693,9 +699,10 @@ inline M3D_VECTOR M3D_V4Round(M3D_VECTOR V) noexcept {
         M3D_Internal::round_to_nearest(V.v4f[3])
     } } };
     return Result.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-#else
+/*
+#elif defined(SSE_INTRINSICS)
     __m128 sign = _mm_and_ps(V, M3D_MNegativeZero);
     __m128 sMagic = _mm_or_ps(M3D_MNoFraction, sign);
     __m128 R1 = _mm_add_ps(V, sMagic);
@@ -706,6 +713,7 @@ inline M3D_VECTOR M3D_V4Round(M3D_VECTOR V) noexcept {
     R1 = _mm_and_ps(R1, mask);
     M3D_VECTOR vResult = _mm_xor_ps(R1, R2);
     return vResult;
+*/
 #endif
 }
 
@@ -827,8 +835,9 @@ inline M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
     Result.f[2] =
    Result.f[3] = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1] + V1.v4f[2] * V2.v4f[2] + V1.v4f[3] * V2.v4f[3];
     return Result.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     return _mm_dp_ps(V1, V2, 0xff);
+/*
 #elif defined(SSE3_INTRINSICS)
     M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
     vTemp = _mm_hadd_ps(vTemp, vTemp);
@@ -841,6 +850,7 @@ inline M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
     vTemp = _mm_shuffle_ps(vTemp, vTemp2, _MM_SHUFFLE(0, 3, 0, 0)); // Copy W to the Z position
     vTemp = _mm_add_ps(vTemp, vTemp2); // Add Z and W together
     return M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(2, 2, 2, 2)); // Splat Z and return
+*/
 #endif
 }
 
@@ -856,16 +866,17 @@ inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept {
     Result = M3D_V4Sqrt(Result);
     return Result;
 
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0xff);
     return _mm_sqrt_ps(vTemp);
+/*
 #elif defined(SSE3_INTRINSICS)
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
     vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
     vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
     vLengthSq = _mm_sqrt_ps(vLengthSq);
     return vLengthSq;
-#else
+#elif defined(SSE_INTRINSICS)
     // Perform the dot product on x,y,z and w
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
     // vTemp has z and w
@@ -883,6 +894,7 @@ inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept {
     // Get the length
     vLengthSq = _mm_sqrt_ps(vLengthSq);
     return vLengthSq;
+*/
 #endif
 }
 
@@ -1066,14 +1078,15 @@ inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
     vResult.f[2] =
     vResult.f[3] = fValue;
     return vResult.v;
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     return _mm_dp_ps(V1, V2, 0x7f);
+/*
 #elif defined(SSE3_INTRINSICS)
     M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
     vTemp = _mm_and_ps(vTemp, g_XMMask3);
     vTemp = _mm_hadd_ps(vTemp, vTemp);
     return _mm_hadd_ps(vTemp, vTemp);
-#else
+#elif defined(SSE_INTRINSICS)
     // Perform the dot product
     M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
     // x=Dot.v4f[1], y=Dot.v4f[2]
@@ -1086,6 +1099,7 @@ inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
     vDot = _mm_add_ss(vDot, vTemp);
     // Splat x
     return M3D_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
+*/
 #endif
 }
 
@@ -1130,9 +1144,10 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
     Result = M3D_V4Sqrt(Result);
     return Result;
 
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
     return _mm_sqrt_ps(vTemp);
+/*
 #elif defined(SSE3_INTRINSICS)
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
     vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
@@ -1140,7 +1155,7 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
     vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
     vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
     vLengthSq = _mm_sqrt_ps(vLengthSq);
     return vLengthSq;
-#else
+#elif defined(SSE_INTRINSICS)
     // Perform the dot product on x,y and z
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
     // vTemp has z and y
@@ -1156,6 +1171,7 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
     // Get the length
     vLengthSq = _mm_sqrt_ps(vLengthSq);
     return vLengthSq;
+*/
 #endif
 }
 
@@ -1174,8 +1190,7 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
     vResult.v4f[2] = V.v4f[2] * fLength;
     vResult.v4f[3] = V.v4f[3] * fLength;
     return vResult;
-
-#elif defined(SSE4_INTRINSICS)
+#else // SSE4_INTRINSICS
     M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
     // Prepare for the division
     M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
@@ -1185,16 +1200,17 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
     vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
     // Failsafe on zero (Or epsilon) length planes
     // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
     // Divide to perform the normalization
     vResult = _mm_div_ps(V, vResult);
     // Any that are infinity, set to zero
     vResult = _mm_and_ps(vResult, vZeroMask);
     // Select qnan or result based on infinite length
-    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
     M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
     vResult = _mm_or_ps(vTemp1, vTemp2);
     return vResult;
+/*
 #elif defined(SSE3_INTRINSICS)
     // Perform the dot product on x,y and z only
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
@@ -1209,17 +1225,17 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
     vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
     // Failsafe on zero (Or epsilon) length planes
     // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
     // Divide to perform the normalization
     vResult = _mm_div_ps(V, vResult);
     // Any that are infinity, set to zero
     vResult = _mm_and_ps(vResult, vZeroMask);
     // Select qnan or result based on infinite length
-    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
     M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
     vResult = _mm_or_ps(vTemp1, vTemp2);
     return vResult;
-#else
+#elif defined(SSE_INTRINSICS)
     // Perform the dot product on x,y and z only
     M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
     M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
@@ -1245,6 +1261,182 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
     M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
     vResult = _mm_or_ps(vTemp1, vTemp2);
     return vResult;
+*/
+#endif
+}
+
+inline M3D_VECTOR M3D_V2Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
+#ifdef DISABLE_INTRINSICS
+    M3D_V4F32 Result;
+    Result.f[0] =
+    Result.f[1] =
+    Result.f[2] =
+    Result.f[3] = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1];
+    return Result.v;
+#else // SSE4_INTRINSICS
+    return _mm_dp_ps(V1, V2, 0x3f);
+/*
+#elif defined(SSE3_INTRINSICS)
+    M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
+    vDot = _mm_hadd_ps(vDot, vDot);
+    vDot = _mm_moveldup_ps(vDot);
+    return vDot;
+#elif defined(SSE_INTRINSICS)
+    // Perform the dot product on x and y
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V1, V2);
+    // vTemp has y splatted
+    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    return vLengthSq;
+*/
+#endif
+}
+
+inline M3D_VECTOR M3D_V2Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
+    // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ]
+#ifdef DISABLE_INTRINSICS
+    float fCross = (V1.v4f[0] * V2.v4f[1]) - (V1.v4f[1] * V2.v4f[0]);
+    M3D_V4F32 vResult;
+    vResult.f[0] =
+    vResult.f[1] =
+    vResult.f[2] =
+    vResult.f[3] = fCross;
+    return vResult.v;
+#else
+    // Swap x and y
+    M3D_VECTOR vResult = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 1, 0, 1));
+    // Perform the muls
+    vResult = _mm_mul_ps(vResult, V1);
+    // Splat y
+    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(1, 1, 1, 1));
+    // Sub the values
+    vResult = _mm_sub_ss(vResult, vTemp);
+    // Splat the cross product
+    vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 0, 0, 0));
+    return vResult;
+#endif
+}
+
+inline M3D_VECTOR M3D_V2LengthSq(M3D_VECTOR V) noexcept {
+    return M3D_V2Dot(V, V);
+}
+
+inline M3D_VECTOR M3D_V2Length(M3D_VECTOR V) noexcept {
+#ifdef DISABLE_INTRINSICS
+    M3D_VECTOR Result;
+    Result = M3D_V2LengthSq(V);
+    Result = M3D_V4Sqrt(Result);
+    return Result;
+#else // SSE4_INTRINSICS
+    M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0x3f);
+    return _mm_sqrt_ps(vTemp);
+/*
+#elif defined(SSE3_INTRINSICS)
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    M3D_VECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_sqrt_ss(vTemp);
+    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    return vLengthSq;
+#elif defined(SSE_INTRINSICS)
+    // Perform the dot product on x and y
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    // vTemp has y splatted
+    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    vLengthSq = _mm_sqrt_ps(vLengthSq);
+    return vLengthSq;
+*/
+#endif
+}
+
+inline M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept {
+#ifdef DISABLE_INTRINSICS
+    M3D_VECTOR vResult = M3D_V2Length(V);
+    float fLength = vResult.v4f[0];
+
+    // Prevent divide by zero
+    if (fLength > 0) {
+        fLength = 1.0f / fLength;
+    }
+
+    vResult.v4f[0] = V.v4f[0] * fLength;
+    vResult.v4f[1] = V.v4f[1] * fLength;
+    vResult.v4f[2] = V.v4f[2] * fLength;
+    vResult.v4f[3] = V.v4f[3] * fLength;
+    return vResult;
+#else // SSE4_INTRINSICS
+    M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
+    // Prepare for the division
+    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    M3D_VECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
+    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+/*
+#elif defined(SSE3_INTRINSICS)
+    // Perform the dot product on x and y only
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_moveldup_ps(vLengthSq);
+    // Prepare for the division
+    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    M3D_VECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
+    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+#elif defined(SSE_INTRINSICS)
+    // Perform the dot product on x and y only
+    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
+    M3D_VECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    // Prepare for the division
+    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    M3D_VECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
+    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+*/
 #endif
 }
 
@@ -1317,7 +1509,7 @@ inline M3D_VECTOR M3D_TNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexc
     M3D_VECTOR L1 = M3D_V4Subtract(P2, P1);
     M3D_VECTOR L2 = M3D_V4Subtract(P3, P1);
 
-    return M3D_V3Normalize(M3D_V3Cross(L2, L1));
+    return M3D_V2Normalize(M3D_V2Cross(L2, L1));
 }
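With the SSE3/SSE fallbacks commented out above, every active #else branch now assumes SSE4.1 (_mm_dp_ps, _mm_insert_ps, _mm_extract_ps, _mm_round_ps), and M3D_TNormal reduces to the normalized 2D cross product of two edges, so the value Draw() reads back through M3D_V4GetX is effectively just the sign of the screen-space winding rather than a 3D surface normal. The _mm_dp_ps immediate packs two nibbles: the high four bits select which input lanes enter the dot product and the low four select which output lanes receive it, so 0xff, 0x7f and 0x3f produce the 4-, 3- and 2-component dots used in this file. A small standalone check of that encoding (not engine code; requires SSE4.1, e.g. compile with -msse4.1):

#include <smmintrin.h> // SSE4.1: _mm_dp_ps
#include <cstdio>

int main() {
    __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f); // lanes: x=1, y=2, z=3, w=4
    __m128 b = _mm_set_ps(8.f, 7.f, 6.f, 5.f); // lanes: x=5, y=6, z=7, w=8

    float d4[4], d3[4], d2[4];
    _mm_storeu_ps(d4, _mm_dp_ps(a, b, 0xff)); // x,y,z,w terms -> 70 in every lane
    _mm_storeu_ps(d3, _mm_dp_ps(a, b, 0x7f)); // x,y,z terms   -> 38 in every lane
    _mm_storeu_ps(d2, _mm_dp_ps(a, b, 0x3f)); // x,y terms     -> 17 in every lane

    std::printf("4D %.0f  3D %.0f  2D %.0f\n", d4[0], d3[0], d2[0]);
    return 0;
}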