Improved clipping and fix culling normal computation

This commit is contained in:
JackCarterSmith 2024-11-03 14:12:36 +01:00
parent f1a1c2199f
commit debc5b219b
Signed by: JackCarterSmith
GPG Key ID: 832E52F4E23F8F24
3 changed files with 251 additions and 55 deletions

View File

@ -25,7 +25,7 @@
// * https://en.wikipedia.org/wiki/Hidden-surface_determination#Occlusion_culling
// * https://en.wikipedia.org/wiki/Bounding_volume_hierarchy
static bool VertexIsInsideClipSpace(M3D_F4& V);
static bool VertexClipTest(M3D_F4& V, sf::Vector2f& RTsize, float gb_factor);
Graphic3DRenderer::Graphic3DRenderer() {
mRTSize = {1280.f, 324.f};
@ -140,19 +140,16 @@ void Graphic3DRenderer::Draw(sf::RenderTexture& context) {
break;
// Triangle clipping
//TODO: implement complete Cohen-Sutherland algo or similar
if (VertexIsInsideClipSpace(projVertices[indicePtr[i]]) &&
VertexIsInsideClipSpace(projVertices[indicePtr[i+1]]) &&
VertexIsInsideClipSpace(projVertices[indicePtr[i+2]]))
//TODO: scissor/clipping depending on how many vertices are outside/inside the clip space; implement the complete Cohen-Sutherland algorithm or similar
if (VertexClipTest(projVertices[indicePtr[i]], mRTSize, 2.5f) &&
VertexClipTest(projVertices[indicePtr[i+1]], mRTSize, 2.5f) &&
VertexClipTest(projVertices[indicePtr[i+2]], mRTSize, 2.5f))
{
M3D_VECTOR V1 = M3D_V4LoadF4(&projVertices[indicePtr[i]]);
M3D_VECTOR V2 = M3D_V4LoadF4(&projVertices[indicePtr[i+1]]);
M3D_VECTOR V3 = M3D_V4LoadF4(&projVertices[indicePtr[i+2]]);
// Face culling
M3D_VECTOR faceNormal = M3D_TNormal(V1,V2,V3);
if (M3D_V4GetX(M3D_V3Dot(V1, faceNormal)) >= 0) {
// Do the perspective divide
V1 = M3D_V4Divide(V1, M3D_V4SplatW(V1));
V2 = M3D_V4Divide(V2, M3D_V4SplatW(V2));
@ -162,6 +159,8 @@ void Graphic3DRenderer::Draw(sf::RenderTexture& context) {
V2 = M3D_V3TransformNDCToViewport(V2, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
V3 = M3D_V3TransformNDCToViewport(V3, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
// Face culling
if (M3D_V4GetX(M3D_TNormal(V1,V2,V3))*0.5f <= 0) {
if (objInFrustrum == DISJOINT) {
v_tri[0].color = sf::Color::Red;
v_tri[1].color = sf::Color::Red;
@ -205,9 +204,9 @@ void Graphic3DRenderer::UpdateInternalTestObjects() {
mRenderList[3]->SetRotation(0.f, thetaAngle, 0.f);
}
inline static bool VertexIsInsideClipSpace(M3D_F4& V) {
return (V.x > -V.w && V.x < V.w &&
V.y > -V.w && V.y < V.w &&
V.z > 0 && V.z < V.w
// Guard-band test for one homogeneous (pre perspective-divide) vertex.
//
// V         : vertex to test; only x, y and w are read.
// RTsize    : render target size, x being the width and y the height.
// gb_factor : guard-band multiplier applied on top of the render target size.
//
// Returns true when V.x lies strictly inside +/- RTsize.x*gb_factor*V.w and
// V.y lies strictly inside +/- RTsize.y*gb_factor*V.w. Each axis is bounded by
// its own extent on both sides (the previous version mixed the width into the
// lower bound and the height into the upper bound of both axes).
// Because both comparisons are strict, the test fails for every vertex whose
// bound is not positive, i.e. V.w <= 0 when RTsize and gb_factor are positive.
//
// NOTE(review): clip-space extents are normally +/- w, so scaling them by the
// pixel size of the render target makes this band very wide - confirm intended.
inline static bool VertexClipTest(M3D_F4& V, sf::Vector2f& RTsize, float gb_factor) {
    // Guard bands are usually 2-3x the viewport size for the clipping test
    const float limitX = RTsize.x * gb_factor * V.w;
    const float limitY = RTsize.y * gb_factor * V.w;

    return (V.x > -limitX && V.x < limitX &&
            V.y > -limitY && V.y < limitY
    );
}

View File

@ -419,6 +419,11 @@ M3D_VECTOR M3D_V3Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
M3D_VECTOR M3D_V3LengthSq(M3D_VECTOR V) noexcept;
M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept;
M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept;
M3D_VECTOR M3D_V2Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
M3D_VECTOR M3D_V2Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
M3D_VECTOR M3D_V2LengthSq(M3D_VECTOR V) noexcept;
M3D_VECTOR M3D_V2Length(M3D_VECTOR V) noexcept;
M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept;
#ifndef DISABLE_INTRINSICS

View File

@ -65,16 +65,16 @@ inline M3D_VECTOR M3D_V4LoadF3(const M3D_F3* src) noexcept {
V.v4f[2] = src->z;
V.v4f[3] = 0.f;
return V;
/*
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
__m128 z = _mm_load_ss(&src->z);
return _mm_insert_ps(xy, z, 0x20);
*/
#else
/*
#elif defined(SSE_INTRINSICS)
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
__m128 z = _mm_load_ss(&src->z);
return _mm_movelh_ps(xy, z);
*/
#endif
}
@ -97,16 +97,16 @@ inline void M3D_V4StoreF3(M3D_F3* dst, M3D_VECTOR V) noexcept {
dst->x = V.v4f[0];
dst->y = V.v4f[1];
dst->z = V.v4f[2];
/*
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
*reinterpret_cast<int*>(&dst->x) = _mm_extract_ps(V, 0);
*reinterpret_cast<int*>(&dst->y) = _mm_extract_ps(V, 1);
*reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
*/
#else
/*
#elif defined(SSE_INTRINSICS)
_mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
__m128 z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
_mm_store_ss(&dst->z, z);
*/
#endif
}
@ -115,15 +115,15 @@ inline void M3D_V4StoreF3A(M3D_F3A* dst, M3D_VECTOR V) noexcept {
dst->x = V.v4f[0];
dst->y = V.v4f[1];
dst->z = V.v4f[2];
/*
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
_mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
*reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
*/
#else
/*
#elif defined(SSE_INTRINSICS)
_mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
__m128 z = _mm_movehl_ps(V, V);
_mm_store_ss(&dst->z, z);
*/
#endif
}
@ -434,11 +434,12 @@ inline M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept {
V.v4f[3]
}}};
return U.v;
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
M3D_VECTOR vResult = _mm_set_ss(y);
vResult = _mm_insert_ps(V, vResult, 0x10);
return vResult;
#else
/*
#elif defined(SSE_INTRINSICS)
// Swap y and x
M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
// Convert input to vector
@ -448,6 +449,7 @@ inline M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept {
// Swap y and x again
vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
return vResult;
*/
#endif
}
@ -460,11 +462,12 @@ inline M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept {
V.v4f[3]
}}};
return U.v;
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
M3D_VECTOR vResult = _mm_set_ss(z);
vResult = _mm_insert_ps(V, vResult, 0x20);
return vResult;
#else
/*
#elif defined(SSE_INTRINSICS)
// Swap z and x
M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
// Convert input to vector
@ -474,6 +477,7 @@ inline M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept {
// Swap z and x again
vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
return vResult;
*/
#endif
}
@ -486,11 +490,12 @@ inline M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept {
w
}}};
return U.v;
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
M3D_VECTOR vResult = _mm_set_ss(w);
vResult = _mm_insert_ps(V, vResult, 0x30);
return vResult;
#else
/*
#elif defined(SSE_INTRINSICS)
// Swap w and x
M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
// Convert input to vector
@ -500,6 +505,7 @@ inline M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept {
// Swap w and x again
vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
return vResult;
*/
#endif
}
@ -693,9 +699,10 @@ inline M3D_VECTOR M3D_V4Round(M3D_VECTOR V) noexcept {
M3D_Internal::round_to_nearest(V.v4f[3])
} } };
return Result.v;
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
/*
#elif defined(SSE_INTRINSICS)
__m128 sign = _mm_and_ps(V, M3D_MNegativeZero);
__m128 sMagic = _mm_or_ps(M3D_MNoFraction, sign);
__m128 R1 = _mm_add_ps(V, sMagic);
@ -706,6 +713,7 @@ inline M3D_VECTOR M3D_V4Round(M3D_VECTOR V) noexcept {
R1 = _mm_and_ps(R1, mask);
M3D_VECTOR vResult = _mm_xor_ps(R1, R2);
return vResult;
*/
#endif
}
@ -827,8 +835,9 @@ inline M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
Result.f[2] =
Result.f[3] = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1] + V1.v4f[2] * V2.v4f[2] + V1.v4f[3] * V2.v4f[3];
return Result.v;
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
return _mm_dp_ps(V1, V2, 0xff);
/*
#elif defined(SSE3_INTRINSICS)
M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
vTemp = _mm_hadd_ps(vTemp, vTemp);
@ -841,6 +850,7 @@ inline M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
vTemp = _mm_shuffle_ps(vTemp, vTemp2, _MM_SHUFFLE(0, 3, 0, 0)); // Copy W to the Z position
vTemp = _mm_add_ps(vTemp, vTemp2); // Add Z and W together
return M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(2, 2, 2, 2)); // Splat Z and return
*/
#endif
}
@ -856,16 +866,17 @@ inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept {
Result = M3D_V4Sqrt(Result);
return Result;
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0xff);
return _mm_sqrt_ps(vTemp);
/*
#elif defined(SSE3_INTRINSICS)
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
vLengthSq = _mm_sqrt_ps(vLengthSq);
return vLengthSq;
#else
#elif defined(SSE_INTRINSICS)
// Perform the dot product on x,y,z and w
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
// vTemp has z and w
@ -883,6 +894,7 @@ inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept {
// Get the length
vLengthSq = _mm_sqrt_ps(vLengthSq);
return vLengthSq;
*/
#endif
}
@ -1066,14 +1078,15 @@ inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
vResult.f[2] =
vResult.f[3] = fValue;
return vResult.v;
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
return _mm_dp_ps(V1, V2, 0x7f);
/*
#elif defined(SSE3_INTRINSICS)
M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
vTemp = _mm_and_ps(vTemp, g_XMMask3);
vTemp = _mm_hadd_ps(vTemp, vTemp);
return _mm_hadd_ps(vTemp, vTemp);
#else
#elif defined(SSE_INTRINSICS)
// Perform the dot product
M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
// x=Dot.v4f[1], y=Dot.v4f[2]
@ -1086,6 +1099,7 @@ inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
vDot = _mm_add_ss(vDot, vTemp);
// Splat x
return M3D_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
*/
#endif
}
@ -1130,9 +1144,10 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
Result = M3D_V4Sqrt(Result);
return Result;
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
return _mm_sqrt_ps(vTemp);
/*
#elif defined(SSE3_INTRINSICS)
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
@ -1140,7 +1155,7 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
vLengthSq = _mm_sqrt_ps(vLengthSq);
return vLengthSq;
#else
#elif defined(SSE_INTRINSICS)
// Perform the dot product on x,y and z
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
// vTemp has z and y
@ -1156,6 +1171,7 @@ inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
// Get the length
vLengthSq = _mm_sqrt_ps(vLengthSq);
return vLengthSq;
*/
#endif
}
@ -1174,8 +1190,7 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
vResult.v4f[2] = V.v4f[2] * fLength;
vResult.v4f[3] = V.v4f[3] * fLength;
return vResult;
#elif defined(SSE4_INTRINSICS)
#else // SSE4_INTRINSICS
M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
// Prepare for the division
M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
@ -1185,16 +1200,17 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
vResult = _mm_or_ps(vTemp1, vTemp2);
return vResult;
/*
#elif defined(SSE3_INTRINSICS)
// Perform the dot product on x,y and z only
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
@ -1209,17 +1225,17 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
vResult = _mm_or_ps(vTemp1, vTemp2);
return vResult;
#else
#elif defined(SSE_INTRINSICS)
// Perform the dot product on x,y and z only
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
@ -1245,6 +1261,182 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
vResult = _mm_or_ps(vTemp1, vTemp2);
return vResult;
*/
#endif
}
// Dot product of the x and y components of V1 and V2.
// The scalar result is replicated into all four lanes of the returned vector;
// the z and w components of both inputs do not contribute.
inline M3D_VECTOR M3D_V2Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    const float fDot = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1];

    M3D_V4F32 vOut;
    for (int lane = 0; lane < 4; ++lane) {
        vOut.f[lane] = fDot;
    }
    return vOut.v;
#else // SSE4_INTRINSICS
    // Mask 0x3f: high nibble 0x3 multiplies lanes x and y only,
    // low nibble 0xf broadcasts the sum to every output lane.
    return _mm_dp_ps(V1, V2, 0x3f);
/*
#elif defined(SSE3_INTRINSICS)
    M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
    vDot = _mm_hadd_ps(vDot, vDot);
    vDot = _mm_moveldup_ps(vDot);
    return vDot;
#elif defined(SSE_INTRINSICS)
    // Perform the dot product on x and y
    M3D_VECTOR vLengthSq = _mm_mul_ps(V1, V2);
    // vTemp has y splatted
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    return vLengthSq;
*/
#endif
}
// 2D cross product of V1 and V2, using only their x and y components.
// The scalar V1.x*V2.y - V1.y*V2.x is replicated into all four lanes of the
// returned vector.
inline M3D_VECTOR M3D_V2Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
    // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ]
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 vOut;
    vOut.f[0] = (V1.v4f[0] * V2.v4f[1]) - (V1.v4f[1] * V2.v4f[0]);
    vOut.f[1] = vOut.f[0];
    vOut.f[2] = vOut.f[0];
    vOut.f[3] = vOut.f[0];
    return vOut.v;
#else
    // V2 with x and y exchanged: (V2.y, V2.x, V2.y, V2.x)
    // (assumes M3D_PERMUTE_PS shuffles a single register - TODO confirm against its definition)
    const M3D_VECTOR vSwapped = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 1, 0, 1));
    // Lane 0 = V1.x*V2.y, lane 1 = V1.y*V2.x
    const M3D_VECTOR vProducts = _mm_mul_ps(vSwapped, V1);
    // Bring the second product down to lane 0 so it can be subtracted
    const M3D_VECTOR vSecond = M3D_PERMUTE_PS(vProducts, _MM_SHUFFLE(1, 1, 1, 1));
    // Lane 0 now holds the cross product
    const M3D_VECTOR vDiff = _mm_sub_ss(vProducts, vSecond);
    // Replicate lane 0 into every lane
    return M3D_PERMUTE_PS(vDiff, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}
// Squared length of the (x, y) part of V, i.e. M3D_V2Dot(V, V).
// The result is whatever M3D_V2Dot returns for the pair (V, V).
inline M3D_VECTOR M3D_V2LengthSq(M3D_VECTOR V) noexcept {
    const M3D_VECTOR vLenSq = M3D_V2Dot(V, V);
    return vLenSq;
}
// Length of the (x, y) part of V: the square root of x*x + y*y.
// The scalar path defers to M3D_V2LengthSq and M3D_V4Sqrt; the intrinsic path
// broadcasts the dot product to all lanes before taking the square root.
inline M3D_VECTOR M3D_V2Length(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return M3D_V4Sqrt(M3D_V2LengthSq(V));
#else // SSE4_INTRINSICS
    // 0x3f: sum the x and y products only, write the sum to every lane
    return _mm_sqrt_ps(_mm_dp_ps(V, V, 0x3f));
/*
#elif defined(SSE3_INTRINSICS)
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    M3D_VECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_sqrt_ss(vTemp);
    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    return vLengthSq;
#elif defined(SSE_INTRINSICS)
    // Perform the dot product on x and y
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    // vTemp has y splatted
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
*/
#endif
}
// Scales V by the inverse of the length of its (x, y) part.
// All four components of V are scaled, not only x and y.
//
// Scalar path: when the length is not greater than zero the components are
// multiplied by the length itself instead of its reciprocal.
// Intrinsic path: lanes are forced to zero when the length compares equal to
// zero, and replaced by M3D_MQNaN when the squared length compares equal to
// M3D_MInfinity (presumably the +infinity / quiet-NaN constants - confirm
// against their definitions).
inline M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    const float fLen = M3D_V2Length(V).v4f[0];
    // Prevent divide by zero: only invert a strictly positive length
    const float fScale = (fLen > 0) ? (1.0f / fLen) : fLen;

    M3D_VECTOR vOut;
    for (int lane = 0; lane < 4; ++lane) {
        vOut.v4f[lane] = V.v4f[lane] * fScale;
    }
    return vOut;
#else // SSE4_INTRINSICS
    // x*x + y*y broadcast to every lane
    const M3D_VECTOR vLenSq = _mm_dp_ps(V, V, 0x3f);
    // Divisor for the normalization
    const M3D_VECTOR vLen = _mm_sqrt_ps(vLenSq);
    // All-ones where the length differs from zero (FP compare so -0.0 is caught too)
    const M3D_VECTOR vNonZero = _mm_cmpneq_ps(_mm_setzero_ps(), vLen);
    // All-ones where the squared length is not infinite
    const M3D_VECTOR vFinite = _mm_cmpneq_ps(vLenSq, M3D_MInfinity);
    // Divide to perform the normalization, then zero the lanes of a zero-length input
    const M3D_VECTOR vUnit = _mm_and_ps(_mm_div_ps(V, vLen), vNonZero);
    // Select qnan or result based on infinite length
    const M3D_VECTOR vNaNPart = _mm_andnot_ps(vFinite, M3D_MQNaN);
    const M3D_VECTOR vValuePart = _mm_and_ps(vUnit, vFinite);
    return _mm_or_ps(vNaNPart, vValuePart);
/*
#elif defined(SSE3_INTRINSICS)
    // Perform the dot product on x and y only
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_moveldup_ps(vLengthSq);
    // Prepare for the division
    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    M3D_VECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#elif defined(SSE_INTRINSICS)
    // Perform the dot product on x and y only
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    // Prepare for the division
    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    M3D_VECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
*/
#endif
}
@ -1317,7 +1509,7 @@ inline M3D_VECTOR M3D_TNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexc
M3D_VECTOR L1 = M3D_V4Subtract(P2, P1);
M3D_VECTOR L2 = M3D_V4Subtract(P3, P1);
return M3D_V3Normalize(M3D_V3Cross(L2, L1));
return M3D_V2Normalize(M3D_V2Cross(L2, L1));
}