/*
 * ProtoTank/Engine/Utils/3DMaths_vec.inl
 * Commit 098345409f (JackCarterSmith): Clipping and backface culling
 * "Culling isn't bad, clipping need a better approach to cut partial triangles."
 * 2024-11-02 00:43:44 +01:00  (1403 lines, 41 KiB, C++)
 */
#pragma once
// Computes sin(Value) and cos(Value) simultaneously, writing the results
// through pSin/pCos (both pointers must be non-null). Uses range reduction
// to [-pi, pi] then to [-pi/2, pi/2], followed by minimax polynomial
// approximations (degree 11 for sine, degree 10 for cosine).
inline void M3D_ScalarSinCos(float* pSin, float* pCos, float Value) noexcept {
// Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
float quotient = M3D_1DIV2PI * Value;
// Round to the nearest integer by biasing +/-0.5 before truncation.
if (Value >= 0.0f)
quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
else
quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
float y = Value - M3D_2PI * quotient;
// Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
// The reflection preserves sin but flips the sign of cos, so only the
// cosine result is multiplied by `sign` below.
float sign;
if (y > M3D_PIDIV2) {
y = M3D_PI - y;
sign = -1.0f;
} else if (y < -M3D_PIDIV2) {
y = -M3D_PI - y;
sign = -1.0f;
} else {
sign = +1.0f;
}
float y2 = y * y;
// 11-degree minimax approximation
*pSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
// 10-degree minimax approximation
float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f;
*pCos = sign * p;
}
namespace M3D_Internal {
#ifdef DISABLE_INTRINSICS
// Round to nearest (even) a.k.a. banker's rounding
// Scalar fallback used by M3D_V4Round when intrinsics are disabled. Halfway
// cases go to the nearest even integer, matching the behavior of the SSE4
// _mm_round_ps(_MM_FROUND_TO_NEAREST_INT) path.
inline float round_to_nearest(float x) noexcept {
float i = floorf(x);
x -= i;
if (x < 0.5f)
return i;
if (x > 0.5f)
return i + 1.f;
// Exactly halfway: return i when i is even, otherwise i + 1.
float int_part;
(void)modff(i / 2.f, &int_part);
if ((2.f * int_part) == i)
return i;
return i + 1.f;
}
#endif
}
/* -------------------------------------------------------------------------------------------------------------------------- */
// Loads an (unaligned) 3-float structure into a vector. The scalar path sets
// w to 0; the SSE path also yields w == 0 because _mm_load_ss zeroes the
// upper lanes before _mm_movelh_ps combines the halves.
inline M3D_VECTOR M3D_V4LoadF3(const M3D_F3* src) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_VECTOR V;
V.v4f[0] = src->x;
V.v4f[1] = src->y;
V.v4f[2] = src->z;
V.v4f[3] = 0.f;
return V;
/*
#elif defined(SSE4_INTRINSICS)
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
__m128 z = _mm_load_ss(&src->z);
return _mm_insert_ps(xy, z, 0x20);
*/
#else
// Read x,y as one 64-bit load and z separately, then merge to (x, y, z, 0).
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
__m128 z = _mm_load_ss(&src->z);
return _mm_movelh_ps(xy, z);
#endif
}
// Loads an aligned 3-float structure into a vector; w comes back as zero.
inline M3D_VECTOR M3D_V4LoadF3A(const M3D_F3A* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR out;
    out.v4f[0] = src->x;
    out.v4f[1] = src->y;
    out.v4f[2] = src->z;
    out.v4f[3] = 0.f;
    return out;
#else
    // The aligned 16-byte load reads one float past z (which is zero'd);
    // mask it away so the w lane is guaranteed to be 0.
    __m128 raw = _mm_load_ps(&src->x);
    return _mm_and_ps(raw, M3D_MMask3);
#endif
}
// Stores the x/y/z lanes of V to an (unaligned) 3-float structure; w is ignored.
inline void M3D_V4StoreF3(M3D_F3* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
dst->x = V.v4f[0];
dst->y = V.v4f[1];
dst->z = V.v4f[2];
/*
#elif defined(SSE4_INTRINSICS)
*reinterpret_cast<int*>(&dst->x) = _mm_extract_ps(V, 0);
*reinterpret_cast<int*>(&dst->y) = _mm_extract_ps(V, 1);
*reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
*/
#else
// Write x,y as one 64-bit store, then splat z into lane 0 and store it alone.
_mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
__m128 z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
_mm_store_ss(&dst->z, z);
#endif
}
// Stores the x/y/z lanes of V to an aligned 3-float structure; w is ignored.
inline void M3D_V4StoreF3A(M3D_F3A* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
dst->x = V.v4f[0];
dst->y = V.v4f[1];
dst->z = V.v4f[2];
/*
#elif defined(SSE4_INTRINSICS)
_mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
*reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
*/
#else
_mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
// movehl copies lane 2 into lane 0 so a scalar store can write z.
__m128 z = _mm_movehl_ps(V, V);
_mm_store_ss(&dst->z, z);
#endif
}
// Loads four floats from an unaligned M3D_F4 into a vector.
inline M3D_VECTOR M3D_V4LoadF4(const M3D_F4* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR out;
    out.v4f[0] = src->x;
    out.v4f[1] = src->y;
    out.v4f[2] = src->z;
    out.v4f[3] = src->w;
    return out;
#else
    return _mm_loadu_ps(&src->x);
#endif
}
// Loads four floats from a 16-byte-aligned M3D_F4A into a vector.
inline M3D_VECTOR M3D_V4LoadV4A(const M3D_F4A* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR out;
    out.v4f[0] = src->x;
    out.v4f[1] = src->y;
    out.v4f[2] = src->z;
    out.v4f[3] = src->w;
    return out;
#else
    return _mm_load_ps(&src->x);
#endif
}
// Stores all four lanes of V to an unaligned M3D_F4.
inline void M3D_V4StoreF4(M3D_F4* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
    dst->w = V.v4f[3];
#else
    _mm_storeu_ps(&dst->x, V);
#endif
}
// Stores all four lanes of V to a 16-byte-aligned M3D_F4A.
inline void M3D_V4StoreF4A(M3D_F4A* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
    dst->w = V.v4f[3];
#else
    _mm_store_ps(&dst->x, V);
#endif
}
// Loads a 4x4 float matrix from an unaligned M3D_F4X4, row by row.
INLINE_AVX_FIX M3D_MATRIX M3D_V4LoadF4x4(const M3D_F4X4* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX out;
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c)
            out.rows[r].v4f[c] = src->mat[r][c];
    }
    return out;
#else
    M3D_MATRIX out;
    out.rows[0] = _mm_loadu_ps(&src->_00);
    out.rows[1] = _mm_loadu_ps(&src->_10);
    out.rows[2] = _mm_loadu_ps(&src->_20);
    out.rows[3] = _mm_loadu_ps(&src->_30);
    return out;
#endif
}
// Loads a 4x4 float matrix from a 16-byte-aligned M3D_F4X4A, row by row.
INLINE_AVX_FIX M3D_MATRIX M3D_V4LoadF4x4A(const M3D_F4X4A* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX out;
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c)
            out.rows[r].v4f[c] = src->mat[r][c];
    }
    return out;
#else
    M3D_MATRIX out;
    out.rows[0] = _mm_load_ps(&src->_00);
    out.rows[1] = _mm_load_ps(&src->_10);
    out.rows[2] = _mm_load_ps(&src->_20);
    out.rows[3] = _mm_load_ps(&src->_30);
    return out;
#endif
}
// Stores matrix M to an unaligned M3D_F4X4, row by row.
INLINE_AVX_FIX void M3D_V4StoreF4x4(M3D_F4X4* dst, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c)
            dst->mat[r][c] = M.rows[r].v4f[c];
    }
#else
    _mm_storeu_ps(&dst->_00, M.rows[0]);
    _mm_storeu_ps(&dst->_10, M.rows[1]);
    _mm_storeu_ps(&dst->_20, M.rows[2]);
    _mm_storeu_ps(&dst->_30, M.rows[3]);
#endif
}
// Stores matrix M to a 16-byte-aligned M3D_F4X4A, row by row.
INLINE_AVX_FIX void M3D_V4StoreF4x4A(M3D_F4X4A* dst, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c)
            dst->mat[r][c] = M.rows[r].v4f[c];
    }
#else
    _mm_store_ps(&dst->_00, M.rows[0]);
    _mm_store_ps(&dst->_10, M.rows[1]);
    _mm_store_ps(&dst->_20, M.rows[2]);
    _mm_store_ps(&dst->_30, M.rows[3]);
#endif
}
/* -------------------------------------------------------------------------------------------------------------------------- */
// Returns a vector with all four lanes set to 0.0f.
inline M3D_VECTOR M3D_V4Zero() noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 zero = { { { 0.0f, 0.0f, 0.0f, 0.0f } } };
    return zero.v;
#else
    return _mm_setzero_ps();
#endif
}
// Builds a vector from four scalars (x in lane 0 ... w in lane 3).
inline M3D_VECTOR M3D_V4Set(float x, float y, float z, float w) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    out.f[0] = x;
    out.f[1] = y;
    out.f[2] = z;
    out.f[3] = w;
    return out.v;
#else
    // _mm_set_ps takes its arguments from the high lane down to the low lane.
    return _mm_set_ps(w, z, y, x);
#endif
}
// Component-wise negation of V (computed as 0 - V).
inline M3D_VECTOR M3D_V4Negate(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    out.f[0] = -V.v4f[0];
    out.f[1] = -V.v4f[1];
    out.f[2] = -V.v4f[2];
    out.f[3] = -V.v4f[3];
    return out.v;
#else
    return _mm_sub_ps(_mm_setzero_ps(), V);
#endif
}
// Broadcasts a single scalar into all four lanes.
inline M3D_VECTOR M3D_V4Replicate(float val) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out = {{{ val, val, val, val }}};
    return out.v;
#else
    return _mm_set_ps1(val);
#endif
}
// Broadcasts the float pointed to by pValue into all four lanes.
inline M3D_VECTOR M3D_V4ReplicatePtr(const float* pValue) noexcept {
#ifdef DISABLE_INTRINSICS
    const float s = pValue[0];
    M3D_V4F32 out = {{{ s, s, s, s }}};
    return out.v;
#elif defined(AVX_INTRINSICS)
    return _mm_broadcast_ss(pValue);
#else
    return _mm_load_ps1(pValue);
#endif
}
// Returns a mask vector with every bit set (the "all true" comparison result).
inline M3D_VECTOR M3D_V4TrueInt() noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 out = {{{ 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU }}};
    return out.v;
#else
    return _mm_castsi128_ps(_mm_set1_epi32(-1));
#endif
}
// Extracts the x component (lane 0) as a scalar.
inline float M3D_V4GetX(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[0];
#else
    return _mm_cvtss_f32(V);
#endif
}
// Extracts the y component (lane 1) as a scalar.
inline float M3D_V4GetY(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[1];
#else
    // Splat lane 1 into lane 0, then read the low float.
    return _mm_cvtss_f32(M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)));
#endif
}
// Extracts the z component (lane 2) as a scalar.
inline float M3D_V4GetZ(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[2];
#else
    // Splat lane 2 into lane 0, then read the low float.
    return _mm_cvtss_f32(M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)));
#endif
}
// Extracts the w component (lane 3) as a scalar.
inline float M3D_V4GetW(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[3];
#else
    // Splat lane 3 into lane 0, then read the low float.
    return _mm_cvtss_f32(M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)));
#endif
}
// Returns V with its x lane replaced by the given scalar; y/z/w unchanged.
inline M3D_VECTOR M3D_V4SetX(M3D_VECTOR V, float x) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    out.f[0] = x;
    out.f[1] = V.v4f[1];
    out.f[2] = V.v4f[2];
    out.f[3] = V.v4f[3];
    return out.v;
#else
    M3D_VECTOR scalar = _mm_set_ss(x);
    return _mm_move_ss(V, scalar);
#endif
}
// Returns V with its y lane replaced by the given scalar; x/z/w unchanged.
inline M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_V4F32 U = {{{
V.v4f[0],
y,
V.v4f[2],
V.v4f[3]
}}};
return U.v;
#elif defined(SSE4_INTRINSICS)
M3D_VECTOR vResult = _mm_set_ss(y);
// insert_ps immediate 0x10 places source lane 0 into destination lane 1.
vResult = _mm_insert_ps(V, vResult, 0x10);
return vResult;
#else
// SSE2 fallback: no lane-insert instruction, so swap y into the x slot,
// overwrite it with _mm_move_ss, then swap back.
// Swap y and x
M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
// Convert input to vector
M3D_VECTOR vTemp = _mm_set_ss(y);
// Replace the x component
vResult = _mm_move_ss(vResult, vTemp);
// Swap y and x again
vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
return vResult;
#endif
}
// Returns V with its z lane replaced by the given scalar; x/y/w unchanged.
inline M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_V4F32 U = {{{
V.v4f[0],
V.v4f[1],
z,
V.v4f[3]
}}};
return U.v;
#elif defined(SSE4_INTRINSICS)
M3D_VECTOR vResult = _mm_set_ss(z);
// insert_ps immediate 0x20 places source lane 0 into destination lane 2.
vResult = _mm_insert_ps(V, vResult, 0x20);
return vResult;
#else
// SSE2 fallback: swap z into the x slot, overwrite it, then swap back.
// Swap z and x
M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
// Convert input to vector
M3D_VECTOR vTemp = _mm_set_ss(z);
// Replace the x component
vResult = _mm_move_ss(vResult, vTemp);
// Swap z and x again
vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
return vResult;
#endif
}
// Returns V with its w lane replaced by the given scalar; x/y/z unchanged.
inline M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_V4F32 U = {{{
V.v4f[0],
V.v4f[1],
V.v4f[2],
w
}}};
return U.v;
#elif defined(SSE4_INTRINSICS)
M3D_VECTOR vResult = _mm_set_ss(w);
// insert_ps immediate 0x30 places source lane 0 into destination lane 3.
vResult = _mm_insert_ps(V, vResult, 0x30);
return vResult;
#else
// SSE2 fallback: swap w into the x slot, overwrite it, then swap back.
// Swap w and x
M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
// Convert input to vector
M3D_VECTOR vTemp = _mm_set_ss(w);
// Replace the x component
vResult = _mm_move_ss(vResult, vTemp);
// Swap w and x again
vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
return vResult;
#endif
}
// Builds a vector by choosing one component per output lane from the pair
// (V1, V2): control values 0-3 select V1.x..V1.w, 4-7 select V2.x..V2.w.
inline M3D_VECTOR M3D_V4Permute(M3D_VECTOR V1, M3D_VECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW) noexcept {
#if defined(AVX_INTRINSICS) && !defined(DISABLE_INTRINSICS)
static const M3D_V4U32 three = {{{3, 3, 3, 3}}};
M3D_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
__m128i vControl = _mm_load_si128(reinterpret_cast<const __m128i*>(&elem[0]));
// Lanes whose control value is > 3 must be taken from V2.
__m128i vSelect = _mm_cmpgt_epi32(vControl, three);
// Reduce each control to the lane index within its source vector (index & 3).
vControl = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(vControl), three));
// Permute both sources, then blend using the V1/V2 selection mask.
__m128 shuffled1 = _mm_permutevar_ps(V1, vControl);
__m128 shuffled2 = _mm_permutevar_ps(V2, vControl);
__m128 masked1 = _mm_andnot_ps(_mm_castsi128_ps(vSelect), shuffled1);
__m128 masked2 = _mm_and_ps(_mm_castsi128_ps(vSelect), shuffled2);
return _mm_or_ps(masked1, masked2);
#else
// Portable fallback: view both vectors as arrays of 32-bit words and copy
// the selected word per output lane (vi = source vector, i = lane within it).
const uint32_t* aPtr[2];
aPtr[0] = reinterpret_cast<const uint32_t*>(&V1);
aPtr[1] = reinterpret_cast<const uint32_t*>(&V2);
M3D_VECTOR Result;
auto pWork = reinterpret_cast<uint32_t*>(&Result);
const uint32_t i0 = PermuteX & 3;
const uint32_t vi0 = PermuteX >> 2;
pWork[0] = aPtr[vi0][i0];
const uint32_t i1 = PermuteY & 3;
const uint32_t vi1 = PermuteY >> 2;
pWork[1] = aPtr[vi1][i1];
const uint32_t i2 = PermuteZ & 3;
const uint32_t vi2 = PermuteZ >> 2;
pWork[2] = aPtr[vi2][i2];
const uint32_t i3 = PermuteW & 3;
const uint32_t vi3 = PermuteW >> 2;
pWork[3] = aPtr[vi3][i3];
return Result;
#endif
}
// Reorders the lanes of a single vector: output lane i takes V's lane Ei
// (each index must be 0-3).
inline M3D_VECTOR M3D_V4Swizzle(M3D_VECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_V4F32 Result = {{{
V.v4f[E0],
V.v4f[E1],
V.v4f[E2],
V.v4f[E3]
}}};
return Result.v;
#elif defined(AVX_INTRINSICS)
// Variable permute driven by the runtime index vector.
unsigned int elem[4] = { E0, E1, E2, E3 };
__m128i vControl = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&elem[0]));
return _mm_permutevar_ps(V, vControl);
#else
// Portable fallback: copy the selected 32-bit word per output lane.
auto aPtr = reinterpret_cast<const uint32_t*>(&V);
M3D_VECTOR Result;
auto pWork = reinterpret_cast<uint32_t*>(&Result);
pWork[0] = aPtr[E0];
pWork[1] = aPtr[E1];
pWork[2] = aPtr[E2];
pWork[3] = aPtr[E3];
return Result;
#endif
}
// Returns a vector with every lane set to 1.0f.
inline M3D_VECTOR M3D_V4SplatOne() noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out = {{{ 1.0f, 1.0f, 1.0f, 1.0f }}};
    return out.v;
#else
    return M3D_MOne;
#endif
}
// Returns a vector with every lane set to +infinity (bit pattern 0x7F800000).
inline M3D_VECTOR M3D_V4SplatInfinity() noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 out = {{{ 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000 }}};
    return out.v;
#else
    return M3D_MInfinity;
#endif
}
// Broadcasts lane 0 (x) of V into all four lanes.
inline M3D_VECTOR M3D_V4SplatX(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out = {{{ V.v4f[0], V.v4f[0], V.v4f[0], V.v4f[0] }}};
    return out.v;
#elif defined(AVX2_INTRINSICS) && defined(FAVOR_INTEL)
    return _mm_broadcastss_ps(V);
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}
// Broadcasts lane 1 (y) of V into all four lanes.
inline M3D_VECTOR M3D_V4SplatY(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out = {{{ V.v4f[1], V.v4f[1], V.v4f[1], V.v4f[1] }}};
    return out.v;
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
#endif
}
// Broadcasts lane 2 (z) of V into all four lanes.
inline M3D_VECTOR M3D_V4SplatZ(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out = {{{ V.v4f[2], V.v4f[2], V.v4f[2], V.v4f[2] }}};
    return out.v;
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
#endif
}
// Broadcasts lane 3 (w) of V into all four lanes.
inline M3D_VECTOR M3D_V4SplatW(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out = {{{ V.v4f[3], V.v4f[3], V.v4f[3], V.v4f[3] }}};
    return out.v;
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
#endif
}
// Component-wise minimum of V1 and V2.
inline M3D_VECTOR M3D_V4Min(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = (V1.v4f[i] < V2.v4f[i]) ? V1.v4f[i] : V2.v4f[i];
    return out.v;
#else
    return _mm_min_ps(V1, V2);
#endif
}
// Component-wise maximum of V1 and V2.
inline M3D_VECTOR M3D_V4Max(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = (V1.v4f[i] > V2.v4f[i]) ? V1.v4f[i] : V2.v4f[i];
    return out.v;
#else
    return _mm_max_ps(V1, V2);
#endif
}
// Rounds each lane to the nearest integer, ties to even (banker's rounding).
inline M3D_VECTOR M3D_V4Round(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_V4F32 Result = { { {
M3D_Internal::round_to_nearest(V.v4f[0]),
M3D_Internal::round_to_nearest(V.v4f[1]),
M3D_Internal::round_to_nearest(V.v4f[2]),
M3D_Internal::round_to_nearest(V.v4f[3])
} } };
return Result.v;
#elif defined(SSE4_INTRINSICS)
return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
// SSE2 fallback: adding then subtracting the "no fraction" magic constant
// (with the input's sign OR'd in) forces rounding in the current FPU mode
// (round-to-nearest-even by default).
__m128 sign = _mm_and_ps(V, M3D_MNegativeZero);
__m128 sMagic = _mm_or_ps(M3D_MNoFraction, sign);
__m128 R1 = _mm_add_ps(V, sMagic);
R1 = _mm_sub_ps(R1, sMagic);
// Lanes with |V| > M3D_MNoFraction already have no fractional part and are
// passed through unchanged.
__m128 R2 = _mm_and_ps(V, M3D_MAbsMask);
__m128 mask = _mm_cmple_ps(R2, M3D_MNoFraction);
R2 = _mm_andnot_ps(mask, V);
R1 = _mm_and_ps(R1, mask);
// Per lane exactly one of R1/R2 is non-zero, so xor merges the two cases.
M3D_VECTOR vResult = _mm_xor_ps(R1, R2);
return vResult;
#endif
}
// Component-wise sum: V1 + V2.
inline M3D_VECTOR M3D_V4Add(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = V1.v4f[i] + V2.v4f[i];
    return out.v;
#else
    return _mm_add_ps(V1, V2);
#endif
}
// Component-wise difference: V1 - V2.
inline M3D_VECTOR M3D_V4Subtract(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = V1.v4f[i] - V2.v4f[i];
    return out.v;
#else
    return _mm_sub_ps(V1, V2);
#endif
}
// Component-wise product: V1 * V2.
inline M3D_VECTOR M3D_V4Multiply(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = V1.v4f[i] * V2.v4f[i];
    return out.v;
#else
    return _mm_mul_ps(V1, V2);
#endif
}
// Component-wise multiply-add: V1 * V2 + V3.
inline M3D_VECTOR M3D_V4MultiplyAdd(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECTOR V3) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = V1.v4f[i] * V2.v4f[i] + V3.v4f[i];
    return out.v;
#else
    return M3D_FMADD_PS(V1, V2, V3);
#endif
}
// Component-wise quotient: V1 / V2.
inline M3D_VECTOR M3D_V4Divide(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = V1.v4f[i] / V2.v4f[i];
    return out.v;
#else
    return _mm_div_ps(V1, V2);
#endif
}
// Component-wise negative multiply-subtract: V3 - (V1 * V2).
inline M3D_VECTOR M3D_V4NegativeMultiplySubtract(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECTOR V3) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_V4F32 Result = {{{
V3.v4f[0] - (V1.v4f[0] * V2.v4f[0]),
V3.v4f[1] - (V1.v4f[1] * V2.v4f[1]),
V3.v4f[2] - (V1.v4f[2] * V2.v4f[2]),
V3.v4f[3] - (V1.v4f[3] * V2.v4f[3])
}}};
// Return the raw vector member like every sibling helper does, instead of
// relying on an implicit M3D_V4F32 -> M3D_VECTOR conversion.
return Result.v;
#else
return M3D_FNMADD_PS(V1, V2, V3);
#endif
}
// True when all four lanes are bitwise-identical (integer compare).
inline bool M3D_V4EqualInt(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    return V1.v4u[0] == V2.v4u[0] && V1.v4u[1] == V2.v4u[1] &&
           V1.v4u[2] == V2.v4u[2] && V1.v4u[3] == V2.v4u[3];
#else
    __m128i eq = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    // movemask collects the sign bit of each lane; 0xf means all four matched.
    return _mm_movemask_ps(_mm_castsi128_ps(eq)) == 0xf;
#endif
}
// Component-wise absolute value.
inline M3D_VECTOR M3D_V4Abs(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = fabsf(V.v4f[i]);
    return out.v;
#else
    // max(0 - V, V): flips negative lanes, keeps positive ones.
    M3D_VECTOR neg = _mm_sub_ps(_mm_setzero_ps(), V);
    return _mm_max_ps(neg, V);
#endif
}
// 4D dot product of V1 and V2; the scalar result is replicated into all four
// lanes of the returned vector.
inline M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_V4F32 Result;
Result.f[0] =
Result.f[1] =
Result.f[2] =
Result.f[3] = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1] + V1.v4f[2] * V2.v4f[2] + V1.v4f[3] * V2.v4f[3];
return Result.v;
#elif defined(SSE4_INTRINSICS)
// 0xff: multiply all four lanes, broadcast the sum to all four lanes.
return _mm_dp_ps(V1, V2, 0xff);
#elif defined(SSE3_INTRINSICS)
// Two horizontal adds collapse the four products into every lane.
M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
vTemp = _mm_hadd_ps(vTemp, vTemp);
return _mm_hadd_ps(vTemp, vTemp);
#else
// SSE2 path: shuffle/add sequence that sums the four products into lane 2,
// then splats it.
M3D_VECTOR vTemp2 = V2;
M3D_VECTOR vTemp = _mm_mul_ps(V1, vTemp2);
vTemp2 = _mm_shuffle_ps(vTemp2, vTemp, _MM_SHUFFLE(1, 0, 0, 0)); // Copy X to the Z position and Y to the W position
vTemp2 = _mm_add_ps(vTemp2, vTemp); // Add Z = X+Z; W = Y+W;
vTemp = _mm_shuffle_ps(vTemp, vTemp2, _MM_SHUFFLE(0, 3, 0, 0)); // Copy W to the Z position
vTemp = _mm_add_ps(vTemp, vTemp2); // Add Z and W together
return M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(2, 2, 2, 2)); // Splat Z and return
#endif
}
// Squared 4D length of V (dot of V with itself), replicated into all lanes.
inline M3D_VECTOR M3D_V4LengthSq(M3D_VECTOR V) noexcept {
return M3D_V4Dot(V, V);
}
// 4D length (magnitude) of V, replicated into all four lanes.
inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_VECTOR Result;
Result = M3D_V4LengthSq(V);
Result = M3D_V4Sqrt(Result);
return Result;
#elif defined(SSE4_INTRINSICS)
M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0xff);
return _mm_sqrt_ps(vTemp);
#elif defined(SSE3_INTRINSICS)
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
vLengthSq = _mm_sqrt_ps(vLengthSq);
return vLengthSq;
#else
// Perform the dot product on x,y,z and w
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
// vTemp has z and w
M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2));
// x+z, y+w
vLengthSq = _mm_add_ps(vLengthSq, vTemp);
// x+z,x+z,x+z,y+w
vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0));
// ??,??,y+w,y+w
vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0));
// ??,??,x+z+y+w,??
vLengthSq = _mm_add_ps(vLengthSq, vTemp);
// Splat the length
vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
// Get the length
vLengthSq = _mm_sqrt_ps(vLengthSq);
return vLengthSq;
#endif
}
// Multiplies every lane of V by the scalar `scale`.
inline M3D_VECTOR M3D_V4Scale(M3D_VECTOR V, float scale) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = V.v4f[i] * scale;
    return out.v;
#else
    M3D_VECTOR factor = _mm_set_ps1(scale);
    return _mm_mul_ps(factor, V);
#endif
}
// Bitwise select: where a Control bit is set the result takes V2's bit,
// otherwise V1's (Control is typically a comparison mask).
inline M3D_VECTOR M3D_V4Select(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECTOR Control) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 out;
    for (int i = 0; i < 4; ++i)
        out.u[i] = (V1.v4u[i] & ~Control.v4u[i]) | (V2.v4u[i] & Control.v4u[i]);
    return out.v;
#else
    M3D_VECTOR fromV1 = _mm_andnot_ps(Control, V1);
    M3D_VECTOR fromV2 = _mm_and_ps(V2, Control);
    return _mm_or_ps(fromV1, fromV2);
#endif
}
// Interleaves the low halves of both vectors: (V1.x, V2.x, V1.y, V2.y).
inline M3D_VECTOR M3D_V4MergeXY(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 out;
    out.u[0] = V1.v4u[0];
    out.u[1] = V2.v4u[0];
    out.u[2] = V1.v4u[1];
    out.u[3] = V2.v4u[1];
    return out.v;
#else
    return _mm_unpacklo_ps(V1, V2);
#endif
}
// Interleaves the high halves of both vectors: (V1.z, V2.z, V1.w, V2.w).
inline M3D_VECTOR M3D_V4MergeZW(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 out;
    out.u[0] = V1.v4u[2];
    out.u[1] = V2.v4u[2];
    out.u[2] = V1.v4u[3];
    out.u[3] = V2.v4u[3];
    return out.v;
#else
    return _mm_unpackhi_ps(V1, V2);
#endif
}
// Component-wise V1 > V2; each lane is all-ones when true, zero otherwise.
inline M3D_VECTOR M3D_V4Greater(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 mask;
    for (int i = 0; i < 4; ++i)
        mask.u[i] = (V1.v4f[i] > V2.v4f[i]) ? 0xFFFFFFFF : 0;
    return mask.v;
#else
    return _mm_cmpgt_ps(V1, V2);
#endif
}
// Component-wise V1 < V2; each lane is all-ones when true, zero otherwise.
inline M3D_VECTOR M3D_V4Less(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 mask;
    for (int i = 0; i < 4; ++i)
        mask.u[i] = (V1.v4f[i] < V2.v4f[i]) ? 0xFFFFFFFF : 0;
    return mask.v;
#else
    return _mm_cmplt_ps(V1, V2);
#endif
}
// Bitwise AND of the two vectors (treated as raw 32-bit lanes).
inline M3D_VECTOR M3D_V4AndInt(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_V4U32 Result = {{{
V1.v4u[0] & V2.v4u[0],
V1.v4u[1] & V2.v4u[1],
V1.v4u[2] & V2.v4u[2],
V1.v4u[3] & V2.v4u[3]
}}};
// Return the raw vector member like every sibling helper does, instead of
// relying on an implicit M3D_V4U32 -> M3D_VECTOR conversion.
return Result.v;
#else
return _mm_and_ps(V1, V2);
#endif
}
// Bitwise OR of the two vectors (treated as raw 32-bit lanes).
inline M3D_VECTOR M3D_V4OrInt(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 out;
    for (int i = 0; i < 4; ++i)
        out.u[i] = V1.v4u[i] | V2.v4u[i];
    return out.v;
#else
    __m128i bits = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2));
    return _mm_castsi128_ps(bits);
#endif
}
// Component-wise reciprocal 1/x using a full-precision divide (not the
// low-precision rcp estimate).
inline M3D_VECTOR M3D_V4Reciprocal(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = 1.f / V.v4f[i];
    return out.v;
#else
    return _mm_div_ps(M3D_MOne, V);
#endif
}
// Component-wise square root.
inline M3D_VECTOR M3D_V4Sqrt(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 out;
    for (int i = 0; i < 4; ++i)
        out.f[i] = sqrtf(V.v4f[i]);
    return out.v;
#else
    return _mm_sqrt_ps(V);
#endif
}
// Wraps each lane of Angles into [-pi, pi) by subtracting the nearest
// multiple of 2*pi: Angles - 2*pi * round(Angles / (2*pi)).
inline M3D_VECTOR M3D_V4ModAngles(M3D_VECTOR Angles) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_VECTOR V;
M3D_VECTOR Result;
// Modulo the range of the given angles such that -pi <= result < pi.
V = M3D_V4Multiply(Angles, M3D_MReciprocalTwoPi.v);
V = M3D_V4Round(V);
Result = M3D_V4NegativeMultiplySubtract(M3D_MTwoPi.v, V, Angles);
return Result;
#else
// Modulo the range of the given angles such that -pi <= result < pi.
M3D_VECTOR vResult = _mm_mul_ps(Angles, M3D_MReciprocalTwoPi);
// Use the inline function due to complexity for rounding
vResult = M3D_V4Round(vResult);
return M3D_FNMADD_PS(vResult, M3D_MTwoPi, Angles);
#endif
}
// True when V1 <= V2 holds for x, y and z (w is ignored).
inline bool M3D_V3LessOrEqual(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    return V1.v4f[0] <= V2.v4f[0] && V1.v4f[1] <= V2.v4f[1] && V1.v4f[2] <= V2.v4f[2];
#else
    M3D_VECTOR cmp = _mm_cmple_ps(V1, V2);
    // Only the three low mask bits (x, y, z) matter.
    return (_mm_movemask_ps(cmp) & 7) == 7;
#endif
}
// 3D dot product of V1 and V2 (w lanes ignored); the scalar result is
// replicated into all four lanes of the returned vector.
inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
float fValue = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1] + V1.v4f[2] * V2.v4f[2];
M3D_V4F32 vResult;
vResult.f[0] =
vResult.f[1] =
vResult.f[2] =
vResult.f[3] = fValue;
return vResult.v;
#elif defined(SSE4_INTRINSICS)
return _mm_dp_ps(V1, V2, 0x7f);
#elif defined(SSE3_INTRINSICS)
M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
// Fixed: use this library's M3D_MMask3 constant; g_XMMask3 is the
// DirectXMath name and is undefined in this project (SSE3 build break).
vTemp = _mm_and_ps(vTemp, M3D_MMask3);
vTemp = _mm_hadd_ps(vTemp, vTemp);
return _mm_hadd_ps(vTemp, vTemp);
#else
// Perform the dot product
M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
// x=Dot.v4f[1], y=Dot.v4f[2]
M3D_VECTOR vTemp = M3D_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1));
// Result.v4f[0] = x+y
vDot = _mm_add_ss(vDot, vTemp);
// x=Dot.v4f[2]
vTemp = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
// Result.v4f[0] = (x+y)+z
vDot = _mm_add_ss(vDot, vTemp);
// Splat x
return M3D_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}
// 3D cross product of V1 and V2; the w lane of the result is forced to zero.
// [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ]
inline M3D_VECTOR M3D_V3Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_V4F32 vResult = {{{
(V1.v4f[1] * V2.v4f[2]) - (V1.v4f[2] * V2.v4f[1]),
(V1.v4f[2] * V2.v4f[0]) - (V1.v4f[0] * V2.v4f[2]),
(V1.v4f[0] * V2.v4f[1]) - (V1.v4f[1] * V2.v4f[0]),
0.0f
}}};
return vResult.v;
#else
// Classic two-shuffle formulation: (V1.yzx * V2.zxy) - (V1.zxy * V2.yzx).
// y1,z1,x1,w1
M3D_VECTOR vTemp1 = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(3, 0, 2, 1));
// z2,x2,y2,w2
M3D_VECTOR vTemp2 = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(3, 1, 0, 2));
// Perform the left operation
M3D_VECTOR vResult = _mm_mul_ps(vTemp1, vTemp2);
// z1,x1,y1,w1
vTemp1 = M3D_PERMUTE_PS(vTemp1, _MM_SHUFFLE(3, 0, 2, 1));
// y2,z2,x2,w2
vTemp2 = M3D_PERMUTE_PS(vTemp2, _MM_SHUFFLE(3, 1, 0, 2));
// Perform the right operation
vResult = M3D_FNMADD_PS(vTemp1, vTemp2, vResult);
// Set w to zero
return _mm_and_ps(vResult, M3D_MMask3);
#endif
}
// Squared 3D length of V (w ignored), replicated into all lanes.
inline M3D_VECTOR M3D_V3LengthSq(M3D_VECTOR V) noexcept {
return M3D_V3Dot(V, V);
}
// 3D length (magnitude) of V (w ignored), replicated into all four lanes.
inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_VECTOR Result;
Result = M3D_V3LengthSq(V);
Result = M3D_V4Sqrt(Result);
return Result;
#elif defined(SSE4_INTRINSICS)
M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
return _mm_sqrt_ps(vTemp);
#elif defined(SSE3_INTRINSICS)
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
// Fixed: use this library's M3D_MMask3 constant; g_XMMask3 is the
// DirectXMath name and is undefined in this project (SSE3 build break).
vLengthSq = _mm_and_ps(vLengthSq, M3D_MMask3);
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
vLengthSq = _mm_sqrt_ps(vLengthSq);
return vLengthSq;
#else
// Perform the dot product on x,y and z
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
// vTemp has z and y
M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2));
// x+z, y
vLengthSq = _mm_add_ss(vLengthSq, vTemp);
// y,y,y,y
vTemp = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
// x+z+y,??,??,??
vLengthSq = _mm_add_ss(vLengthSq, vTemp);
// Splat the length squared
vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
// Get the length
vLengthSq = _mm_sqrt_ps(vLengthSq);
return vLengthSq;
#endif
}
// Normalizes the x/y/z components of V to unit length. Zero-length input
// yields zero; infinite-length input yields QNaN lanes (matching the
// DirectXMath XMVector3Normalize contract).
inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_VECTOR vResult = M3D_V3Length(V);
float fLength = vResult.v4f[0];
// Prevent divide by zero - uhuh
if (fLength > 0) {
fLength = 1.0f / fLength;
}
vResult.v4f[0] = V.v4f[0] * fLength;
vResult.v4f[1] = V.v4f[1] * fLength;
vResult.v4f[2] = V.v4f[2] * fLength;
vResult.v4f[3] = V.v4f[3] * fLength;
return vResult;
#elif defined(SSE4_INTRINSICS)
M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
// Prepare for the division
M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Create zero with a single instruction
M3D_VECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
// Fixed: this library's constants are M3D_MInfinity / M3D_MQNaN; the
// g_XM* names are DirectXMath's and are undefined here (SSE4 build break).
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
vResult = _mm_or_ps(vTemp1, vTemp2);
return vResult;
#elif defined(SSE3_INTRINSICS)
// Perform the dot product on x,y and z only
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
// Fixed: M3D_MMask3 instead of DirectXMath's g_XMMask3 (undefined here).
vLengthSq = _mm_and_ps(vLengthSq, M3D_MMask3);
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
// Prepare for the division
M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Create zero with a single instruction
M3D_VECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
// Fixed: M3D_MInfinity / M3D_MQNaN instead of the undefined g_XM* names.
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
vResult = _mm_or_ps(vTemp1, vTemp2);
return vResult;
#else
// Perform the dot product on x,y and z only
M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
vLengthSq = _mm_add_ss(vLengthSq, vTemp);
vTemp = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
vLengthSq = _mm_add_ss(vLengthSq, vTemp);
vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
// Prepare for the division
M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Create zero with a single instruction
M3D_VECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
vResult = _mm_or_ps(vTemp1, vTemp2);
return vResult;
#endif
}
/* -------------------------------------------------------------------------------------------------------------------------- */
// Quaternion multiplication (Hamilton product), layout (x, y, z) = vector part, w = scalar part.
// The expanded component formula matches DirectXMath's XMQuaternionMultiply, i.e. the
// returned quaternion is the product Q2*Q1, which composes rotations as "apply Q1, then Q2".
// NOTE(review): the rotation-order reading assumes the usual column-style composition
// convention - confirm against how callers chain orientations.
inline M3D_VECTOR M3D_QMultiply(M3D_VECTOR Q1, M3D_VECTOR Q2) noexcept {
#ifdef DISABLE_INTRINSICS
// Scalar reference path: each output component of the product written out explicitly.
M3D_V4F32 Result = {{{
(Q2.v4f[3] * Q1.v4f[0]) + (Q2.v4f[0] * Q1.v4f[3]) + (Q2.v4f[1] * Q1.v4f[2]) - (Q2.v4f[2] * Q1.v4f[1]),
(Q2.v4f[3] * Q1.v4f[1]) - (Q2.v4f[0] * Q1.v4f[2]) + (Q2.v4f[1] * Q1.v4f[3]) + (Q2.v4f[2] * Q1.v4f[0]),
(Q2.v4f[3] * Q1.v4f[2]) + (Q2.v4f[0] * Q1.v4f[1]) - (Q2.v4f[1] * Q1.v4f[0]) + (Q2.v4f[2] * Q1.v4f[3]),
(Q2.v4f[3] * Q1.v4f[3]) - (Q2.v4f[0] * Q1.v4f[0]) - (Q2.v4f[1] * Q1.v4f[1]) - (Q2.v4f[2] * Q1.v4f[2])
}}};
return Result.v;
#else
// SSE path: the product above is regrouped as four fused terms,
//   Q1*Q2.w  +  sign*(Q1.wzyx * Q2.x)  +  sign*(Q1.zwxy * Q2.y)  +  sign*(Q1.yxwz * Q2.z)
// where the Control* constants carry the per-lane signs of each term.
static const M3D_V4F32 ControlWZYX = {{{1.0f, -1.0f, 1.0f, -1.0f}}};
static const M3D_V4F32 ControlZWXY = {{{1.0f, 1.0f, -1.0f, -1.0f}}};
static const M3D_V4F32 ControlYXWZ = {{{-1.0f, 1.0f, 1.0f, -1.0f}}};
// Copy to SSE registers and use as few as possible for x86
M3D_VECTOR Q2X = Q2;
M3D_VECTOR Q2Y = Q2;
M3D_VECTOR Q2Z = Q2;
M3D_VECTOR vResult = Q2;
// Splat with one instruction (each register holds one broadcast component of Q2)
vResult = M3D_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 3, 3, 3));
Q2X = M3D_PERMUTE_PS(Q2X, _MM_SHUFFLE(0, 0, 0, 0));
Q2Y = M3D_PERMUTE_PS(Q2Y, _MM_SHUFFLE(1, 1, 1, 1));
Q2Z = M3D_PERMUTE_PS(Q2Z, _MM_SHUFFLE(2, 2, 2, 2));
// Retire Q1 and perform Q1*Q2W
vResult = _mm_mul_ps(vResult, Q1);
M3D_VECTOR Q1Shuffle = Q1;
// Shuffle the copies of Q1 (full reverse: xyzw -> wzyx)
Q1Shuffle = M3D_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3));
// Mul by Q1WZYX
Q2X = _mm_mul_ps(Q2X, Q1Shuffle);
Q1Shuffle = M3D_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(2, 3, 0, 1));
// Flip the signs on y and z
vResult = M3D_FMADD_PS(Q2X, ControlWZYX, vResult);
// Mul by Q1ZWXY
Q2Y = _mm_mul_ps(Q2Y, Q1Shuffle);
Q1Shuffle = M3D_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3));
// Flip the signs on z and w
Q2Y = _mm_mul_ps(Q2Y, ControlZWXY);
// Mul by Q1YXWZ
Q2Z = _mm_mul_ps(Q2Z, Q1Shuffle);
// Flip the signs on x and w
Q2Y = M3D_FMADD_PS(Q2Z, ControlYXWZ, Q2Y);
// Accumulate the last two signed terms into the result
vResult = _mm_add_ps(vResult, Q2Y);
return vResult;
#endif
}
// Returns the conjugate of quaternion Q: the vector part (x, y, z) is negated
// while the scalar part (w) is left untouched.
inline M3D_VECTOR M3D_QConjugate(M3D_VECTOR Q) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_V4F32 conjugated = {{{-Q.v4f[0], -Q.v4f[1], -Q.v4f[2], Q.v4f[3]}}};
return conjugated.v;
#else
// Multiply lanes x,y,z by -1 and lane w by +1 in a single instruction.
// _mm_set_ps takes arguments high-lane first, so (w, z, y, x) order here.
return _mm_mul_ps(Q, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
#endif
}
// Computes the unit-length normal of the triangle (P1, P2, P3).
// The two edges sharing P1 are crossed as (P3 - P1) x (P2 - P1); the vertex
// winding order therefore decides which side of the triangle the normal faces
// (used by the backface-culling pass).
inline M3D_VECTOR M3D_TNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexcept {
M3D_VECTOR edgeA = M3D_V4Subtract(P2, P1);  // P1 -> P2
M3D_VECTOR edgeB = M3D_V4Subtract(P3, P1);  // P1 -> P3
return M3D_V3Normalize(M3D_V3Cross(edgeB, edgeA));
}
/* -------------------------------------------------------------------------------------------------------------------------- */
// Computes, per lane, the sine and cosine of the four angles in V (radians),
// writing the results to *pSin and *pCos.
// The SSE path mirrors the scalar M3D_ScalarSinCos at the top of this file:
// range-reduce to [-pi, pi], fold into [-pi/2, pi/2] (sin is unchanged by the
// fold, cos needs a sign correction), then evaluate minimax polynomials in
// Horner form with coefficients splatted from the packed constant vectors.
inline void M3D_V4SinCos(M3D_VECTOR* pSin, M3D_VECTOR* pCos, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
// Scalar fallback: just defer to the C runtime per lane.
M3D_V4F32 Sin = {{{
sinf(V.v4f[0]),
sinf(V.v4f[1]),
sinf(V.v4f[2]),
sinf(V.v4f[3])
}}};
M3D_V4F32 Cos = {{{
cosf(V.v4f[0]),
cosf(V.v4f[1]),
cosf(V.v4f[2]),
cosf(V.v4f[3])
}}};
*pSin = Sin.v;
*pCos = Cos.v;
#else
// Force the value within the bounds of pi
M3D_VECTOR x = M3D_V4ModAngles(V);
// Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
M3D_VECTOR sign = _mm_and_ps(x, M3D_MNegativeZero);    // isolate each lane's sign bit
__m128 c = _mm_or_ps(M3D_MPi, sign); // pi when x >= 0, -pi when x < 0
__m128 absx = _mm_andnot_ps(sign, x); // |x|
__m128 rflx = _mm_sub_ps(c, x);                        // reflected angle +/-pi - x
__m128 comp = _mm_cmple_ps(absx, M3D_MHalfPi);         // mask: lane already in [-pi/2, pi/2]
// Branchless select: keep x where in range, use the reflection elsewhere.
__m128 select0 = _mm_and_ps(comp, x);
__m128 select1 = _mm_andnot_ps(comp, rflx);
x = _mm_or_ps(select0, select1);
// Cosine sign: +1 for in-range lanes, -1 for reflected lanes.
select0 = _mm_and_ps(comp, M3D_MOne);
select1 = _mm_andnot_ps(comp, M3D_MNegativeOne);
sign = _mm_or_ps(select0, select1);
__m128 x2 = _mm_mul_ps(x, x);
// Compute polynomial approximation of sine (odd series in x, Horner in x^2;
// presumably the same 11-degree minimax coefficients as M3D_ScalarSinCos - see header constants)
const M3D_VECTOR SC1 = M3D_MSinCoeff1;
__m128 vConstantsB = M3D_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0));
const M3D_VECTOR SC0 = M3D_MSinCoeff0;
__m128 vConstants = M3D_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3));
__m128 Result = M3D_FMADD_PS(vConstantsB, x2, vConstants);
vConstants = M3D_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2));
Result = M3D_FMADD_PS(Result, x2, vConstants);
vConstants = M3D_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1));
Result = M3D_FMADD_PS(Result, x2, vConstants);
vConstants = M3D_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0));
Result = M3D_FMADD_PS(Result, x2, vConstants);
Result = M3D_FMADD_PS(Result, x2, M3D_MOne);
Result = _mm_mul_ps(Result, x);                        // final multiply by x restores the odd series
*pSin = Result;
// Compute polynomial approximation of cosine (even series in x^2)
const M3D_VECTOR CC1 = M3D_MCosCoeff1;
vConstantsB = M3D_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0));
const M3D_VECTOR CC0 = M3D_MCosCoeff0;
vConstants = M3D_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3));
Result = M3D_FMADD_PS(vConstantsB, x2, vConstants);
vConstants = M3D_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2));
Result = M3D_FMADD_PS(Result, x2, vConstants);
vConstants = M3D_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1));
Result = M3D_FMADD_PS(Result, x2, vConstants);
vConstants = M3D_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0));
Result = M3D_FMADD_PS(Result, x2, vConstants);
Result = M3D_FMADD_PS(Result, x2, M3D_MOne);
Result = _mm_mul_ps(Result, sign);                     // apply the reflection sign correction
*pCos = Result;
#endif
}