2431 lines
83 KiB
C++
2431 lines
83 KiB
C++
#pragma once
|
|
|
|
|
|
inline void M3D_ScalarSinCos(float* pSin, float* pCos, float Value) noexcept {
|
|
// Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
|
|
float quotient = M3D_1DIV2PI * Value;
|
|
if (Value >= 0.0f)
|
|
quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
|
|
else
|
|
quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
|
|
|
|
float y = Value - M3D_2PI * quotient;
|
|
|
|
// Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
|
|
float sign;
|
|
if (y > M3D_PIDIV2) {
|
|
y = M3D_PI - y;
|
|
sign = -1.0f;
|
|
} else if (y < -M3D_PIDIV2) {
|
|
y = -M3D_PI - y;
|
|
sign = -1.0f;
|
|
} else {
|
|
sign = +1.0f;
|
|
}
|
|
|
|
float y2 = y * y;
|
|
|
|
// 11-degree minimax approximation
|
|
*pSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
|
|
|
|
// 10-degree minimax approximation
|
|
float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f;
|
|
*pCos = sign * p;
|
|
}
|
|
|
|
namespace M3D_Internal {
#ifdef DISABLE_INTRINSICS
    // Scalar round-half-to-even ("banker's rounding") fallback, used when
    // SIMD rounding instructions are unavailable.
    inline float round_to_nearest(float x) noexcept {
        float lower = floorf(x);
        float frac = x - lower;

        // Unambiguous cases: strictly below / above the midpoint.
        if (frac < 0.5f)
            return lower;
        if (frac > 0.5f)
            return lower + 1.f;

        // Exactly halfway: choose the even neighbour.
        float half_int;
        (void)modff(lower / 2.f, &half_int);
        return ((2.f * half_int) == lower) ? lower : lower + 1.f;
    }
#endif
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Builds a 4x4 matrix from 16 scalars given in row-major order:
// fRC is the element at row R, column C.
inline M3D_MATRIX::M3D_MATRIX(
    float f00, float f01, float f02, float f03,
    float f10, float f11, float f12, float f13,
    float f20, float f21, float f22, float f23,
    float f30, float f31, float f32, float f33
) noexcept {
    // Each row of four scalars is packed into one vector register.
    rows[0] = M3D_V4Set(f00, f01, f02, f03);
    rows[1] = M3D_V4Set(f10, f11, f12, f13);
    rows[2] = M3D_V4Set(f20, f21, f22, f23);
    rows[3] = M3D_V4Set(f30, f31, f32, f33);
}
|
|
|
|
// Unary minus: returns a matrix with every element negated.
inline M3D_MATRIX M3D_MATRIX::operator- () const noexcept {
    M3D_MATRIX negated;
    for (int r = 0; r < 4; ++r)
        negated.rows[r] = M3D_V4Negate(rows[r]);
    return negated;
}
|
|
|
|
// In-place element-wise matrix addition.
inline M3D_MATRIX& M3D_MATRIX::operator+= (M3D_MATRIX M) noexcept {
    for (int r = 0; r < 4; ++r)
        rows[r] = M3D_V4Add(rows[r], M.rows[r]);
    return *this;
}
|
|
// Element-wise matrix addition.
inline M3D_MATRIX M3D_MATRIX::operator+ (M3D_MATRIX M) const noexcept {
    M3D_MATRIX sum;
    for (int r = 0; r < 4; ++r)
        sum.rows[r] = M3D_V4Add(rows[r], M.rows[r]);
    return sum;
}
|
|
|
|
// In-place element-wise matrix subtraction.
inline M3D_MATRIX& M3D_MATRIX::operator-= (M3D_MATRIX M) noexcept {
    for (int r = 0; r < 4; ++r)
        rows[r] = M3D_V4Subtract(rows[r], M.rows[r]);
    return *this;
}
|
|
// Element-wise matrix subtraction.
inline M3D_MATRIX M3D_MATRIX::operator- (M3D_MATRIX M) const noexcept {
    M3D_MATRIX difference;
    for (int r = 0; r < 4; ++r)
        difference.rows[r] = M3D_V4Subtract(rows[r], M.rows[r]);
    return difference;
}
|
|
|
|
// In-place matrix multiplication: this = this * M.
inline M3D_MATRIX& M3D_MATRIX::operator*=(M3D_MATRIX M) noexcept {
    return *this = M3D_MMultiply(*this, M);
}
|
|
// Matrix multiplication; *this is the left operand.
inline M3D_MATRIX M3D_MATRIX::operator*(M3D_MATRIX M) const noexcept {
    M3D_MATRIX product = M3D_MMultiply(*this, M);
    return product;
}
|
|
|
|
// In-place uniform scaling of all 16 elements by S.
inline M3D_MATRIX& M3D_MATRIX::operator*= (float S) noexcept {
    for (int r = 0; r < 4; ++r)
        rows[r] = M3D_V4Scale(rows[r], S);
    return *this;
}
|
|
// Returns a copy of the matrix with every element multiplied by S.
inline M3D_MATRIX M3D_MATRIX::operator* (float S) const noexcept {
    M3D_MATRIX scaled;
    for (int r = 0; r < 4; ++r)
        scaled.rows[r] = M3D_V4Scale(rows[r], S);
    return scaled;
}
|
|
// Scalar-on-the-left matrix scaling (same result as M * S).
inline M3D_MATRIX operator* (float S, M3D_MATRIX M) noexcept {
    M3D_MATRIX scaled;
    for (int r = 0; r < 4; ++r)
        scaled.rows[r] = M3D_V4Scale(M.rows[r], S);
    return scaled;
}
|
|
|
|
// In-place division of every element by S. The scalar is broadcast once and
// a true per-lane divide is performed (no reciprocal approximation).
inline M3D_MATRIX& M3D_MATRIX::operator/= (float S) noexcept {
#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR divisor = M3D_V4Replicate(S);
    for (int r = 0; r < 4; ++r)
        rows[r] = M3D_V4Divide(rows[r], divisor);
    return *this;
#else
    const __m128 divisor = _mm_set_ps1(S);
    for (int r = 0; r < 4; ++r)
        rows[r] = _mm_div_ps(rows[r], divisor);
    return *this;
#endif
}
|
|
// Returns a copy of the matrix with every element divided by S.
inline M3D_MATRIX M3D_MATRIX::operator/ (float S) const noexcept {
#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR divisor = M3D_V4Replicate(S);
    M3D_MATRIX quotient;
    for (int r = 0; r < 4; ++r)
        quotient.rows[r] = M3D_V4Divide(rows[r], divisor);
    return quotient;
#else
    const __m128 divisor = _mm_set_ps1(S);
    M3D_MATRIX quotient;
    for (int r = 0; r < 4; ++r)
        quotient.rows[r] = _mm_div_ps(rows[r], divisor);
    return quotient;
#endif
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Loads an unaligned 3-float struct into a vector; the w lane is set to 0.
inline M3D_VECTOR M3D_V4LoadF3(const M3D_F3* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR loaded;
    loaded.v4f[0] = src->x;
    loaded.v4f[1] = src->y;
    loaded.v4f[2] = src->z;
    loaded.v4f[3] = 0.f;
    return loaded;
#else
    // Load x/y as one 64-bit chunk and z on its own (_mm_load_ss zeroes the
    // upper lanes), then combine into {x, y, z, 0}.
    // (An SSE4 variant would be: _mm_insert_ps(xy, z, 0x20).)
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
    __m128 z = _mm_load_ss(&src->z);
    return _mm_movelh_ps(xy, z);
#endif
}
|
|
|
|
// Loads a 16-byte-aligned 3-float struct into a vector; w is forced to 0.
inline M3D_VECTOR M3D_V4LoadF3A(const M3D_F3A* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR loaded;
    loaded.v4f[0] = src->x;
    loaded.v4f[1] = src->y;
    loaded.v4f[2] = src->z;
    loaded.v4f[3] = 0.f;
    return loaded;
#else
    // The aligned load reads one float past z (the type's padding); the
    // mask then clears the w lane.
    __m128 loaded = _mm_load_ps(&src->x);
    return _mm_and_ps(loaded, M3D_MMask3);
#endif
}
|
|
|
|
// Stores the x/y/z lanes of V to an unaligned 3-float struct; w is discarded.
inline void M3D_V4StoreF3(M3D_F3* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
#else
    // Write x/y as one 64-bit store, then splat lane 2 and store z alone.
    // (An SSE4 variant would use _mm_extract_ps per component.)
    _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
    __m128 zLane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    _mm_store_ss(&dst->z, zLane);
#endif
}
|
|
|
|
// Stores the x/y/z lanes of V to a 16-byte-aligned 3-float struct.
inline void M3D_V4StoreF3A(M3D_F3A* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
#else
    // x/y as one 64-bit store; movehl brings lane 2 down for the z store.
    _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
    __m128 zLane = _mm_movehl_ps(V, V);
    _mm_store_ss(&dst->z, zLane);
#endif
}
|
|
|
|
// Loads four floats (unaligned) into a vector.
inline M3D_VECTOR M3D_V4LoadF4(const M3D_F4* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR loaded;
    loaded.v4f[0] = src->x;
    loaded.v4f[1] = src->y;
    loaded.v4f[2] = src->z;
    loaded.v4f[3] = src->w;
    return loaded;
#else
    return _mm_loadu_ps(&src->x);
#endif
}
|
|
|
|
// Loads four floats from a 16-byte-aligned struct into a vector.
// NOTE(review): the name breaks the F-prefix pattern of its siblings
// (M3D_V4LoadF4 / M3D_V4LoadF3A); kept as-is since callers depend on it.
inline M3D_VECTOR M3D_V4LoadV4A(const M3D_F4A* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR loaded;
    loaded.v4f[0] = src->x;
    loaded.v4f[1] = src->y;
    loaded.v4f[2] = src->z;
    loaded.v4f[3] = src->w;
    return loaded;
#else
    return _mm_load_ps(&src->x);
#endif
}
|
|
|
|
// Stores all four lanes of V to an unaligned 4-float struct.
inline void M3D_V4StoreF4(M3D_F4* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
    dst->w = V.v4f[3];
#else
    _mm_storeu_ps(&dst->x, V);
#endif
}
|
|
|
|
// Stores all four lanes of V to a 16-byte-aligned 4-float struct.
inline void M3D_V4StoreF4A(M3D_F4A* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
    dst->w = V.v4f[3];
#else
    _mm_store_ps(&dst->x, V);
#endif
}
|
|
|
|
// Loads an unaligned 4x4 float array into a matrix, row by row.
inline M3D_MATRIX M3D_V4LoadF4x4(const M3D_F4X4* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX loaded;
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
            loaded.rows[r].v4f[c] = src->mat[r][c];
    return loaded;
#else
    M3D_MATRIX loaded;
    loaded.rows[0] = _mm_loadu_ps(&src->_00);
    loaded.rows[1] = _mm_loadu_ps(&src->_10);
    loaded.rows[2] = _mm_loadu_ps(&src->_20);
    loaded.rows[3] = _mm_loadu_ps(&src->_30);
    return loaded;
#endif
}
|
|
|
|
// Loads a 16-byte-aligned 4x4 float array into a matrix, row by row.
inline M3D_MATRIX M3D_V4LoadF4x4A(const M3D_F4X4A* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX loaded;
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
            loaded.rows[r].v4f[c] = src->mat[r][c];
    return loaded;
#else
    M3D_MATRIX loaded;
    loaded.rows[0] = _mm_load_ps(&src->_00);
    loaded.rows[1] = _mm_load_ps(&src->_10);
    loaded.rows[2] = _mm_load_ps(&src->_20);
    loaded.rows[3] = _mm_load_ps(&src->_30);
    return loaded;
#endif
}
|
|
|
|
// Stores matrix M to an unaligned 4x4 float array, row by row.
inline void M3D_V4StoreF4x4(M3D_F4X4* dst, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
            dst->mat[r][c] = M.rows[r].v4f[c];
#else
    _mm_storeu_ps(&dst->_00, M.rows[0]);
    _mm_storeu_ps(&dst->_10, M.rows[1]);
    _mm_storeu_ps(&dst->_20, M.rows[2]);
    _mm_storeu_ps(&dst->_30, M.rows[3]);
#endif
}
|
|
|
|
// Stores matrix M to a 16-byte-aligned 4x4 float array, row by row.
inline void M3D_V4StoreF4x4A(M3D_F4X4A* dst, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
            dst->mat[r][c] = M.rows[r].v4f[c];
#else
    _mm_store_ps(&dst->_00, M.rows[0]);
    _mm_store_ps(&dst->_10, M.rows[1]);
    _mm_store_ps(&dst->_20, M.rows[2]);
    _mm_store_ps(&dst->_30, M.rows[3]);
#endif
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Builds a vector from four scalars: lanes (x, y, z, w).
inline M3D_VECTOR M3D_V4Set(float x, float y, float z, float w) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 packed = {{{x, y, z, w}}};
    return packed.v;
#else
    // _mm_set_ps takes its arguments high-lane-first, hence the reversal.
    return _mm_set_ps(w, z, y, x);
#endif
}
|
|
|
|
// Lane-wise negation of V.
inline M3D_VECTOR M3D_V4Negate(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 negated = {{{-V.v4f[0], -V.v4f[1], -V.v4f[2], -V.v4f[3]}}};
    return negated.v;
#else
    // 0 - V negates every lane.
    return _mm_sub_ps(_mm_setzero_ps(), V);
#endif
}
|
|
|
|
// Broadcasts a single scalar into all four lanes.
inline M3D_VECTOR M3D_V4Replicate(float val) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 splat = {{{val, val, val, val}}};
    return splat.v;
#else
    return _mm_set_ps1(val);
#endif
}
|
|
|
|
// Returns the x (lane 0) component of V as a scalar float.
inline float M3D_V4GetX(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[0];
#else
    return _mm_cvtss_f32(V);
#endif
}
|
|
|
|
// Returns the y (lane 1) component of V as a scalar float.
inline float M3D_V4GetY(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[1];
#else
    // Broadcast lane 1 into lane 0, then extract the low float.
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    return _mm_cvtss_f32(vTemp);
#endif
}
|
|
|
|
// Returns the z (lane 2) component of V as a scalar float.
inline float M3D_V4GetZ(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[2];
#else
    // Broadcast lane 2 into lane 0, then extract the low float.
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    return _mm_cvtss_f32(vTemp);
#endif
}
|
|
|
|
// Returns the w (lane 3) component of V as a scalar float.
inline float M3D_V4GetW(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[3];
#else
    // Broadcast lane 3 into lane 0, then extract the low float.
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
    return _mm_cvtss_f32(vTemp);
#endif
}
|
|
|
|
// Builds a vector whose four lanes are selected, per-lane, from the eight
// lanes of V1 (indices 0-3) and V2 (indices 4-7). Each Permute* argument is
// an index in [0, 7].
inline M3D_VECTOR M3D_V4Permute(M3D_VECTOR V1, M3D_VECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW) noexcept {
#if defined(AVX_INTRINSICS) && !defined(DISABLE_INTRINSICS)
    static const M3D_V4U32 three = {{{3, 3, 3, 3}}};

    M3D_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
    __m128i vControl = _mm_load_si128(reinterpret_cast<const __m128i*>(&elem[0]));

    // vSelect: all-ones in lanes whose index is > 3 (i.e. sourced from V2).
    __m128i vSelect = _mm_cmpgt_epi32(vControl, three);
    // Reduce each index to 0-3 for the in-vector permute.
    vControl = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(vControl), three));

    // Permute both sources by the same per-lane indices...
    __m128 shuffled1 = _mm_permutevar_ps(V1, vControl);
    __m128 shuffled2 = _mm_permutevar_ps(V2, vControl);

    // ...then merge them under the V1/V2 selection mask.
    __m128 masked1 = _mm_andnot_ps(_mm_castsi128_ps(vSelect), shuffled1);
    __m128 masked2 = _mm_and_ps(_mm_castsi128_ps(vSelect), shuffled2);

    return _mm_or_ps(masked1, masked2);
#else
    // Scalar fallback: view both inputs as arrays of 32-bit words and copy
    // the requested word into each output lane (bit patterns preserved).
    const uint32_t* aPtr[2];
    aPtr[0] = reinterpret_cast<const uint32_t*>(&V1);
    aPtr[1] = reinterpret_cast<const uint32_t*>(&V2);

    M3D_VECTOR Result;
    auto pWork = reinterpret_cast<uint32_t*>(&Result);

    // For each index: low two bits select the lane, bit 2 selects V1 or V2.
    const uint32_t i0 = PermuteX & 3;
    const uint32_t vi0 = PermuteX >> 2;
    pWork[0] = aPtr[vi0][i0];

    const uint32_t i1 = PermuteY & 3;
    const uint32_t vi1 = PermuteY >> 2;
    pWork[1] = aPtr[vi1][i1];

    const uint32_t i2 = PermuteZ & 3;
    const uint32_t vi2 = PermuteZ >> 2;
    pWork[2] = aPtr[vi2][i2];

    const uint32_t i3 = PermuteW & 3;
    const uint32_t vi3 = PermuteW >> 2;
    pWork[3] = aPtr[vi3][i3];

    return Result;
#endif
}
|
|
|
|
// Replicates the x (lane 0) component across all four lanes.
inline M3D_VECTOR M3D_V4SplatX(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 splat = {{{V.v4f[0], V.v4f[0], V.v4f[0], V.v4f[0]}}};
    return splat.v;
#elif defined(AVX2_INTRINSICS) && defined(FAVOR_INTEL)
    return _mm_broadcastss_ps(V);
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}
|
|
|
|
// Replicates the y (lane 1) component across all four lanes.
inline M3D_VECTOR M3D_V4SplatY(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 splat = {{{V.v4f[1], V.v4f[1], V.v4f[1], V.v4f[1]}}};
    return splat.v;
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
#endif
}
|
|
|
|
// Replicates the z (lane 2) component across all four lanes.
inline M3D_VECTOR M3D_V4SplatZ(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 splat = {{{V.v4f[2], V.v4f[2], V.v4f[2], V.v4f[2]}}};
    return splat.v;
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
#endif
}
|
|
|
|
// Replicates the w (lane 3) component across all four lanes.
inline M3D_VECTOR M3D_V4SplatW(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 splat = {{{V.v4f[3], V.v4f[3], V.v4f[3], V.v4f[3]}}};
    return splat.v;
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
#endif
}
|
|
|
|
// Rounds each lane to the nearest integer, with halfway cases going to the
// even neighbour (round-half-to-even in every code path).
inline M3D_VECTOR M3D_V4Round(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 Result = { { {
        M3D_Internal::round_to_nearest(V.v4f[0]),
        M3D_Internal::round_to_nearest(V.v4f[1]),
        M3D_Internal::round_to_nearest(V.v4f[2]),
        M3D_Internal::round_to_nearest(V.v4f[3])
    } } };
    return Result.v;
#elif defined(SSE4_INTRINSICS)
    return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
    // SSE2 trick: adding then subtracting a "magic" constant (2^23, carrying
    // the input's sign) forces the FPU to discard the fraction using its
    // round-to-nearest-even mode.
    __m128 sign = _mm_and_ps(V, M3D_MNegativeZero);
    __m128 sMagic = _mm_or_ps(M3D_MNoFraction, sign);
    __m128 R1 = _mm_add_ps(V, sMagic);
    R1 = _mm_sub_ps(R1, sMagic);
    // Lanes with |x| > 2^23 are already integral (or NaN); keep them as-is.
    __m128 R2 = _mm_and_ps(V, M3D_MAbsMask);
    __m128 mask = _mm_cmple_ps(R2, M3D_MNoFraction);
    R2 = _mm_andnot_ps(mask, V);
    R1 = _mm_and_ps(R1, mask);
    // Per lane exactly one of R1/R2 is non-zero, so xor acts as a merge.
    M3D_VECTOR vResult = _mm_xor_ps(R1, R2);
    return vResult;
#endif
}
|
|
|
|
// Lane-wise addition: V1 + V2.
inline M3D_VECTOR M3D_V4Add(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR sum;
    for (int i = 0; i < 4; ++i)
        sum.v4f[i] = V1.v4f[i] + V2.v4f[i];
    return sum;
#else
    return _mm_add_ps(V1, V2);
#endif
}
|
|
|
|
// Lane-wise subtraction: V1 - V2.
inline M3D_VECTOR M3D_V4Subtract(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR difference;
    for (int i = 0; i < 4; ++i)
        difference.v4f[i] = V1.v4f[i] - V2.v4f[i];
    return difference;
#else
    return _mm_sub_ps(V1, V2);
#endif
}
|
|
|
|
// Lane-wise multiplication: V1 * V2.
inline M3D_VECTOR M3D_V4Multiply(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR product;
    for (int i = 0; i < 4; ++i)
        product.v4f[i] = V1.v4f[i] * V2.v4f[i];
    return product;
#else
    return _mm_mul_ps(V1, V2);
#endif
}
|
|
|
|
// Lane-wise multiply-add: V1 * V2 + V3.
inline M3D_VECTOR M3D_V4MultiplyAdd(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECTOR V3) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR acc;
    for (int i = 0; i < 4; ++i)
        acc.v4f[i] = V1.v4f[i] * V2.v4f[i] + V3.v4f[i];
    return acc;
#else
    return M3D_FMADD_PS(V1, V2, V3);
#endif
}
|
|
|
|
// Lane-wise division: V1 / V2.
inline M3D_VECTOR M3D_V4Divide(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR quotient;
    for (int i = 0; i < 4; ++i)
        quotient.v4f[i] = V1.v4f[i] / V2.v4f[i];
    return quotient;
#else
    return _mm_div_ps(V1, V2);
#endif
}
|
|
|
|
// Computes V3 - (V1 * V2) per lane (an FNMADD, matching M3D_FNMADD_PS).
inline M3D_VECTOR M3D_V4NegativeMultiplySubtract(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECTOR V3) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 Result = {{{
        V3.v4f[0] - (V1.v4f[0] * V2.v4f[0]),
        V3.v4f[1] - (V1.v4f[1] * V2.v4f[1]),
        V3.v4f[2] - (V1.v4f[2] * V2.v4f[2]),
        V3.v4f[3] - (V1.v4f[3] * V2.v4f[3])
    }}};
    // FIX: return the vector member explicitly (was `return Result;`, which
    // returned the M3D_V4F32 wrapper and relied on an implicit conversion;
    // every sibling function in this file returns `.v`).
    return Result.v;
#else
    return M3D_FNMADD_PS(V1, V2, V3);
#endif
}
|
|
|
|
// Multiplies every lane of V by a scalar.
inline M3D_VECTOR M3D_V4Scale(M3D_VECTOR V, float scale) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR scaled;
    for (int i = 0; i < 4; ++i)
        scaled.v4f[i] = V.v4f[i] * scale;
    return scaled;
#else
    return _mm_mul_ps(_mm_set_ps1(scale), V);
#endif
}
|
|
|
|
// Bit-wise select: each result bit comes from V2 where the corresponding
// Control bit is 1, else from V1 (Control is typically an all-ones or
// all-zeros mask per lane).
inline M3D_VECTOR M3D_V4Select(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECTOR Control) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 picked = {{{
        (V1.v4u[0] & ~Control.v4u[0]) | (V2.v4u[0] & Control.v4u[0]),
        (V1.v4u[1] & ~Control.v4u[1]) | (V2.v4u[1] & Control.v4u[1]),
        (V1.v4u[2] & ~Control.v4u[2]) | (V2.v4u[2] & Control.v4u[2]),
        (V1.v4u[3] & ~Control.v4u[3]) | (V2.v4u[3] & Control.v4u[3]),
    }}};
    return picked.v;
#else
    M3D_VECTOR fromV1 = _mm_andnot_ps(Control, V1);
    M3D_VECTOR fromV2 = _mm_and_ps(V2, Control);
    return _mm_or_ps(fromV1, fromV2);
#endif
}
|
|
|
|
// Interleaves the low halves: returns {V1.x, V2.x, V1.y, V2.y}.
inline M3D_VECTOR M3D_V4MergeXY(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    // Done on the integer view so exact bit patterns (incl. NaNs) survive.
    M3D_V4U32 merged = {{{V1.v4u[0], V2.v4u[0], V1.v4u[1], V2.v4u[1]}}};
    return merged.v;
#else
    return _mm_unpacklo_ps(V1, V2);
#endif
}
|
|
|
|
// Interleaves the high halves: returns {V1.z, V2.z, V1.w, V2.w}.
inline M3D_VECTOR M3D_V4MergeZW(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    // Done on the integer view so exact bit patterns (incl. NaNs) survive.
    M3D_V4U32 merged = {{{V1.v4u[2], V2.v4u[2], V1.v4u[3], V2.v4u[3]}}};
    return merged.v;
#else
    return _mm_unpackhi_ps(V1, V2);
#endif
}
|
|
|
|
// Lane-wise square root of V.
inline M3D_VECTOR M3D_V4Sqrt(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR root;
    for (int i = 0; i < 4; ++i)
        root.v4f[i] = sqrtf(V.v4f[i]);
    return root;
#else
    return _mm_sqrt_ps(V);
#endif
}
|
|
|
|
// Wraps each lane of Angles into the range [-M3D_PI, M3D_PI) by subtracting
// the nearest whole multiple of 2*pi: angle - 2*pi * round(angle / (2*pi)).
inline M3D_VECTOR M3D_V4ModAngles(M3D_VECTOR Angles) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR cycles = M3D_V4Multiply(Angles, M3D_MReciprocalTwoPi.v);
    cycles = M3D_V4Round(cycles);
    return M3D_V4NegativeMultiplySubtract(M3D_MTwoPi.v, cycles, Angles);
#else
    M3D_VECTOR cycles = _mm_mul_ps(Angles, M3D_MReciprocalTwoPi);
    // The helper handles the SSE-level round-to-nearest-even details.
    cycles = M3D_V4Round(cycles);
    return M3D_FNMADD_PS(cycles, M3D_MTwoPi, Angles);
#endif
}
|
|
|
|
// 3D dot product of V1 and V2 (w lanes ignored); the scalar result is
// replicated into all four lanes of the returned vector.
inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    float fValue = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1] + V1.v4f[2] * V2.v4f[2];
    M3D_V4F32 vResult;
    vResult.f[0] =
    vResult.f[1] =
    vResult.f[2] =
    vResult.f[3] = fValue;
    return vResult.v;
#elif defined(SSE4_INTRINSICS)
    // Mask 0x7f: multiply lanes x/y/z, broadcast the sum to all four lanes.
    return _mm_dp_ps(V1, V2, 0x7f);
#elif defined(SSE3_INTRINSICS)
    M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
    // FIX: was g_XMMask3, a DirectXMath global that does not exist in this
    // library; use M3D_MMask3 (as the rest of the file does) to zero w.
    vTemp = _mm_and_ps(vTemp, M3D_MMask3);
    vTemp = _mm_hadd_ps(vTemp, vTemp);
    return _mm_hadd_ps(vTemp, vTemp);
#else
    // SSE2: multiply, then sum x+y+z with shuffles and scalar adds.
    M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
    // vTemp = {y, z, y, z}
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1));
    // lane 0 = x + y
    vDot = _mm_add_ss(vDot, vTemp);
    // lane 0 of vTemp = z
    vTemp = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
    // lane 0 = (x + y) + z
    vDot = _mm_add_ss(vDot, vTemp);
    // Splat the sum across all lanes
    return M3D_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}
|
|
|
|
// 3D cross product of V1 and V2 (w lanes ignored; result w is 0):
// [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ]

#ifdef DISABLE_INTRINSICS
inline M3D_VECTOR M3D_V3Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
    M3D_V4F32 vResult = {{{
        (V1.v4f[1] * V2.v4f[2]) - (V1.v4f[2] * V2.v4f[1]),
        (V1.v4f[2] * V2.v4f[0]) - (V1.v4f[0] * V2.v4f[2]),
        (V1.v4f[0] * V2.v4f[1]) - (V1.v4f[1] * V2.v4f[0]),
        0.0f
    }}};
    return vResult.v;
}
#else
inline M3D_VECTOR M3D_V3Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
    // vTemp1 = {y1, z1, x1, w1}
    M3D_VECTOR vTemp1 = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(3, 0, 2, 1));
    // vTemp2 = {z2, x2, y2, w2}
    M3D_VECTOR vTemp2 = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(3, 1, 0, 2));
    // Left products: {y1*z2, z1*x2, x1*y2, ...}
    M3D_VECTOR vResult = _mm_mul_ps(vTemp1, vTemp2);
    // vTemp1 -> {z1, x1, y1, w1}
    vTemp1 = M3D_PERMUTE_PS(vTemp1, _MM_SHUFFLE(3, 0, 2, 1));
    // vTemp2 -> {y2, z2, x2, w2}
    vTemp2 = M3D_PERMUTE_PS(vTemp2, _MM_SHUFFLE(3, 1, 0, 2));
    // Subtract the right products (FNMADD: vResult - vTemp1*vTemp2... the
    // macro computes vResult = -(vTemp1*vTemp2) + vResult)
    vResult = M3D_FNMADD_PS(vTemp1, vTemp2, vResult);
    // Clear the w lane
    return _mm_and_ps(vResult, M3D_MMask3);
}
#endif
|
|
|
|
// Squared 3D length of V (w lane ignored), splatted across all four lanes.
inline M3D_VECTOR M3D_V3LengthSq(M3D_VECTOR V) noexcept {
    return M3D_V3Dot(V, V);
}
|
|
|
|
// 3D length (magnitude) of V, splatted across all four lanes.
inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR Result;

    Result = M3D_V3LengthSq(V);
    Result = M3D_V4Sqrt(Result);

    return Result;
#elif defined(SSE4_INTRINSICS)
    M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
    return _mm_sqrt_ps(vTemp);
#elif defined(SSE3_INTRINSICS)
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    // FIX: was g_XMMask3, a DirectXMath global that does not exist in this
    // library; use M3D_MMask3 to zero the w lane before the horizontal adds.
    vLengthSq = _mm_and_ps(vLengthSq, M3D_MMask3);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
#else
    // SSE2: dot product of x/y/z via shuffles and scalar adds
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    // vTemp = {z, y, z, y}
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2));
    // lane 0 = x + z
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    // lane 0 of vTemp = y
    vTemp = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
    // lane 0 = x + z + y
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    // Splat the squared length, then take the root
    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
#endif
}
|
|
|
|
// Normalizes V by its 3D (x/y/z) length; all four lanes are divided by that
// length. A zero-length input yields zero lanes; in the SIMD paths an
// infinite length yields QNaN lanes.
inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR vResult = M3D_V3Length(V);
    float fLength = vResult.v4f[0];

    // Prevent divide by zero: a zero length leaves the scale factor at 0.
    if (fLength > 0) {
        fLength = 1.0f / fLength;
    }

    vResult.v4f[0] = V.v4f[0] * fLength;
    vResult.v4f[1] = V.v4f[1] * fLength;
    vResult.v4f[2] = V.v4f[2] * fLength;
    vResult.v4f[3] = V.v4f[3] * fLength;
    return vResult;

#elif defined(SSE4_INTRINSICS)
    M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
    // Prepare for the division
    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Test for a divide by zero (must be FP to detect -0.0)
    M3D_VECTOR vZeroMask = _mm_setzero_ps();
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // FIX: was g_XMInfinity / g_XMQNaN below -- DirectXMath globals that do
    // not exist in this library; use the M3D_* constants as the SSE2 path does.
    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Zero-length inputs become zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Infinite-length inputs become QNaN
    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#elif defined(SSE3_INTRINSICS)
    // Dot product of x/y/z via horizontal adds
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    // FIX: was g_XMMask3 -- use M3D_MMask3 to zero the w lane.
    vLengthSq = _mm_and_ps(vLengthSq, M3D_MMask3);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    // Prepare for the division
    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Test for a divide by zero (must be FP to detect -0.0)
    M3D_VECTOR vZeroMask = _mm_setzero_ps();
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // FIX: was g_XMInfinity / g_XMQNaN below -- use the M3D_* constants.
    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Zero-length inputs become zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Infinite-length inputs become QNaN
    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#else
    // SSE2: dot product of x/y/z via shuffles and scalar adds
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    vTemp = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    // Prepare for the division
    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Test for a divide by zero (must be FP to detect -0.0)
    M3D_VECTOR vZeroMask = _mm_setzero_ps();
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // If the length is infinity, the result lanes become QNaN below
    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Zero-length inputs become zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select QNaN or the result based on infinite length
    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#endif
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Returns the 4x4 identity matrix, built from the library's precomputed
// identity-row constants.
inline M3D_MATRIX M3D_MIdentity() noexcept {
    M3D_MATRIX ret;
    ret.rows[0] = M3D_MIdentityR0.v;
    ret.rows[1] = M3D_MIdentityR1.v;
    ret.rows[2] = M3D_MIdentityR2.v;
    ret.rows[3] = M3D_MIdentityR3.v;
    return ret;
}
|
|
|
|
inline M3D_MATRIX M3D_MMultiply(M3D_MATRIX M1, M3D_MATRIX& M2) noexcept {
    // Computes the row-major product M1 * M2.
    // Fix: the AVX (non-AVX2) path previously declared its splat registers as
    // XMVECTOR — a DirectXMath type that does not exist in this library — which
    // broke compilation under AVX_INTRINSICS. They are M3D_VECTOR here.
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    // Cache the invariants in registers
    float x = M1.mat[0][0];
    float y = M1.mat[0][1];
    float z = M1.mat[0][2];
    float w = M1.mat[0][3];
    // Perform the operation on the first row
    ret.mat[0][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w);
    ret.mat[0][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w);
    ret.mat[0][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w);
    ret.mat[0][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w);
    // Repeat for all the other rows
    x = M1.mat[1][0];
    y = M1.mat[1][1];
    z = M1.mat[1][2];
    w = M1.mat[1][3];
    ret.mat[1][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w);
    ret.mat[1][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w);
    ret.mat[1][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w);
    ret.mat[1][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w);
    x = M1.mat[2][0];
    y = M1.mat[2][1];
    z = M1.mat[2][2];
    w = M1.mat[2][3];
    ret.mat[2][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w);
    ret.mat[2][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w);
    ret.mat[2][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w);
    ret.mat[2][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w);
    x = M1.mat[3][0];
    y = M1.mat[3][1];
    z = M1.mat[3][2];
    w = M1.mat[3][3];
    ret.mat[3][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w);
    ret.mat[3][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w);
    ret.mat[3][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w);
    ret.mat[3][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w);
    return ret;
#elif defined(AVX2_INTRINSICS)
    // Process two rows of M1 per 256-bit register.
    __m256 t0 = _mm256_castps128_ps256(M1.rows[0]);
    t0 = _mm256_insertf128_ps(t0, M1.rows[1], 1);
    __m256 t1 = _mm256_castps128_ps256(M1.rows[2]);
    t1 = _mm256_insertf128_ps(t1, M1.rows[3], 1);

    __m256 u0 = _mm256_castps128_ps256(M2.rows[0]);
    u0 = _mm256_insertf128_ps(u0, M2.rows[1], 1);
    __m256 u1 = _mm256_castps128_ps256(M2.rows[2]);
    u1 = _mm256_insertf128_ps(u1, M2.rows[3], 1);

    // Splat column 0 of M1 and multiply by M2 row 0 (broadcast to both lanes).
    __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
    __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00);
    __m256 c0 = _mm256_mul_ps(a0, b0);
    __m256 c1 = _mm256_mul_ps(a1, b0);

    // Accumulate column 1 * M2 row 1.
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    b0 = _mm256_permute2f128_ps(u0, u0, 0x11);
    __m256 c2 = _mm256_fmadd_ps(a0, b0, c0);
    __m256 c3 = _mm256_fmadd_ps(a1, b0, c1);

    // Column 2 * M2 row 2 (kept in a separate accumulator pair).
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00);
    __m256 c4 = _mm256_mul_ps(a0, b1);
    __m256 c5 = _mm256_mul_ps(a1, b1);

    // Column 3 * M2 row 3.
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);

    // Combine the two accumulator pairs into the final rows.
    t0 = _mm256_add_ps(c2, c6);
    t1 = _mm256_add_ps(c3, c7);

    M3D_MATRIX ret;
    ret.rows[0] = _mm256_castps256_ps128(t0);
    ret.rows[1] = _mm256_extractf128_ps(t0, 1);
    ret.rows[2] = _mm256_castps256_ps128(t1);
    ret.rows[3] = _mm256_extractf128_ps(t1, 1);
    return ret;
#else
    M3D_MATRIX ret;
    // Splat the component X,Y,Z then W
#ifdef AVX_INTRINSICS
    M3D_VECTOR vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[0]) + 0);
    M3D_VECTOR vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[0]) + 1);
    M3D_VECTOR vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[0]) + 2);
    M3D_VECTOR vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[0]) + 3);
#else
    // Use vW to hold the original row
    M3D_VECTOR vW = M1.rows[0];
    M3D_VECTOR vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    M3D_VECTOR vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    M3D_VECTOR vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    // Perform the operation on the first row
    vX = _mm_mul_ps(vX, M2.rows[0]);
    vY = _mm_mul_ps(vY, M2.rows[1]);
    vZ = _mm_mul_ps(vZ, M2.rows[2]);
    vW = _mm_mul_ps(vW, M2.rows[3]);
    // Perform a binary add to reduce cumulative errors
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    ret.rows[0] = vX;
    // Repeat for the other 3 rows
#ifdef AVX_INTRINSICS
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[1]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[1]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[1]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[1]) + 3);
#else
    vW = M1.rows[1];
    vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.rows[0]);
    vY = _mm_mul_ps(vY, M2.rows[1]);
    vZ = _mm_mul_ps(vZ, M2.rows[2]);
    vW = _mm_mul_ps(vW, M2.rows[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    ret.rows[1] = vX;
#ifdef AVX_INTRINSICS
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[2]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[2]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[2]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[2]) + 3);
#else
    vW = M1.rows[2];
    vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.rows[0]);
    vY = _mm_mul_ps(vY, M2.rows[1]);
    vZ = _mm_mul_ps(vZ, M2.rows[2]);
    vW = _mm_mul_ps(vW, M2.rows[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    ret.rows[2] = vX;
#ifdef AVX_INTRINSICS
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[3]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[3]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[3]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[3]) + 3);
#else
    vW = M1.rows[3];
    vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.rows[0]);
    vY = _mm_mul_ps(vY, M2.rows[1]);
    vZ = _mm_mul_ps(vZ, M2.rows[2]);
    vW = _mm_mul_ps(vW, M2.rows[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    ret.rows[3] = vX;
    return ret;
#endif
}
|
|
|
|
// Returns the transpose of M. Three implementations: a merge-based scalar
// fallback, an AVX2 path working on two rows per 256-bit register, and an
// SSE shuffle path.
inline M3D_MATRIX M3D_MTranspose(M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    // Original matrix:
    //
    // m00m01m02m03
    // m10m11m12m13
    // m20m21m22m23
    // m30m31m32m33

    // First pass: interleave rows 0/2 and 1/3 pairwise.
    M3D_MATRIX P;
    P.rows[0] = M3D_V4MergeXY(M.rows[0], M.rows[2]); // m00m20m01m21
    P.rows[1] = M3D_V4MergeXY(M.rows[1], M.rows[3]); // m10m30m11m31
    P.rows[2] = M3D_V4MergeZW(M.rows[0], M.rows[2]); // m02m22m03m23
    P.rows[3] = M3D_V4MergeZW(M.rows[1], M.rows[3]); // m12m32m13m33

    // Second pass: interleave the intermediates to complete the transpose.
    M3D_MATRIX MT;
    MT.rows[0] = M3D_V4MergeXY(P.rows[0], P.rows[1]); // m00m10m20m30
    MT.rows[1] = M3D_V4MergeZW(P.rows[0], P.rows[1]); // m01m11m21m31
    MT.rows[2] = M3D_V4MergeXY(P.rows[2], P.rows[3]); // m02m12m22m32
    MT.rows[3] = M3D_V4MergeZW(P.rows[2], P.rows[3]); // m03m13m23m33
    return MT;
#elif defined(AVX2_INTRINSICS)
    // Pack rows 0..3 into two 256-bit registers (two rows per register).
    __m256 t0 = _mm256_castps128_ps256(M.rows[0]);
    t0 = _mm256_insertf128_ps(t0, M.rows[1], 1);
    __m256 t1 = _mm256_castps128_ps256(M.rows[2]);
    t1 = _mm256_insertf128_ps(t1, M.rows[3], 1);

    // Two rounds of unpack + cross-lane permute perform the 4x4 transpose.
    __m256 vTemp = _mm256_unpacklo_ps(t0, t1);
    __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1);
    __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
    __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
    vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4);
    vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4);
    t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
    t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);

    // Unpack the two 256-bit registers back into four rows.
    M3D_MATRIX ret;
    ret.rows[0] = _mm256_castps256_ps128(t0);
    ret.rows[1] = _mm256_extractf128_ps(t0, 1);
    ret.rows[2] = _mm256_castps256_ps128(t1);
    ret.rows[3] = _mm256_extractf128_ps(t1, 1);
    return ret;
#else
    // SSE path: classic 4x4 transpose via two rounds of shuffles.
    // x.x,x.y,y.x,y.y
    M3D_VECTOR vTemp1 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(1, 0, 1, 0));
    // x.z,x.w,y.z,y.w
    M3D_VECTOR vTemp3 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(3, 2, 3, 2));
    // z.x,z.y,w.x,w.y
    M3D_VECTOR vTemp2 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(1, 0, 1, 0));
    // z.z,z.w,w.z,w.w
    M3D_VECTOR vTemp4 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(3, 2, 3, 2));

    M3D_MATRIX ret;
    // x.x,y.x,z.x,w.x
    ret.rows[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
    // x.y,y.y,z.y,w.y
    ret.rows[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
    // x.z,y.z,z.z,w.z
    ret.rows[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
    // x.w,y.w,z.w,w.w
    ret.rows[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
    return ret;
#endif
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
inline M3D_VECTOR M3D_V3Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept {
    // Transforms the 3D point V by M as (x, y, z, 1):
    // result = x*M[0] + y*M[1] + z*M[2] + M[3].
#ifdef DISABLE_INTRINSICS
    // Scalar fallback: splat each component and accumulate row products
    // (z first, then y, then x, matching the SIMD path's rounding order).
    M3D_VECTOR res = M3D_V4MultiplyAdd(M3D_V4SplatZ(V), M.rows[2], M.rows[3]);
    res = M3D_V4MultiplyAdd(M3D_V4SplatY(V), M.rows[1], res);
    res = M3D_V4MultiplyAdd(M3D_V4SplatX(V), M.rows[0], res);
    return res;
#else
    // SSE path: broadcast each lane via permute and fused-multiply-add
    // against the corresponding matrix row.
    M3D_VECTOR res = M3D_FMADD_PS(M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)), M.rows[2], M.rows[3]); // Z
    res = M3D_FMADD_PS(M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)), M.rows[1], res);                  // Y
    res = M3D_FMADD_PS(M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)), M.rows[0], res);                  // X
    return res;
#endif
}
|
|
|
|
// Batch-transforms VectorCount 3D points (treated as (x, y, z, 1)) by M,
// writing full 4-component results to pOutputStream. Input and output are
// walked as raw byte streams so arbitrary strides are supported.
// @param pOutputStream  destination array of M3D_F4 (stride OutputStride bytes)
// @param pInputStream   source array of M3D_F3 (stride InputStride bytes)
// @param VectorCount    number of points to transform
inline void M3D_V3Transform(
    M3D_F4* pOutputStream,
    size_t OutputStride,
    const M3D_F3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    M3D_MATRIX M
) noexcept {
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    // Hoist the matrix rows out of the loops.
    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

#ifdef DISABLE_INTRINSICS
    // Scalar fallback: one point per iteration.
    for (size_t i = 0; i < VectorCount; i++) {
        M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
        M3D_VECTOR Z = M3D_V4SplatZ(V);
        M3D_VECTOR Y = M3D_V4SplatY(V);
        M3D_VECTOR X = M3D_V4SplatX(V);

        // result = x*row0 + y*row1 + z*row2 + row3
        M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, row2, row3);
        Result = M3D_V4MultiplyAdd(Y, row1, Result);
        Result = M3D_V4MultiplyAdd(X, row0, Result);

        M3D_V4StoreF4(reinterpret_cast<M3D_F4*>(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
#else
    size_t i = 0;
    size_t four = VectorCount >> 2; // number of full groups of 4 points
    if (four > 0) {
        // The 4-at-a-time path requires tightly packed input (stride == sizeof(M3D_F3)).
        if (InputStride == sizeof(M3D_F3)) {
            if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF)) {
                // Packed input, aligned output: use streaming stores
                // (paired with the M3D_SFENCE() at the end).
                for (size_t j = 0; j < four; ++j) {
                    // Load 12 packed floats (4 x F3) as three raw XMM registers.
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(M3D_F3) * 4;

                    // Unpack the 4 vectors (.w components are junk).
                    // NOTE: the macro introduces V2, V3 and V4 used below.
                    M3D_UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                    M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            } else {
                // Packed input, unaligned output: identical math, plain
                // unaligned stores instead of streaming stores.
                for (size_t j = 0; j < four; ++j)
                {
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(M3D_F3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    M3D_UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                    M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    // Remainder loop (also handles the non-packed-input case entirely).
    if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF)) {
        // Aligned output
        for (; i < VectorCount; ++i) {
            M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
            pInputVector += InputStride;

            M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
            M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

            M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
            M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
            M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);
            vTemp = _mm_add_ps(vTemp, vTemp3);

            M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
            pOutputVector += OutputStride;
        }
    } else {
        // Unaligned output
        for (; i < VectorCount; ++i)
        {
            M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
            pInputVector += InputStride;

            M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
            M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

            M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
            M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
            M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);
            vTemp = _mm_add_ps(vTemp, vTemp3);

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
            pOutputVector += OutputStride;
        }
    }

    // Flush any streaming (non-temporal) stores issued above.
    M3D_SFENCE();
#endif
}
|
|
|
|
inline M3D_VECTOR M3D_V3TransformPersDiv(M3D_VECTOR V, M3D_MATRIX M) noexcept {
    // Transform (x, y, z, 1) by M, then divide every component by the
    // resulting w (perspective divide).
    M3D_VECTOR res = M3D_V4MultiplyAdd(M3D_V4SplatZ(V), M.rows[2], M.rows[3]);
    res = M3D_V4MultiplyAdd(M3D_V4SplatY(V), M.rows[1], res);
    res = M3D_V4MultiplyAdd(M3D_V4SplatX(V), M.rows[0], res);
    return M3D_V4Divide(res, M3D_V4SplatW(res));
}
|
|
|
|
// Batch version of the perspective-divide transform: each 3D input point is
// transformed by M as (x, y, z, 1), divided by its resulting w, and stored
// back as a 3-component result. Strides are in bytes.
// @param pOutputStream  destination array of M3D_F3 (stride OutputStride bytes)
// @param pInputStream   source array of M3D_F3 (stride InputStride bytes)
// @param VectorCount    number of points to transform
inline void M3D_V3TransformPersDiv(
    M3D_F3* pOutputStream,
    size_t OutputStride,
    const M3D_F3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    M3D_MATRIX M
) noexcept {
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    // Hoist the matrix rows out of the loops.
    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

#ifdef DISABLE_INTRINSICS
    // Scalar fallback: one point per iteration.
    for (size_t i = 0; i < VectorCount; i++)
    {
        M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
        M3D_VECTOR Z = M3D_V4SplatZ(V);
        M3D_VECTOR Y = M3D_V4SplatY(V);
        M3D_VECTOR X = M3D_V4SplatX(V);

        // result = x*row0 + y*row1 + z*row2 + row3
        M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, row2, row3);
        Result = M3D_V4MultiplyAdd(Y, row1, Result);
        Result = M3D_V4MultiplyAdd(X, row0, Result);

        // Perspective divide by the transformed w.
        M3D_VECTOR W = M3D_V4SplatW(Result);

        Result = M3D_V4Divide(Result, W);

        M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
#else
    size_t i = 0;
    size_t four = VectorCount >> 2; // number of full groups of 4 points
    if (four > 0) {
        // The 4-at-a-time paths require tightly packed input.
        if (InputStride == sizeof(M3D_F3)) {
            if (OutputStride == sizeof(M3D_F3)) {
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF)) {
                    // Packed input, aligned & packed output: repack 4 results
                    // into 3 registers and use streaming stores.
                    for (size_t j = 0; j < four; ++j) {
                        // Load 12 packed floats (4 x F3) as three raw XMM registers.
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(M3D_F3) * 4;

                        // Unpack the 4 vectors (.w components are junk).
                        // NOTE: the macro introduces V2, V3 and V4 used below.
                        M3D_UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                        M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                        M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V2 = _mm_div_ps(vTemp, W);

                        // Result 3
                        Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V3 = _mm_div_ps(vTemp, W);

                        // Result 4
                        Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V4 = _mm_div_ps(vTemp, W);

                        // Pack and store the vectors
                        // (M3D_PACK4INTO3 repacks V1..V4 into V1/vTemp/V3).
                        M3D_PACK4INTO3(vTemp);
                        M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), V1);
                        M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(M3D_F3) * 4;
                        i += 4;
                    }
                } else {
                    // Packed input, unaligned & packed output: same repacking,
                    // plain unaligned stores.
                    for (size_t j = 0; j < four; ++j) {
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(M3D_F3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        M3D_UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                        M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                        M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V2 = _mm_div_ps(vTemp, W);

                        // Result 3
                        Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V3 = _mm_div_ps(vTemp, W);

                        // Result 4
                        Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V4 = _mm_div_ps(vTemp, W);

                        // Pack and store the vectors
                        M3D_PACK4INTO3(vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), V1);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(M3D_F3) * 4;
                        i += 4;
                    }
                }
            } else {
                // Packed input, unpacked output: store each result via the
                // strided F3 store helper instead of repacking.
                for (size_t j = 0; j < four; ++j)
                {
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(M3D_F3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    M3D_UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                    M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    // Remainder loop (also handles the non-packed-input case entirely).
    for (; i < VectorCount; i++) {
        M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
        pInputVector += InputStride;

        M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
        M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
        M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

        M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
        M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
        M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
        vTemp = _mm_add_ps(vTemp, vTemp2);
        vTemp = _mm_add_ps(vTemp, vTemp3);

        M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

        vTemp = _mm_div_ps(vTemp, W);

        M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
        pOutputVector += OutputStride;
    }

    // Flush any streaming (non-temporal) stores issued above.
    M3D_SFENCE();
#endif
}
|
|
|
|
inline M3D_VECTOR M3D_V3TransformNDCToViewport(M3D_VECTOR V, float vpX, float vpY, float vpW, float vpH, float vpMinZ, float vpMaxZ) noexcept {
    // Map NDC coordinates into viewport/window coordinates with a single
    // scale-and-offset: x,y scaled by half extents (y flipped), z remapped
    // from [0,1] NDC depth into [vpMinZ, vpMaxZ].
    const float halfW = vpW * 0.5f;
    const float halfH = vpH * 0.5f;

    M3D_VECTOR scale = M3D_V4Set(halfW, -halfH, vpMaxZ - vpMinZ, 0.0f);
    M3D_VECTOR offset = M3D_V4Set(vpX + halfW, vpY + halfH, vpMinZ, 0.0f);

    return M3D_V4MultiplyAdd(V, scale, offset);
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixCamLookAtLH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
    // Left-handed look-at: the view direction runs from the eye toward the focus point.
    return M3D_TransformMatrixCamLookToLH(viewPos, M3D_V4Subtract(focusPos, viewPos), upDirection);
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixCamLookAtRH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
    // Right-handed look-at: reuse the LH builder with the view direction reversed
    // (eye minus focus).
    return M3D_TransformMatrixCamLookToLH(viewPos, M3D_V4Subtract(viewPos, focusPos), upDirection);
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixCamLookToLH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
    // Build an orthonormal left-handed camera basis from the view direction.
    M3D_VECTOR axisZ = M3D_V3Normalize(viewDirection);                        // forward
    M3D_VECTOR axisX = M3D_V3Normalize(M3D_V3Cross(upDirection, axisZ));      // right

    // axisZ and axisX are orthonormal, so their cross product needs no normalize.
    M3D_VECTOR axisY = M3D_V3Cross(axisZ, axisX);                             // true up

    M3D_VECTOR negEye = M3D_V4Negate(viewPos);

    // Translation terms: -dot(axis, eye) for each basis axis.
    M3D_VECTOR transX = M3D_V3Dot(axisX, negEye);
    M3D_VECTOR transY = M3D_V3Dot(axisY, negEye);
    M3D_VECTOR transZ = M3D_V3Dot(axisZ, negEye);

    // Assemble rows as (axis.xyz | translation), then transpose so the axes
    // become the columns of the final view matrix.
    M3D_MATRIX view;
    view.rows[0] = M3D_V4Select(transX, axisX, M3D_MSelect1110.v);
    view.rows[1] = M3D_V4Select(transY, axisY, M3D_MSelect1110.v);
    view.rows[2] = M3D_V4Select(transZ, axisZ, M3D_MSelect1110.v);
    view.rows[3] = M3D_MIdentityR3.v;

    return M3D_MTranspose(view);
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixCamLookToRH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
    // A right-handed look-to is the left-handed one along the negated direction.
    return M3D_TransformMatrixCamLookToLH(viewPos, M3D_V4Negate(viewDirection), upDirection);
}
|
|
|
|
// Left-handed perspective projection matrix from a vertical field of view.
// @param fov    full vertical field of view, in radians
// @param ratio  aspect ratio (width / height)
// @param near   view-space distance to the near clip plane
// @param far    view-space distance to the far clip plane
// NOTE(review): assumes 0 < near < far and fov in (0, pi) — no validation here.
inline M3D_MATRIX M3D_TransformMatrixFrustrumFovLH(float fov, float ratio, float near, float far) noexcept {
    float SinFov;
    float CosFov;
    M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov);
    // Depth remap factor for LH z in [0,1].
    float fRange = far / (far - near);
    // cot(fov/2): vertical focal scale.
    float Height = CosFov / SinFov;
    float Width = Height / ratio;

#ifdef DISABLE_INTRINSICS
    // Scalar path: write the sparse matrix element by element.
    M3D_MATRIX ret;
    ret.mat[0][0] = Width;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = Height;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = fRange;
    ret.mat[2][3] = 1.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = -fRange * near;
    ret.mat[3][3] = 0.0f;
    return ret;
#else
    // SSE path: gather the four non-trivial scalars in one register, then
    // distribute them into the sparse rows with moves/masks/shuffles.
    M3D_VECTOR rMem = {
        Width,
        Height,
        fRange,
        -fRange * near
    };

    // Copy from memory to SSE register
    M3D_VECTOR vValues = rMem;

    M3D_MATRIX ret;
    M3D_VECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_move_ss(vTemp, vValues);
    ret.rows[0] = vTemp; // Width, 0, 0, 0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, M3D_MMaskY);
    ret.rows[1] = vTemp; // 0, Height, 0, 0
    vTemp = _mm_setzero_ps();
    // Combine (fRange, -fRange*near) with identity row 3's (z, w) lanes.
    // NOTE(review): M3D_MIdentityR3 is used here without .v (unlike elsewhere)
    // — presumably the wrapper converts implicitly to the raw vector; confirm.
    vValues = _mm_shuffle_ps(vValues, M3D_MIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
    ret.rows[2] = vTemp; // 0, 0, fRange, 1.0f
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
    ret.rows[3] = vTemp; // 0, 0, -fRange * near, 0.0f
    return ret;
#endif
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixFrustrumFovRH(float fov, float ratio, float near, float far) noexcept {
    // Right-handed perspective projection matrix from a vertical field of view.
    //   fov   - full vertical field of view, in radians
    //   ratio - viewport aspect ratio (width / height)
    //   near  - near clip plane distance (must not equal 'far')
    //   far   - far clip plane distance
    // View-space z in [-near, -far] maps to clip-space z/w in [0, 1].
    float SinFov;
    float CosFov;
    M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov);

    float fRange = far / (near - far); // negative: flips z for the RH convention
    float Height = CosFov / SinFov;    // cot(fov/2): y scale
    float Width = Height / ratio;      // x scale

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = Width;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = Height;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = fRange;
    ret.mat[2][3] = -1.0f; // w' = -z: perspective divide (camera looks down -z)

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = fRange * near; // shifts z so the near plane lands on 0
    ret.mat[3][3] = 0.0f;
    return ret;
#else
    // Pack the four non-trivial scalars and scatter them into the rows.
    M3D_VECTOR rMem = {
        Width,
        Height,
        fRange,
        fRange * near
    };

    // Copy from memory to SSE register
    M3D_VECTOR vValues = rMem;

    M3D_MATRIX ret;
    M3D_VECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_move_ss(vTemp, vValues);
    ret.rows[0] = vTemp; // Width, 0, 0, 0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, M3D_MMaskY);
    ret.rows[1] = vTemp; // 0, Height, 0, 0
    vTemp = _mm_setzero_ps();
    // vValues becomes (fRange, fRange*near, 0, -1) — a shuffle source for rows 2 and 3
    vValues = _mm_shuffle_ps(vValues, M3D_MIdentityR3_n, _MM_SHUFFLE(3, 2, 3, 2));
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
    ret.rows[2] = vTemp; // 0, 0, fRange, -1.0f
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
    ret.rows[3] = vTemp; // 0, 0, fRange * near, 0.0f
    return ret;
#endif
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixTranslate(M3D_VECTOR Offset) noexcept {
    // Translation matrix: identity with Offset.xyz placed in the last row
    // (row-vector convention). Offset.w is ignored.
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    // Start from the identity...
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = (r == c) ? 1.0f : 0.0f;
        }
    }
    // ...then drop the offset into row 3 (mat[3][3] stays 1).
    ret.mat[3][0] = Offset.v4f[0];
    ret.mat[3][1] = Offset.v4f[1];
    ret.mat[3][2] = Offset.v4f[2];
    return ret;
#else
    M3D_MATRIX ret;
    ret.rows[0] = M3D_MIdentityR0.v;
    ret.rows[1] = M3D_MIdentityR1.v;
    ret.rows[2] = M3D_MIdentityR2.v;
    // xyz from Offset, w = 1 taken from the identity row.
    ret.rows[3] = M3D_V4Select(M3D_MIdentityR3.v, Offset, M3D_MSelect1110.v);
    return ret;
#endif
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixScale(float ScaleX, float ScaleY, float ScaleZ) noexcept {
    // Diagonal scaling matrix diag(ScaleX, ScaleY, ScaleZ, 1).
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    // Zero everything, then fill in the diagonal.
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = 0.0f;
        }
    }
    ret.mat[0][0] = ScaleX;
    ret.mat[1][1] = ScaleY;
    ret.mat[2][2] = ScaleZ;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_MATRIX ret;
    // _mm_setr_ps lists lanes in memory order (x, y, z, w).
    ret.rows[0] = _mm_setr_ps(ScaleX, 0, 0, 0);
    ret.rows[1] = _mm_setr_ps(0, ScaleY, 0, 0);
    ret.rows[2] = _mm_setr_ps(0, 0, ScaleZ, 0);
    ret.rows[3] = M3D_MIdentityR3.v;
    return ret;
#endif
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixScale(M3D_VECTOR Scale) noexcept {
    // Diagonal scaling matrix built from Scale.xyz; Scale.w is ignored.
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    // Zero everything, then fill in the diagonal from the vector lanes.
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = 0.0f;
        }
    }
    ret.mat[0][0] = Scale.v4f[0];
    ret.mat[1][1] = Scale.v4f[1];
    ret.mat[2][2] = Scale.v4f[2];
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    // Masking keeps exactly one lane of Scale per row, zeroing the rest —
    // no shuffles needed.
    M3D_MATRIX ret;
    ret.rows[0] = _mm_and_ps(Scale, M3D_MMaskX);
    ret.rows[1] = _mm_and_ps(Scale, M3D_MMaskY);
    ret.rows[2] = _mm_and_ps(Scale, M3D_MMaskZ);
    ret.rows[3] = M3D_MIdentityR3.v;
    return ret;
#endif
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixTranslate(float OffsetX, float OffsetY, float OffsetZ) noexcept {
    // Translation matrix: identity with (OffsetX, OffsetY, OffsetZ) in row 3
    // (row-vector convention).
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    // Start from the identity...
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = (r == c) ? 1.0f : 0.0f;
        }
    }
    // ...then drop the offset into row 3 (mat[3][3] stays 1).
    ret.mat[3][0] = OffsetX;
    ret.mat[3][1] = OffsetY;
    ret.mat[3][2] = OffsetZ;
    return ret;
#else
    M3D_MATRIX ret;
    ret.rows[0] = M3D_MIdentityR0.v;
    ret.rows[1] = M3D_MIdentityR1.v;
    ret.rows[2] = M3D_MIdentityR2.v;
    ret.rows[3] = M3D_V4Set(OffsetX, OffsetY, OffsetZ, 1.f);
    return ret;
#endif
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixRotationX(float Angle) noexcept {
    // Rotation about the x axis by 'Angle' radians.
    float SinAngle;
    float CosAngle;
    M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);

#ifdef DISABLE_INTRINSICS
    // Row-vector form:
    //   | 1    0    0   0 |
    //   | 0  cos  sin   0 |
    //   | 0 -sin  cos   0 |
    //   | 0    0    0   1 |
    M3D_MATRIX ret;
    ret.mat[0][0] = 1.0f;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = CosAngle;
    ret.mat[1][2] = SinAngle;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = -SinAngle;
    ret.mat[2][2] = CosAngle;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    // Build row 1, then derive row 2 from it by a lane swap plus a y negate.
    M3D_VECTOR vSin = _mm_set_ss(SinAngle);
    M3D_VECTOR vCos = _mm_set_ss(CosAngle);
    // x = 0,y = cos,z = sin, w = 0
    vCos = _mm_shuffle_ps(vCos, vSin, _MM_SHUFFLE(3, 0, 0, 3));
    M3D_MATRIX ret;
    ret.rows[0] = M3D_MIdentityR0;
    ret.rows[1] = vCos;
    // x = 0,y = sin,z = cos, w = 0
    vCos = M3D_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 1, 2, 0));
    // x = 0,y = -sin,z = cos, w = 0
    vCos = _mm_mul_ps(vCos, M3D_MNegateY);
    ret.rows[2] = vCos;
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixRotationY(float Angle) noexcept {
    // Rotation about the y axis by 'Angle' radians.
    float SinAngle;
    float CosAngle;
    M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);

#ifdef DISABLE_INTRINSICS
    // Row-vector form:
    //   | cos  0 -sin  0 |
    //   |   0  1    0  0 |
    //   | sin  0  cos  0 |
    //   |   0  0    0  1 |
    M3D_MATRIX ret;
    ret.mat[0][0] = CosAngle;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = -SinAngle;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = 1.0f;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = SinAngle;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = CosAngle;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    // Build row 2 first, then derive row 0 from it by a lane swap plus a z negate.
    M3D_VECTOR vSin = _mm_set_ss(SinAngle);
    M3D_VECTOR vCos = _mm_set_ss(CosAngle);
    // x = sin,y = 0,z = cos, w = 0
    vSin = _mm_shuffle_ps(vSin, vCos, _MM_SHUFFLE(3, 0, 3, 0));
    M3D_MATRIX ret;
    ret.rows[2] = vSin;
    ret.rows[1] = M3D_MIdentityR1;
    // x = cos,y = 0,z = sin, w = 0
    vSin = M3D_PERMUTE_PS(vSin, _MM_SHUFFLE(3, 0, 1, 2));
    // x = cos,y = 0,z = -sin, w = 0
    vSin = _mm_mul_ps(vSin, M3D_MNegateZ);
    ret.rows[0] = vSin;
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixRotationZ(float Angle) noexcept {
    // Rotation about the z axis by 'Angle' radians.
    float SinAngle;
    float CosAngle;
    M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);

#ifdef DISABLE_INTRINSICS
    // Row-vector form:
    //   |  cos sin  0  0 |
    //   | -sin cos  0  0 |
    //   |    0   0  1  0 |
    //   |    0   0  0  1 |
    M3D_MATRIX ret;
    ret.mat[0][0] = CosAngle;
    ret.mat[0][1] = SinAngle;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = -SinAngle;
    ret.mat[1][1] = CosAngle;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = 1.0f;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    // Build row 0, then derive row 1 from it by a lane swap plus an x negate.
    M3D_VECTOR vSin = _mm_set_ss(SinAngle);
    M3D_VECTOR vCos = _mm_set_ss(CosAngle);
    // x = cos,y = sin,z = 0, w = 0
    vCos = _mm_unpacklo_ps(vCos, vSin);
    M3D_MATRIX ret;
    ret.rows[0] = vCos;
    // x = sin,y = cos,z = 0, w = 0
    vCos = M3D_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 2, 0, 1));
    // x = cos,y = -sin,z = 0, w = 0
    vCos = _mm_mul_ps(vCos, M3D_MNegateX);
    ret.rows[1] = vCos;
    ret.rows[2] = M3D_MIdentityR2;
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}
|
|
|
|
inline M3D_MATRIX M3D_TransformMatrixRotation(M3D_VECTOR Angles) noexcept {
    // Rotation matrix from three Euler angles packed in a vector:
    // Angles.x = pitch, Angles.y = yaw, Angles.z = roll (radians, per the
    // scalar path below); Angles.w is ignored.
#ifdef DISABLE_INTRINSICS
    float cp = cosf(Angles.v4f[0]); // pitch
    float sp = sinf(Angles.v4f[0]);

    float cy = cosf(Angles.v4f[1]); // yaw
    float sy = sinf(Angles.v4f[1]);

    float cr = cosf(Angles.v4f[2]); // roll
    float sr = sinf(Angles.v4f[2]);

    // Expanded product of the individual axis rotations.
    M3D_MATRIX ret;
    ret.mat[0][0] = cr * cy + sr * sp * sy;
    ret.mat[0][1] = sr * cp;
    ret.mat[0][2] = sr * sp * cy - cr * sy;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = cr * sp * sy - sr * cy;
    ret.mat[1][1] = cr * cp;
    ret.mat[1][2] = sr * sy + cr * sp * cy;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = cp * sy;
    ret.mat[2][1] = -sp;
    ret.mat[2][2] = cp * cy;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    // Sign pattern applied to the "single-factor" terms of each row.
    static const M3D_V4F32 Sign = {{{1.0f, -1.0f, -1.0f, 1.0f}}};

    // All six sines/cosines in two vectors (lane i = angle i).
    M3D_VECTOR SinAngles, CosAngles;
    M3D_V4SinCos(&SinAngles, &CosAngles, Angles);

    // Gather the sin/cos factors so each product below evaluates four of the
    // scalar-path terms at once; lane bookkeeping mirrors the formulas above.
    M3D_VECTOR P0 = M3D_V4Permute<M3D_PERMUTE_1X, M3D_PERMUTE_0Z, M3D_PERMUTE_1Z, M3D_PERMUTE_1X>(SinAngles, CosAngles);
    M3D_VECTOR Y0 = M3D_V4Permute<M3D_PERMUTE_0Y, M3D_PERMUTE_1X, M3D_PERMUTE_1X, M3D_PERMUTE_1Y>(SinAngles, CosAngles);
    M3D_VECTOR P1 = M3D_V4Permute<M3D_PERMUTE_1Z, M3D_PERMUTE_0Z, M3D_PERMUTE_1Z, M3D_PERMUTE_0Z>(SinAngles, CosAngles);
    M3D_VECTOR Y1 = M3D_V4Permute<M3D_PERMUTE_1Y, M3D_PERMUTE_1Y, M3D_PERMUTE_0Y, M3D_PERMUTE_0Y>(SinAngles, CosAngles);
    M3D_VECTOR P2 = M3D_V4Permute<M3D_PERMUTE_0Z, M3D_PERMUTE_1Z, M3D_PERMUTE_0Z, M3D_PERMUTE_1Z>(SinAngles, CosAngles);
    M3D_VECTOR P3 = M3D_V4Permute<M3D_PERMUTE_0Y, M3D_PERMUTE_0Y, M3D_PERMUTE_1Y, M3D_PERMUTE_1Y>(SinAngles, CosAngles);
    M3D_VECTOR Y2 = M3D_V4SplatX(SinAngles);
    M3D_VECTOR NS = M3D_V4Negate(SinAngles);

    // Q0: two-factor terms; Q2: signed three-factor terms accumulated onto Q1.
    M3D_VECTOR Q0 = M3D_V4Multiply(P0, Y0);
    M3D_VECTOR Q1 = M3D_V4Multiply(P1, Sign.v);
    Q1 = M3D_V4Multiply(Q1, Y1);
    M3D_VECTOR Q2 = M3D_V4Multiply(P2, Y2);
    Q2 = M3D_V4MultiplyAdd(Q2, P3, Q1);

    // Scatter the accumulated terms into the three rotation rows.
    M3D_VECTOR V0 = M3D_V4Permute<M3D_PERMUTE_1X, M3D_PERMUTE_0Y, M3D_PERMUTE_1Z, M3D_PERMUTE_0W>(Q0, Q2);
    M3D_VECTOR V1 = M3D_V4Permute<M3D_PERMUTE_1Y, M3D_PERMUTE_0Z, M3D_PERMUTE_1W, M3D_PERMUTE_0W>(Q0, Q2);
    M3D_VECTOR V2 = M3D_V4Permute<M3D_PERMUTE_0X, M3D_PERMUTE_1X, M3D_PERMUTE_0W, M3D_PERMUTE_0W>(Q0, NS);

    // Zero the w lane of each rotation row; last row is (0,0,0,1).
    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Select(M3D_MZero, V0, M3D_MSelect1110.v);
    ret.rows[1] = M3D_V4Select(M3D_MZero, V1, M3D_MSelect1110.v);
    ret.rows[2] = M3D_V4Select(M3D_MZero, V2, M3D_MSelect1110.v);
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}
|
|
|
|
// TODO: the viewport transform below is incomplete — depth (minZ / maxZ) is
// not remapped; z passes through unchanged. The full mapping would be
// z_window = ((far+near)/2) + ((far-near)/2) * z_ndc, e.g.:
//   v_tri[v_cnt].position.z = ((far+near)/2)+((far-near)/2)*_2dCoord.z;
|
inline M3D_MATRIX M3D_TransformMatrixViewport(float _w, float _h, float _wOffset, float _hOffset) noexcept {
    // NDC -> window transform: scale x by w/2, flip and scale y by h/2, then
    // translate to the viewport origin. Depth is passed through unchanged
    // (minZ and maxZ-minZ are not applied).
    const float halfW = _w / 2;
    const float halfH = _h / 2;

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    // Zero everything, then fill in the few non-zero entries.
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = 0.0f;
        }
    }
    ret.mat[0][0] = halfW;
    ret.mat[1][1] = -halfH;           // window y grows downward
    ret.mat[2][2] = 1.0f;             // maxZ-minZ ignored (mat[2][3] stays 0: minZ ignored)
    ret.mat[3][0] = _wOffset + halfW;
    ret.mat[3][1] = _hOffset + halfH;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Set(halfW, 0, 0, 0);
    ret.rows[1] = M3D_V4Set(0, -halfH, 0, 0); // window y grows downward
    ret.rows[2] = M3D_MIdentityR2.v;          // maxZ-minZ and minZ are ignored
    ret.rows[3] = M3D_V4Set(_wOffset + halfW, _hOffset + halfH, 0, 1);
    return ret;
#endif
}
|
|
|
|
inline void M3D_V4SinCos(M3D_VECTOR* pSin, M3D_VECTOR* pCos, M3D_VECTOR V) noexcept {
    // Per-lane sine and cosine of the four angles (radians) in V.
    // The SSE path mirrors M3D_ScalarSinCos: reduce each lane to [-pi, pi],
    // fold into [-pi/2, pi/2], then evaluate 11-degree (sin) / 10-degree (cos)
    // minimax polynomials, fixing the cosine sign for folded lanes.
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 Sin = { { {
        sinf(V.v4f[0]),
        sinf(V.v4f[1]),
        sinf(V.v4f[2]),
        sinf(V.v4f[3])
    } } };

    M3D_V4F32 Cos = { { {
        cosf(V.v4f[0]),
        cosf(V.v4f[1]),
        cosf(V.v4f[2]),
        cosf(V.v4f[3])
    } } };

    *pSin = Sin.v;
    *pCos = Cos.v;
#else
    // Force the value within the bounds of pi
    M3D_VECTOR x = M3D_V4ModAngles(V);

    // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
    M3D_VECTOR sign = _mm_and_ps(x, M3D_MNegativeZero);
    __m128 c = _mm_or_ps(M3D_MPi, sign); // pi when x >= 0, -pi when x < 0
    __m128 absx = _mm_andnot_ps(sign, x); // |x|
    __m128 rflx = _mm_sub_ps(c, x); // reflected angle for lanes outside [-pi/2, pi/2]
    __m128 comp = _mm_cmple_ps(absx, M3D_MHalfPi); // mask: lane already in range
    __m128 select0 = _mm_and_ps(comp, x);
    __m128 select1 = _mm_andnot_ps(comp, rflx);
    x = _mm_or_ps(select0, select1);
    // Cosine changes sign for the reflected lanes.
    select0 = _mm_and_ps(comp, M3D_MOne);
    select1 = _mm_andnot_ps(comp, M3D_MNegativeOne);
    sign = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation of sine (Horner evaluation in x^2,
    // coefficients broadcast one at a time from the constant vectors).
    const M3D_VECTOR SC1 = M3D_MSinCoeff1;
    __m128 vConstantsB = M3D_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0));
    const M3D_VECTOR SC0 = M3D_MSinCoeff0;
    __m128 vConstants = M3D_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 Result = M3D_FMADD_PS(vConstantsB, x2, vConstants);

    vConstants = M3D_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2));
    Result = M3D_FMADD_PS(Result, x2, vConstants);

    vConstants = M3D_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1));
    Result = M3D_FMADD_PS(Result, x2, vConstants);

    vConstants = M3D_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0));
    Result = M3D_FMADD_PS(Result, x2, vConstants);

    Result = M3D_FMADD_PS(Result, x2, M3D_MOne);
    Result = _mm_mul_ps(Result, x); // odd polynomial: final multiply by x
    *pSin = Result;

    // Compute polynomial approximation of cosine (same scheme, even polynomial).
    const M3D_VECTOR CC1 = M3D_MCosCoeff1;
    vConstantsB = M3D_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0));
    const M3D_VECTOR CC0 = M3D_MCosCoeff0;
    vConstants = M3D_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3));
    Result = M3D_FMADD_PS(vConstantsB, x2, vConstants);

    vConstants = M3D_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2));
    Result = M3D_FMADD_PS(Result, x2, vConstants);

    vConstants = M3D_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1));
    Result = M3D_FMADD_PS(Result, x2, vConstants);

    vConstants = M3D_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0));
    Result = M3D_FMADD_PS(Result, x2, vConstants);

    Result = M3D_FMADD_PS(Result, x2, M3D_MOne);
    Result = _mm_mul_ps(Result, sign); // restore cosine sign for folded lanes
    *pCos = Result;
#endif
}