#pragma once

// Standard headers for the pieces used below (sqrtf, fixed-width integer types,
// SSE/AVX intrinsics). The M3D_* types, constants and helper macros are assumed
// to be provided by the library's main header that includes this file.
#include <cmath>
#include <cstdint>
#ifndef DISABLE_INTRINSICS
#include <immintrin.h>
#endif

inline void M3D_ScalarSinCos(float* pSin, float* pCos, float Value) noexcept {
    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
    float quotient = M3D_1DIV2PI * Value;
    if (Value >= 0.0f)
        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
    else
        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));

    float y = Value - M3D_2PI * quotient;

    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
    float sign;
    if (y > M3D_PIDIV2) {
        y = M3D_PI - y;
        sign = -1.0f;
    } else if (y < -M3D_PIDIV2) {
        y = -M3D_PI - y;
        sign = -1.0f;
    } else {
        sign = +1.0f;
    }

    float y2 = y * y;

    // 11-degree minimax approximation
    *pSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;

    // 10-degree minimax approximation
    float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f;
    *pCos = sign * p;
}
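
// Usage sketch (illustrative only):
//   float s, c;
//   M3D_ScalarSinCos(&s, &c, M3D_PIDIV2);  // s ~= 1.0f, c ~= 0.0f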

/* -------------------------------------------------------------------------------------------------------------------------- */

inline M3D_MATRIX::M3D_MATRIX(
    float f00, float f01, float f02, float f03,
    float f10, float f11, float f12, float f13,
    float f20, float f21, float f22, float f23,
    float f30, float f31, float f32, float f33
) noexcept {
    rows[0] = M3D_V4Set(f00, f01, f02, f03);
    rows[1] = M3D_V4Set(f10, f11, f12, f13);
    rows[2] = M3D_V4Set(f20, f21, f22, f23);
    rows[3] = M3D_V4Set(f30, f31, f32, f33);
}

inline M3D_MATRIX M3D_MATRIX::operator- () const noexcept {
    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Negate(rows[0]);
    ret.rows[1] = M3D_V4Negate(rows[1]);
    ret.rows[2] = M3D_V4Negate(rows[2]);
    ret.rows[3] = M3D_V4Negate(rows[3]);
    return ret;
}

inline M3D_MATRIX& M3D_MATRIX::operator+= (M3D_MATRIX M) noexcept {
    rows[0] = M3D_V4Add(rows[0], M.rows[0]);
    rows[1] = M3D_V4Add(rows[1], M.rows[1]);
    rows[2] = M3D_V4Add(rows[2], M.rows[2]);
    rows[3] = M3D_V4Add(rows[3], M.rows[3]);
    return *this;
}

inline M3D_MATRIX M3D_MATRIX::operator+ (M3D_MATRIX M) const noexcept {
    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Add(rows[0], M.rows[0]);
    ret.rows[1] = M3D_V4Add(rows[1], M.rows[1]);
    ret.rows[2] = M3D_V4Add(rows[2], M.rows[2]);
    ret.rows[3] = M3D_V4Add(rows[3], M.rows[3]);
    return ret;
}

inline M3D_MATRIX& M3D_MATRIX::operator-= (M3D_MATRIX M) noexcept {
    rows[0] = M3D_V4Subtract(rows[0], M.rows[0]);
    rows[1] = M3D_V4Subtract(rows[1], M.rows[1]);
    rows[2] = M3D_V4Subtract(rows[2], M.rows[2]);
    rows[3] = M3D_V4Subtract(rows[3], M.rows[3]);
    return *this;
}

inline M3D_MATRIX M3D_MATRIX::operator- (M3D_MATRIX M) const noexcept {
    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Subtract(rows[0], M.rows[0]);
    ret.rows[1] = M3D_V4Subtract(rows[1], M.rows[1]);
    ret.rows[2] = M3D_V4Subtract(rows[2], M.rows[2]);
    ret.rows[3] = M3D_V4Subtract(rows[3], M.rows[3]);
    return ret;
}

inline M3D_MATRIX& M3D_MATRIX::operator*= (M3D_MATRIX M) noexcept {
    *this = M3D_MMultiply(*this, M);
    return *this;
}

inline M3D_MATRIX M3D_MATRIX::operator* (M3D_MATRIX M) const noexcept {
    return M3D_MMultiply(*this, M);
}

inline M3D_MATRIX& M3D_MATRIX::operator*= (float S) noexcept {
    rows[0] = M3D_V4Scale(rows[0], S);
    rows[1] = M3D_V4Scale(rows[1], S);
    rows[2] = M3D_V4Scale(rows[2], S);
    rows[3] = M3D_V4Scale(rows[3], S);
    return *this;
}

inline M3D_MATRIX M3D_MATRIX::operator* (float S) const noexcept {
    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Scale(rows[0], S);
    ret.rows[1] = M3D_V4Scale(rows[1], S);
    ret.rows[2] = M3D_V4Scale(rows[2], S);
    ret.rows[3] = M3D_V4Scale(rows[3], S);
    return ret;
}

inline M3D_MATRIX operator* (float S, M3D_MATRIX M) noexcept {
    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Scale(M.rows[0], S);
    ret.rows[1] = M3D_V4Scale(M.rows[1], S);
    ret.rows[2] = M3D_V4Scale(M.rows[2], S);
    ret.rows[3] = M3D_V4Scale(M.rows[3], S);
    return ret;
}

inline M3D_MATRIX& M3D_MATRIX::operator/= (float S) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR vS = M3D_V4Replicate(S);
    rows[0] = M3D_V4Divide(rows[0], vS);
    rows[1] = M3D_V4Divide(rows[1], vS);
    rows[2] = M3D_V4Divide(rows[2], vS);
    rows[3] = M3D_V4Divide(rows[3], vS);
    return *this;
#else
    __m128 vS = _mm_set_ps1(S);
    rows[0] = _mm_div_ps(rows[0], vS);
    rows[1] = _mm_div_ps(rows[1], vS);
    rows[2] = _mm_div_ps(rows[2], vS);
    rows[3] = _mm_div_ps(rows[3], vS);
    return *this;
#endif
}

inline M3D_MATRIX M3D_MATRIX::operator/ (float S) const noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR vS = M3D_V4Replicate(S);
    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Divide(rows[0], vS);
    ret.rows[1] = M3D_V4Divide(rows[1], vS);
    ret.rows[2] = M3D_V4Divide(rows[2], vS);
    ret.rows[3] = M3D_V4Divide(rows[3], vS);
    return ret;
#else
    __m128 vS = _mm_set_ps1(S);
    M3D_MATRIX ret;
    ret.rows[0] = _mm_div_ps(rows[0], vS);
    ret.rows[1] = _mm_div_ps(rows[1], vS);
    ret.rows[2] = _mm_div_ps(rows[2], vS);
    ret.rows[3] = _mm_div_ps(rows[3], vS);
    return ret;
#endif
}

/* -------------------------------------------------------------------------------------------------------------------------- */

inline M3D_VECTOR M3D_V4LoadF3(const M3D_F3* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR V;
    V.v4f[0] = src->x;
    V.v4f[1] = src->y;
    V.v4f[2] = src->z;
    V.v4f[3] = 0.f;
    return V;
/*
#elif defined(SSE4_INTRINSICS)
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
    __m128 z = _mm_load_ss(&src->z);
    return _mm_insert_ps(xy, z, 0x20);
*/
#else
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(src)));
    __m128 z = _mm_load_ss(&src->z);
    return _mm_movelh_ps(xy, z);
#endif
}

inline M3D_VECTOR M3D_V4LoadF3A(const M3D_F3A* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR V;
    V.v4f[0] = src->x;
    V.v4f[1] = src->y;
    V.v4f[2] = src->z;
    V.v4f[3] = 0.f;
    return V;
#else
    __m128 V = _mm_load_ps(&src->x); // Reads an extra float which is zero'd
    return _mm_and_ps(V, M3D_MMask3);
#endif
}

inline void M3D_V4StoreF3(M3D_F3* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
/*
#elif defined(SSE4_INTRINSICS)
    *reinterpret_cast<int*>(&dst->x) = _mm_extract_ps(V, 0);
    *reinterpret_cast<int*>(&dst->y) = _mm_extract_ps(V, 1);
    *reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
*/
#else
    _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
    __m128 z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    _mm_store_ss(&dst->z, z);
#endif
}

inline void M3D_V4StoreF3A(M3D_F3A* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
/*
#elif defined(SSE4_INTRINSICS)
    _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
    *reinterpret_cast<int*>(&dst->z) = _mm_extract_ps(V, 2);
*/
#else
    _mm_store_sd(reinterpret_cast<double*>(dst), _mm_castps_pd(V));
    __m128 z = _mm_movehl_ps(V, V);
    _mm_store_ss(&dst->z, z);
#endif
}

inline M3D_VECTOR M3D_V4LoadF4(const M3D_F4* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR V;
    V.v4f[0] = src->x;
    V.v4f[1] = src->y;
    V.v4f[2] = src->z;
    V.v4f[3] = src->w;
    return V;
#else
    return _mm_loadu_ps(&src->x);
#endif
}

inline M3D_VECTOR M3D_V4LoadV4A(const M3D_F4A* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR V;
    V.v4f[0] = src->x;
    V.v4f[1] = src->y;
    V.v4f[2] = src->z;
    V.v4f[3] = src->w;
    return V;
#else
    return _mm_load_ps(&src->x);
#endif
}

inline void M3D_V4StoreF4(M3D_F4* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
    dst->w = V.v4f[3];
#else
    _mm_storeu_ps(&dst->x, V);
#endif
}

inline void M3D_V4StoreF4A(M3D_F4A* dst, M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
    dst->z = V.v4f[2];
    dst->w = V.v4f[3];
#else
    _mm_store_ps(&dst->x, V);
#endif
}

inline M3D_MATRIX M3D_V4LoadF4x4(const M3D_F4X4* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.rows[0].v4f[0] = src->mat[0][0];
    ret.rows[0].v4f[1] = src->mat[0][1];
    ret.rows[0].v4f[2] = src->mat[0][2];
    ret.rows[0].v4f[3] = src->mat[0][3];

    ret.rows[1].v4f[0] = src->mat[1][0];
    ret.rows[1].v4f[1] = src->mat[1][1];
    ret.rows[1].v4f[2] = src->mat[1][2];
    ret.rows[1].v4f[3] = src->mat[1][3];

    ret.rows[2].v4f[0] = src->mat[2][0];
    ret.rows[2].v4f[1] = src->mat[2][1];
    ret.rows[2].v4f[2] = src->mat[2][2];
    ret.rows[2].v4f[3] = src->mat[2][3];

    ret.rows[3].v4f[0] = src->mat[3][0];
    ret.rows[3].v4f[1] = src->mat[3][1];
    ret.rows[3].v4f[2] = src->mat[3][2];
    ret.rows[3].v4f[3] = src->mat[3][3];
    return ret;
#else
    M3D_MATRIX ret;
    ret.rows[0] = _mm_loadu_ps(&src->_00);
    ret.rows[1] = _mm_loadu_ps(&src->_10);
    ret.rows[2] = _mm_loadu_ps(&src->_20);
    ret.rows[3] = _mm_loadu_ps(&src->_30);
    return ret;
#endif
}

inline M3D_MATRIX M3D_V4LoadF4x4A(const M3D_F4X4A* src) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.rows[0].v4f[0] = src->mat[0][0];
    ret.rows[0].v4f[1] = src->mat[0][1];
    ret.rows[0].v4f[2] = src->mat[0][2];
    ret.rows[0].v4f[3] = src->mat[0][3];

    ret.rows[1].v4f[0] = src->mat[1][0];
    ret.rows[1].v4f[1] = src->mat[1][1];
    ret.rows[1].v4f[2] = src->mat[1][2];
    ret.rows[1].v4f[3] = src->mat[1][3];

    ret.rows[2].v4f[0] = src->mat[2][0];
    ret.rows[2].v4f[1] = src->mat[2][1];
    ret.rows[2].v4f[2] = src->mat[2][2];
    ret.rows[2].v4f[3] = src->mat[2][3];

    ret.rows[3].v4f[0] = src->mat[3][0];
    ret.rows[3].v4f[1] = src->mat[3][1];
    ret.rows[3].v4f[2] = src->mat[3][2];
    ret.rows[3].v4f[3] = src->mat[3][3];
    return ret;
#else
    M3D_MATRIX ret;
    ret.rows[0] = _mm_load_ps(&src->_00);
    ret.rows[1] = _mm_load_ps(&src->_10);
    ret.rows[2] = _mm_load_ps(&src->_20);
    ret.rows[3] = _mm_load_ps(&src->_30);
    return ret;
#endif
}

inline void M3D_V4StoreF4x4(M3D_F4X4* dst, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->mat[0][0] = M.rows[0].v4f[0];
    dst->mat[0][1] = M.rows[0].v4f[1];
    dst->mat[0][2] = M.rows[0].v4f[2];
    dst->mat[0][3] = M.rows[0].v4f[3];

    dst->mat[1][0] = M.rows[1].v4f[0];
    dst->mat[1][1] = M.rows[1].v4f[1];
    dst->mat[1][2] = M.rows[1].v4f[2];
    dst->mat[1][3] = M.rows[1].v4f[3];

    dst->mat[2][0] = M.rows[2].v4f[0];
    dst->mat[2][1] = M.rows[2].v4f[1];
    dst->mat[2][2] = M.rows[2].v4f[2];
    dst->mat[2][3] = M.rows[2].v4f[3];

    dst->mat[3][0] = M.rows[3].v4f[0];
    dst->mat[3][1] = M.rows[3].v4f[1];
    dst->mat[3][2] = M.rows[3].v4f[2];
    dst->mat[3][3] = M.rows[3].v4f[3];
#else
    _mm_storeu_ps(&dst->_00, M.rows[0]);
    _mm_storeu_ps(&dst->_10, M.rows[1]);
    _mm_storeu_ps(&dst->_20, M.rows[2]);
    _mm_storeu_ps(&dst->_30, M.rows[3]);
#endif
}

inline void M3D_V4StoreF4x4A(M3D_F4X4A* dst, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    dst->mat[0][0] = M.rows[0].v4f[0];
    dst->mat[0][1] = M.rows[0].v4f[1];
    dst->mat[0][2] = M.rows[0].v4f[2];
    dst->mat[0][3] = M.rows[0].v4f[3];

    dst->mat[1][0] = M.rows[1].v4f[0];
    dst->mat[1][1] = M.rows[1].v4f[1];
    dst->mat[1][2] = M.rows[1].v4f[2];
    dst->mat[1][3] = M.rows[1].v4f[3];

    dst->mat[2][0] = M.rows[2].v4f[0];
    dst->mat[2][1] = M.rows[2].v4f[1];
    dst->mat[2][2] = M.rows[2].v4f[2];
    dst->mat[2][3] = M.rows[2].v4f[3];

    dst->mat[3][0] = M.rows[3].v4f[0];
    dst->mat[3][1] = M.rows[3].v4f[1];
    dst->mat[3][2] = M.rows[3].v4f[2];
    dst->mat[3][3] = M.rows[3].v4f[3];
#else
    _mm_store_ps(&dst->_00, M.rows[0]);
    _mm_store_ps(&dst->_10, M.rows[1]);
    _mm_store_ps(&dst->_20, M.rows[2]);
    _mm_store_ps(&dst->_30, M.rows[3]);
#endif
}
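
// Round-trip sketch (illustrative): load user data into a SIMD register, operate,
// then store back. The *A variants use aligned SSE loads/stores, so M3D_F3A,
// M3D_F4A and M3D_F4X4A storage must be 16-byte aligned.
//   M3D_F3 p = { 1.f, 2.f, 3.f };
//   M3D_VECTOR v = M3D_V4LoadF3(&p);  // w lane is 0
//   M3D_V4StoreF3(&p, v);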

/* -------------------------------------------------------------------------------------------------------------------------- */

inline M3D_VECTOR M3D_V4Set(float x, float y, float z, float w) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 ret = {{{x, y, z, w}}};
    return ret.v;
#else
    return _mm_set_ps(w, z, y, x);
#endif
}

inline M3D_VECTOR M3D_V4Negate(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 ret = {{{
        -V.v4f[0],
        -V.v4f[1],
        -V.v4f[2],
        -V.v4f[3]
    }}};
    return ret.v;
#else
    M3D_VECTOR Z = _mm_setzero_ps();
    return _mm_sub_ps(Z, V);
#endif
}

inline M3D_VECTOR M3D_V4Replicate(float val) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 ret;
    ret.f[0] =
    ret.f[1] =
    ret.f[2] =
    ret.f[3] = val;
    return ret.v;
#else
    return _mm_set_ps1(val);
#endif
}

inline float M3D_V4GetX(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[0];
#else
    return _mm_cvtss_f32(V);
#endif
}

inline float M3D_V4GetY(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[1];
#else
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    return _mm_cvtss_f32(vTemp);
#endif
}

inline float M3D_V4GetZ(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[2];
#else
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    return _mm_cvtss_f32(vTemp);
#endif
}

inline float M3D_V4GetW(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    return V.v4f[3];
#else
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
    return _mm_cvtss_f32(vTemp);
#endif
}

inline M3D_VECTOR M3D_V4SplatX(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 vResult;
    vResult.f[0] =
    vResult.f[1] =
    vResult.f[2] =
    vResult.f[3] = V.v4f[0];
    return vResult.v;
#elif defined(AVX2_INTRINSICS) && defined(FAVOR_INTEL)
    return _mm_broadcastss_ps(V);
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}

inline M3D_VECTOR M3D_V4SplatY(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 vResult;
    vResult.f[0] =
    vResult.f[1] =
    vResult.f[2] =
    vResult.f[3] = V.v4f[1];
    return vResult.v;
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
#endif
}

inline M3D_VECTOR M3D_V4SplatZ(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 vResult;
    vResult.f[0] =
    vResult.f[1] =
    vResult.f[2] =
    vResult.f[3] = V.v4f[2];
    return vResult.v;
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
#endif
}

inline M3D_VECTOR M3D_V4SplatW(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 vResult;
    vResult.f[0] =
    vResult.f[1] =
    vResult.f[2] =
    vResult.f[3] = V.v4f[3];
    return vResult.v;
#else
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
#endif
}

inline M3D_VECTOR M3D_V4Add(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 ret = {{{
        V1.v4f[0] + V2.v4f[0],
        V1.v4f[1] + V2.v4f[1],
        V1.v4f[2] + V2.v4f[2],
        V1.v4f[3] + V2.v4f[3]
    }}};
    return ret.v;
#else
    return _mm_add_ps(V1, V2);
#endif
}

inline M3D_VECTOR M3D_V4Subtract(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 ret = {{{
        V1.v4f[0] - V2.v4f[0],
        V1.v4f[1] - V2.v4f[1],
        V1.v4f[2] - V2.v4f[2],
        V1.v4f[3] - V2.v4f[3]
    }}};
    return ret.v;
#else
    return _mm_sub_ps(V1, V2);
#endif
}

inline M3D_VECTOR M3D_V4MultiplyAdd(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECTOR V3) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 ret = {{{
        V1.v4f[0] * V2.v4f[0] + V3.v4f[0],
        V1.v4f[1] * V2.v4f[1] + V3.v4f[1],
        V1.v4f[2] * V2.v4f[2] + V3.v4f[2],
        V1.v4f[3] * V2.v4f[3] + V3.v4f[3]
    }}};
    return ret.v;
#else
    return M3D_FMADD_PS(V1, V2, V3);
#endif
}

inline M3D_VECTOR M3D_V4Divide(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 ret = {{{
        V1.v4f[0] / V2.v4f[0],
        V1.v4f[1] / V2.v4f[1],
        V1.v4f[2] / V2.v4f[2],
        V1.v4f[3] / V2.v4f[3]
    }}};
    return ret.v;
#else
    return _mm_div_ps(V1, V2);
#endif
}

inline M3D_VECTOR M3D_V4Scale(M3D_VECTOR V, float scale) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 ret = {{{
        V.v4f[0] * scale,
        V.v4f[1] * scale,
        V.v4f[2] * scale,
        V.v4f[3] * scale
    }}};
    return ret.v;
#else
    M3D_VECTOR ret = _mm_set_ps1(scale);
    return _mm_mul_ps(ret, V);
#endif
}

inline M3D_VECTOR M3D_V4Select(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECTOR Control) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 ret = {{{
        (V1.v4u[0] & ~Control.v4u[0]) | (V2.v4u[0] & Control.v4u[0]),
        (V1.v4u[1] & ~Control.v4u[1]) | (V2.v4u[1] & Control.v4u[1]),
        (V1.v4u[2] & ~Control.v4u[2]) | (V2.v4u[2] & Control.v4u[2]),
        (V1.v4u[3] & ~Control.v4u[3]) | (V2.v4u[3] & Control.v4u[3])
    }}};
    return ret.v;
#else
    M3D_VECTOR vTemp1 = _mm_andnot_ps(Control, V1);
    M3D_VECTOR vTemp2 = _mm_and_ps(V2, Control);
    return _mm_or_ps(vTemp1, vTemp2);
#endif
}
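
// Selection note (illustrative): bits set in Control take the corresponding bit
// from V2, clear bits take it from V1. With the library's M3D_MSelect1110 mask
// (all-ones in x/y/z, zero in w):
//   M3D_VECTOR r = M3D_V4Select(V1, V2, M3D_MSelect1110.v);  // { V2.x, V2.y, V2.z, V1.w }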

inline M3D_VECTOR M3D_V4MergeXY(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 Result = {{{
        V1.v4u[0],
        V2.v4u[0],
        V1.v4u[1],
        V2.v4u[1]
    }}};
    return Result.v;
#else
    return _mm_unpacklo_ps(V1, V2);
#endif
}

inline M3D_VECTOR M3D_V4MergeZW(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4U32 Result = {{{
        V1.v4u[2],
        V2.v4u[2],
        V1.v4u[3],
        V2.v4u[3]
    }}};
    return Result.v;
#else
    return _mm_unpackhi_ps(V1, V2);
#endif
}

inline M3D_VECTOR M3D_V4Sqrt(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_V4F32 Result = {{{
        sqrtf(V.v4f[0]),
        sqrtf(V.v4f[1]),
        sqrtf(V.v4f[2]),
        sqrtf(V.v4f[3])
    }}};
    return Result.v;
#else
    return _mm_sqrt_ps(V);
#endif
}

inline M3D_VECTOR M3D_V3Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#ifdef DISABLE_INTRINSICS
    float fValue = V1.v4f[0] * V2.v4f[0] + V1.v4f[1] * V2.v4f[1] + V1.v4f[2] * V2.v4f[2];
    M3D_V4F32 vResult;
    vResult.f[0] =
    vResult.f[1] =
    vResult.f[2] =
    vResult.f[3] = fValue;
    return vResult.v;
#elif defined(SSE4_INTRINSICS)
    return _mm_dp_ps(V1, V2, 0x7f);
#elif defined(SSE3_INTRINSICS)
    M3D_VECTOR vTemp = _mm_mul_ps(V1, V2);
    vTemp = _mm_and_ps(vTemp, M3D_MMask3);
    vTemp = _mm_hadd_ps(vTemp, vTemp);
    return _mm_hadd_ps(vTemp, vTemp);
#else
    // Perform the dot product
    M3D_VECTOR vDot = _mm_mul_ps(V1, V2);
    // x=Dot.v4f[1], y=Dot.v4f[2]
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1));
    // Result.v4f[0] = x+y
    vDot = _mm_add_ss(vDot, vTemp);
    // x=Dot.v4f[2]
    vTemp = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
    // Result.v4f[0] = (x+y)+z
    vDot = _mm_add_ss(vDot, vTemp);
    // Splat x
    return M3D_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}
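
// Usage sketch (illustrative): the 3D dot product is replicated into all four
// lanes, so any component extractor retrieves it:
//   float cosAngle = M3D_V4GetX(M3D_V3Dot(a, b));  // a, b assumed unit length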

inline M3D_VECTOR M3D_V3Cross(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
    // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ]

#ifdef DISABLE_INTRINSICS
    M3D_V4F32 vResult = {{{
        (V1.v4f[1] * V2.v4f[2]) - (V1.v4f[2] * V2.v4f[1]),
        (V1.v4f[2] * V2.v4f[0]) - (V1.v4f[0] * V2.v4f[2]),
        (V1.v4f[0] * V2.v4f[1]) - (V1.v4f[1] * V2.v4f[0]),
        0.0f
    }}};
    return vResult.v;
#else
    // y1,z1,x1,w1
    M3D_VECTOR vTemp1 = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(3, 0, 2, 1));
    // z2,x2,y2,w2
    M3D_VECTOR vTemp2 = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(3, 1, 0, 2));
    // Perform the left operation
    M3D_VECTOR vResult = _mm_mul_ps(vTemp1, vTemp2);
    // z1,x1,y1,w1
    vTemp1 = M3D_PERMUTE_PS(vTemp1, _MM_SHUFFLE(3, 0, 2, 1));
    // y2,z2,x2,w2
    vTemp2 = M3D_PERMUTE_PS(vTemp2, _MM_SHUFFLE(3, 1, 0, 2));
    // Perform the right operation
    vResult = M3D_FMADD_PS(vTemp1, vTemp2, vResult);
    // Set w to zero
    return _mm_and_ps(vResult, M3D_MMask3);
#endif
}
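
// Sanity-check sketch (illustrative): the basis vectors obey x cross y = z:
//   M3D_VECTOR x = M3D_V4Set(1.f, 0.f, 0.f, 0.f);
//   M3D_VECTOR y = M3D_V4Set(0.f, 1.f, 0.f, 0.f);
//   M3D_VECTOR z = M3D_V3Cross(x, y);  // { 0, 0, 1, 0 }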

inline M3D_VECTOR M3D_V3LengthSq(M3D_VECTOR V) noexcept {
    return M3D_V3Dot(V, V);
}

inline M3D_VECTOR M3D_V3Length(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR Result;

    Result = M3D_V3LengthSq(V);
    Result = M3D_V4Sqrt(Result);

    return Result;
#elif defined(SSE4_INTRINSICS)
    M3D_VECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
    return _mm_sqrt_ps(vTemp);
#elif defined(SSE3_INTRINSICS)
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    vLengthSq = _mm_and_ps(vLengthSq, M3D_MMask3);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
#else
    // Perform the dot product on x,y and z
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    // vTemp has z and y
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2));
    // x+z, y
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    // y,y,y,y
    vTemp = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
    // x+z+y,??,??,??
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    // Splat the length squared
    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    // Get the length
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
#endif
}

inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR vResult = M3D_V3Length(V);
    float fLength = vResult.v4f[0];

    // Prevent divide by zero
    if (fLength > 0) {
        fLength = 1.0f / fLength;
    }

    vResult.v4f[0] = V.v4f[0] * fLength;
    vResult.v4f[1] = V.v4f[1] * fLength;
    vResult.v4f[2] = V.v4f[2] * fLength;
    vResult.v4f[3] = V.v4f[3] * fLength;
    return vResult;

#elif defined(SSE4_INTRINSICS)
    M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
    // Prepare for the division
    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    M3D_VECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#elif defined(SSE3_INTRINSICS)
    // Perform the dot product on x,y and z only
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    vLengthSq = _mm_and_ps(vLengthSq, M3D_MMask3);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    // Prepare for the division
    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    M3D_VECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#else
    // Perform the dot product on x,y and z only
    M3D_VECTOR vLengthSq = _mm_mul_ps(V, V);
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    vTemp = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    // Prepare for the division
    M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    M3D_VECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
    M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#endif
}
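
// Usage sketch (illustrative): a 3-4-5 triangle normalizes to { 0.6, 0, 0.8 }.
// Every path above maps a zero-length input to the zero vector, and an
// infinite-length input to QNaN lanes.
//   M3D_VECTOR n = M3D_V3Normalize(M3D_V4Set(3.f, 0.f, 4.f, 0.f));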

/* -------------------------------------------------------------------------------------------------------------------------- */

inline M3D_MATRIX M3D_MIdentity() noexcept {
    M3D_MATRIX ret;
    ret.rows[0] = M3D_MIdentityR0.v;
    ret.rows[1] = M3D_MIdentityR1.v;
    ret.rows[2] = M3D_MIdentityR2.v;
    ret.rows[3] = M3D_MIdentityR3.v;
    return ret;
}

inline M3D_MATRIX M3D_MMultiply(M3D_MATRIX M1, M3D_MATRIX& M2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    // Cache the invariants in registers
    float x = M1.mat[0][0];
    float y = M1.mat[0][1];
    float z = M1.mat[0][2];
    float w = M1.mat[0][3];
    // Perform the operation on the first row
    ret.mat[0][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w);
    ret.mat[0][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w);
    ret.mat[0][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w);
    ret.mat[0][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w);
    // Repeat for all the other rows
    x = M1.mat[1][0];
    y = M1.mat[1][1];
    z = M1.mat[1][2];
    w = M1.mat[1][3];
    ret.mat[1][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w);
    ret.mat[1][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w);
    ret.mat[1][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w);
    ret.mat[1][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w);
    x = M1.mat[2][0];
    y = M1.mat[2][1];
    z = M1.mat[2][2];
    w = M1.mat[2][3];
    ret.mat[2][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w);
    ret.mat[2][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w);
    ret.mat[2][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w);
    ret.mat[2][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w);
    x = M1.mat[3][0];
    y = M1.mat[3][1];
    z = M1.mat[3][2];
    w = M1.mat[3][3];
    ret.mat[3][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w);
    ret.mat[3][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w);
    ret.mat[3][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w);
    ret.mat[3][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w);
    return ret;
#elif defined(AVX2_INTRINSICS)
    __m256 t0 = _mm256_castps128_ps256(M1.rows[0]);
    t0 = _mm256_insertf128_ps(t0, M1.rows[1], 1);
    __m256 t1 = _mm256_castps128_ps256(M1.rows[2]);
    t1 = _mm256_insertf128_ps(t1, M1.rows[3], 1);

    __m256 u0 = _mm256_castps128_ps256(M2.rows[0]);
    u0 = _mm256_insertf128_ps(u0, M2.rows[1], 1);
    __m256 u1 = _mm256_castps128_ps256(M2.rows[2]);
    u1 = _mm256_insertf128_ps(u1, M2.rows[3], 1);

    __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
    __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00);
    __m256 c0 = _mm256_mul_ps(a0, b0);
    __m256 c1 = _mm256_mul_ps(a1, b0);

    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    b0 = _mm256_permute2f128_ps(u0, u0, 0x11);
    __m256 c2 = _mm256_fmadd_ps(a0, b0, c0);
    __m256 c3 = _mm256_fmadd_ps(a1, b0, c1);

    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00);
    __m256 c4 = _mm256_mul_ps(a0, b1);
    __m256 c5 = _mm256_mul_ps(a1, b1);

    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);

    t0 = _mm256_add_ps(c2, c6);
    t1 = _mm256_add_ps(c3, c7);

    M3D_MATRIX ret;
    ret.rows[0] = _mm256_castps256_ps128(t0);
    ret.rows[1] = _mm256_extractf128_ps(t0, 1);
    ret.rows[2] = _mm256_castps256_ps128(t1);
    ret.rows[3] = _mm256_extractf128_ps(t1, 1);
    return ret;
#else
    M3D_MATRIX ret;
    // Splat the component X,Y,Z then W
#ifdef AVX_INTRINSICS
    M3D_VECTOR vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[0]) + 0);
    M3D_VECTOR vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[0]) + 1);
    M3D_VECTOR vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[0]) + 2);
    M3D_VECTOR vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[0]) + 3);
#else
    // Use vW to hold the original row
    M3D_VECTOR vW = M1.rows[0];
    M3D_VECTOR vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    M3D_VECTOR vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    M3D_VECTOR vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    // Perform the operation on the first row
    vX = _mm_mul_ps(vX, M2.rows[0]);
    vY = _mm_mul_ps(vY, M2.rows[1]);
    vZ = _mm_mul_ps(vZ, M2.rows[2]);
    vW = _mm_mul_ps(vW, M2.rows[3]);
    // Perform a binary add to reduce cumulative errors
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    ret.rows[0] = vX;
    // Repeat for the other 3 rows
#ifdef AVX_INTRINSICS
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[1]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[1]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[1]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[1]) + 3);
#else
    vW = M1.rows[1];
    vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.rows[0]);
    vY = _mm_mul_ps(vY, M2.rows[1]);
    vZ = _mm_mul_ps(vZ, M2.rows[2]);
    vW = _mm_mul_ps(vW, M2.rows[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    ret.rows[1] = vX;
#ifdef AVX_INTRINSICS
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[2]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[2]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[2]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[2]) + 3);
#else
    vW = M1.rows[2];
    vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.rows[0]);
    vY = _mm_mul_ps(vY, M2.rows[1]);
    vZ = _mm_mul_ps(vZ, M2.rows[2]);
    vW = _mm_mul_ps(vW, M2.rows[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    ret.rows[2] = vX;
#ifdef AVX_INTRINSICS
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[3]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[3]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[3]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.rows[3]) + 3);
#else
    vW = M1.rows[3];
    vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.rows[0]);
    vY = _mm_mul_ps(vY, M2.rows[1]);
    vZ = _mm_mul_ps(vZ, M2.rows[2]);
    vW = _mm_mul_ps(vW, M2.rows[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    ret.rows[3] = vX;
    return ret;
#endif
}
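
// Composition note (illustrative): with the row-vector convention used by this
// library (v' = v * M, see M3D_V3Transform below), M3D_MMultiply(A, B) yields
// the transform that applies A first, then B:
//   M3D_MATRIX world = M3D_MMultiply(scaleM, rotateM);  // hypothetical matrices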

inline M3D_MATRIX M3D_MTranspose(M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    // Original matrix:
    //
    //     m00m01m02m03
    //     m10m11m12m13
    //     m20m21m22m23
    //     m30m31m32m33

    M3D_MATRIX P;
    P.rows[0] = M3D_V4MergeXY(M.rows[0], M.rows[2]); // m00m20m01m21
    P.rows[1] = M3D_V4MergeXY(M.rows[1], M.rows[3]); // m10m30m11m31
    P.rows[2] = M3D_V4MergeZW(M.rows[0], M.rows[2]); // m02m22m03m23
    P.rows[3] = M3D_V4MergeZW(M.rows[1], M.rows[3]); // m12m32m13m33

    M3D_MATRIX MT;
    MT.rows[0] = M3D_V4MergeXY(P.rows[0], P.rows[1]); // m00m10m20m30
    MT.rows[1] = M3D_V4MergeZW(P.rows[0], P.rows[1]); // m01m11m21m31
    MT.rows[2] = M3D_V4MergeXY(P.rows[2], P.rows[3]); // m02m12m22m32
    MT.rows[3] = M3D_V4MergeZW(P.rows[2], P.rows[3]); // m03m13m23m33
    return MT;
#elif defined(AVX2_INTRINSICS)
    __m256 t0 = _mm256_castps128_ps256(M.rows[0]);
    t0 = _mm256_insertf128_ps(t0, M.rows[1], 1);
    __m256 t1 = _mm256_castps128_ps256(M.rows[2]);
    t1 = _mm256_insertf128_ps(t1, M.rows[3], 1);

    __m256 vTemp = _mm256_unpacklo_ps(t0, t1);
    __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1);
    __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
    __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
    vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4);
    vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4);
    t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
    t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);

    M3D_MATRIX ret;
    ret.rows[0] = _mm256_castps256_ps128(t0);
    ret.rows[1] = _mm256_extractf128_ps(t0, 1);
    ret.rows[2] = _mm256_castps256_ps128(t1);
    ret.rows[3] = _mm256_extractf128_ps(t1, 1);
    return ret;
#else
    // x.x,x.y,y.x,y.y
    M3D_VECTOR vTemp1 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(1, 0, 1, 0));
    // x.z,x.w,y.z,y.w
    M3D_VECTOR vTemp3 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(3, 2, 3, 2));
    // z.x,z.y,w.x,w.y
    M3D_VECTOR vTemp2 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(1, 0, 1, 0));
    // z.z,z.w,w.z,w.w
    M3D_VECTOR vTemp4 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(3, 2, 3, 2));

    M3D_MATRIX ret;
    // x.x,y.x,z.x,w.x
    ret.rows[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
    // x.y,y.y,z.y,w.y
    ret.rows[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
    // x.z,y.z,z.z,w.z
    ret.rows[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
    // x.w,y.w,z.w,w.w
    ret.rows[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
    return ret;
#endif
}

/* -------------------------------------------------------------------------------------------------------------------------- */

inline M3D_VECTOR M3D_V3Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR Z = M3D_V4SplatZ(V);
    M3D_VECTOR Y = M3D_V4SplatY(V);
    M3D_VECTOR X = M3D_V4SplatX(V);

    M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, M.rows[2], M.rows[3]);
    Result = M3D_V4MultiplyAdd(Y, M.rows[1], Result);
    Result = M3D_V4MultiplyAdd(X, M.rows[0], Result);

    return Result;
#else
    M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z
    vResult = M3D_FMADD_PS(vResult, M.rows[2], M.rows[3]);
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
    vResult = M3D_FMADD_PS(vTemp, M.rows[1], vResult);
    vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
    vResult = M3D_FMADD_PS(vTemp, M.rows[0], vResult);
    return vResult;
#endif
}
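
// Usage note (illustrative): V is treated as a row vector with an implicit w = 1
// (the translation row M.rows[3] is always added), so this transforms a point:
//   M3D_VECTOR worldPos = M3D_V3Transform(localPos, worldMatrix);  // hypothetical names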

inline void M3D_V3Transform(
    M3D_F4* pOutputStream,
    size_t OutputStride,
    const M3D_F3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    M3D_MATRIX M
) noexcept {
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

#ifdef DISABLE_INTRINSICS
    for (size_t i = 0; i < VectorCount; i++) {
        M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
        M3D_VECTOR Z = M3D_V4SplatZ(V);
        M3D_VECTOR Y = M3D_V4SplatY(V);
        M3D_VECTOR X = M3D_V4SplatX(V);

        M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, row2, row3);
        Result = M3D_V4MultiplyAdd(Y, row1, Result);
        Result = M3D_V4MultiplyAdd(X, row0, Result);

        M3D_V4StoreF4(reinterpret_cast<M3D_F4*>(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
#else
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0) {
        if (InputStride == sizeof(M3D_F3)) {
            if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF)) {
                // Packed input, aligned output
                for (size_t j = 0; j < four; ++j) {
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(M3D_F3) * 4;

                    // Unpack the 4 vectors (.w components are junk); the macro
                    // brings V2, V3 and V4 into scope
                    M3D_UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                    M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            } else {
                // Packed input, unaligned output
                for (size_t j = 0; j < four; ++j) {
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(M3D_F3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    M3D_UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                    M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF)) {
        // Aligned output
        for (; i < VectorCount; ++i) {
            M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
            pInputVector += InputStride;

            M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
            M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

            M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
            M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
            M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);
            vTemp = _mm_add_ps(vTemp, vTemp3);

            M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
            pOutputVector += OutputStride;
        }
    } else {
        // Unaligned output
        for (; i < VectorCount; ++i) {
            M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
            pInputVector += InputStride;

            M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
            M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

            M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
            M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
            M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);
            vTemp = _mm_add_ps(vTemp, vTemp3);

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
            pOutputVector += OutputStride;
        }
    }

    M3D_SFENCE();
#endif
}

inline M3D_VECTOR M3D_V3TransformPersDiv(M3D_VECTOR V, M3D_MATRIX M) noexcept {
    M3D_VECTOR Z = M3D_V4SplatZ(V);
    M3D_VECTOR Y = M3D_V4SplatY(V);
    M3D_VECTOR X = M3D_V4SplatX(V);

    M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, M.rows[2], M.rows[3]);
    Result = M3D_V4MultiplyAdd(Y, M.rows[1], Result);
    Result = M3D_V4MultiplyAdd(X, M.rows[0], Result);

    M3D_VECTOR W = M3D_V4SplatW(Result);
    return M3D_V4Divide(Result, W);
}
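
// Usage sketch (illustrative; projMatrix is a hypothetical projection matrix):
// transform a view-space point, then divide by w to land in normalized device
// coordinates.
//   M3D_VECTOR ndc = M3D_V3TransformPersDiv(viewPos, projMatrix);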

inline void M3D_V3TransformPersDiv(
    M3D_F3* pOutputStream,
    size_t OutputStride,
    const M3D_F3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    M3D_MATRIX M
) noexcept {
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

#ifdef DISABLE_INTRINSICS
    for (size_t i = 0; i < VectorCount; i++) {
        M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
        M3D_VECTOR Z = M3D_V4SplatZ(V);
        M3D_VECTOR Y = M3D_V4SplatY(V);
        M3D_VECTOR X = M3D_V4SplatX(V);

        M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, row2, row3);
        Result = M3D_V4MultiplyAdd(Y, row1, Result);
        Result = M3D_V4MultiplyAdd(X, row0, Result);

        M3D_VECTOR W = M3D_V4SplatW(Result);

        Result = M3D_V4Divide(Result, W);

        M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
#else
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0) {
        if (InputStride == sizeof(M3D_F3)) {
            if (OutputStride == sizeof(M3D_F3)) {
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF)) {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < four; ++j) {
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(M3D_F3) * 4;

                        // Unpack the 4 vectors (.w components are junk); the macro
                        // brings V2, V3 and V4 into scope
                        M3D_UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                        M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                        M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V2 = _mm_div_ps(vTemp, W);

                        // Result 3
                        Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V3 = _mm_div_ps(vTemp, W);

                        // Result 4
                        Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V4 = _mm_div_ps(vTemp, W);

                        // Pack and store the vectors
                        M3D_PACK4INTO3(vTemp);
                        M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), V1);
                        M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(M3D_F3) * 4;
                        i += 4;
                    }
                } else {
                    // Packed input, unaligned & packed output
                    for (size_t j = 0; j < four; ++j) {
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(M3D_F3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        M3D_UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                        M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                        M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V2 = _mm_div_ps(vTemp, W);

                        // Result 3
                        Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V3 = _mm_div_ps(vTemp, W);

                        // Result 4
                        Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V4 = _mm_div_ps(vTemp, W);

                        // Pack and store the vectors
                        M3D_PACK4INTO3(vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), V1);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(M3D_F3) * 4;
                        i += 4;
                    }
                }
            } else {
                // Packed input, unpacked output
                for (size_t j = 0; j < four; ++j) {
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(M3D_F3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    M3D_UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                    M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    for (; i < VectorCount; i++) {
        M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
        pInputVector += InputStride;

        M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
        M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
        M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

        M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
        M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
        M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
        vTemp = _mm_add_ps(vTemp, vTemp2);
        vTemp = _mm_add_ps(vTemp, vTemp3);

        M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

        vTemp = _mm_div_ps(vTemp, W);

        M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
        pOutputVector += OutputStride;
    }

    M3D_SFENCE();
#endif
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */

inline M3D_MATRIX M3D_TransformMatrixCamLookAtLH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
    M3D_VECTOR dir = M3D_V4Subtract(focusPos, viewPos);
    return M3D_TransformMatrixCamLookToLH(viewPos, dir, upDirection);
}
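
/* Usage sketch (illustrative only; the camera values below are hypothetical):

    M3D_VECTOR eye = M3D_V4Set(0.0f, 2.0f, -5.0f, 1.0f);
    M3D_VECTOR target = M3D_V4Set(0.0f, 0.0f, 0.0f, 1.0f);
    M3D_VECTOR up = M3D_V4Set(0.0f, 1.0f, 0.0f, 0.0f);
    M3D_MATRIX view = M3D_TransformMatrixCamLookAtLH(eye, target, up);
*/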

inline M3D_MATRIX M3D_TransformMatrixCamLookAtRH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
    M3D_VECTOR dir_n = M3D_V4Subtract(viewPos, focusPos);
    return M3D_TransformMatrixCamLookToLH(viewPos, dir_n, upDirection);
}

inline M3D_MATRIX M3D_TransformMatrixCamLookToLH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
    // Keep the viewer's axes orthogonal to each other and of unit length
    M3D_VECTOR look_norm = M3D_V3Normalize(viewDirection);
    M3D_VECTOR right_norm = M3D_V3Cross(upDirection, look_norm);
    right_norm = M3D_V3Normalize(right_norm);

    // look_norm and right_norm are already orthonormal, so their cross product needs no normalization
    M3D_VECTOR up_norm = M3D_V3Cross(look_norm, right_norm);

    M3D_VECTOR viewPos_n = M3D_V4Negate(viewPos);

    // Translation terms: the negated eye position projected onto each axis
    M3D_VECTOR right_d = M3D_V3Dot(right_norm, viewPos_n);
    M3D_VECTOR up_d = M3D_V3Dot(up_norm, viewPos_n);
    M3D_VECTOR look_d = M3D_V3Dot(look_norm, viewPos_n);

    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Select(right_d, right_norm, M3D_MSelect1110.v);
    ret.rows[1] = M3D_V4Select(up_d, up_norm, M3D_MSelect1110.v);
    ret.rows[2] = M3D_V4Select(look_d, look_norm, M3D_MSelect1110.v);
    ret.rows[3] = M3D_MIdentityR3.v;

    ret = M3D_MTranspose(ret);

    return ret;
}
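
/* Note: the matrix is assembled transposed (each basis axis in a row, with its
   dot-product translation term packed into .w) and then flipped with M3D_MTranspose,
   so the axes end up in columns and the translation terms in row 3. The result maps
   world space to view space for row vectors: v_view = v_world * M. */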

inline M3D_MATRIX M3D_TransformMatrixCamLookToRH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
    M3D_VECTOR viewDirection_n = M3D_V4Negate(viewDirection);
    return M3D_TransformMatrixCamLookToLH(viewPos, viewDirection_n, upDirection);
}

inline M3D_MATRIX M3D_TransformMatrixFrustrumFovLH(float fov, float ratio, float near, float far) noexcept {
    float SinFov;
    float CosFov;
    M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov);

    float fRange = far / (far - near);
    float Height = CosFov / SinFov; // cot(fov / 2); fov is the full vertical field of view, in radians
    float Width = Height / ratio;   // ratio is the width / height aspect ratio

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = Width;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = Height;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = fRange;
    ret.mat[2][3] = 1.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = -fRange * near;
    ret.mat[3][3] = 0.0f;
    return ret;
#else
    M3D_VECTOR rMem = {
        Width,
        Height,
        fRange,
        -fRange * near
    };

    // Copy from memory to SSE register
    M3D_VECTOR vValues = rMem;

    M3D_MATRIX ret;
    M3D_VECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_move_ss(vTemp, vValues);
    ret.rows[0] = vTemp; // Width, 0, 0, 0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, M3D_MMaskY);
    ret.rows[1] = vTemp; // 0, Height, 0, 0
    vTemp = _mm_setzero_ps();
    vValues = _mm_shuffle_ps(vValues, M3D_MIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
    ret.rows[2] = vTemp; // 0, 0, fRange, 1.0f
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
    ret.rows[3] = vTemp; // 0, 0, -fRange * near, 0.0f
    return ret;
#endif
}
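
/* Usage sketch (illustrative only; the parameter values below are hypothetical):

    // 45-degree vertical field of view, 16:9 aspect ratio, near/far planes at 0.1/100
    M3D_MATRIX proj = M3D_TransformMatrixFrustrumFovLH(M3D_PI / 4.0f, 16.0f / 9.0f, 0.1f, 100.0f);
*/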

inline M3D_MATRIX M3D_TransformMatrixFrustrumFovRH(float fov, float ratio, float near, float far) noexcept {
    float SinFov;
    float CosFov;
    M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov);

    float fRange = far / (near - far);
    float Height = CosFov / SinFov; // cot(fov / 2)
    float Width = Height / ratio;

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = Width;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = Height;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = fRange;
    ret.mat[2][3] = -1.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = fRange * near;
    ret.mat[3][3] = 0.0f;
    return ret;
#else
    M3D_VECTOR rMem = {
        Width,
        Height,
        fRange,
        fRange * near
    };

    // Copy from memory to SSE register
    M3D_VECTOR vValues = rMem;

    M3D_MATRIX ret;
    M3D_VECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_move_ss(vTemp, vValues);
    ret.rows[0] = vTemp; // Width, 0, 0, 0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, M3D_MMaskY);
    ret.rows[1] = vTemp; // 0, Height, 0, 0
    vTemp = _mm_setzero_ps();
    vValues = _mm_shuffle_ps(vValues, M3D_MIdentityR3_n, _MM_SHUFFLE(3, 2, 3, 2));
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
    ret.rows[2] = vTemp; // 0, 0, fRange, -1.0f
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
    ret.rows[3] = vTemp; // 0, 0, fRange * near, 0.0f
    return ret;
#endif
}
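
/* Note: the RH variant differs from the LH one only in sign conventions:
   fRange = far / (near - far) is negative, and row 2's w is -1 instead of +1,
   so the camera looks down -Z and the perspective divide uses w = -z_view. */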

inline M3D_MATRIX M3D_TransformMatrixScaling(float ScaleX, float ScaleY, float ScaleZ) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = ScaleX;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = ScaleY;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = ScaleZ;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_MATRIX ret;
    ret.rows[0] = _mm_set_ps(0, 0, 0, ScaleX);
    ret.rows[1] = _mm_set_ps(0, 0, ScaleY, 0);
    ret.rows[2] = _mm_set_ps(0, ScaleZ, 0, 0);
    ret.rows[3] = M3D_MIdentityR3.v;
    return ret;
#endif
}

inline M3D_MATRIX M3D_TransformMatrixTranslate(float OffsetX, float OffsetY, float OffsetZ) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = 1.0f;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = 1.0f;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = 1.0f;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = OffsetX;
    ret.mat[3][1] = OffsetY;
    ret.mat[3][2] = OffsetZ;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_MATRIX ret;
    ret.rows[0] = M3D_MIdentityR0.v;
    ret.rows[1] = M3D_MIdentityR1.v;
    ret.rows[2] = M3D_MIdentityR2.v;
    ret.rows[3] = M3D_V4Set(OffsetX, OffsetY, OffsetZ, 1.f);
    return ret;
#endif
}
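
/* Composition sketch: with row vectors (v' = v * M) transforms compose left to right,
   so "scale, then rotate, then translate" is written in exactly that order. The
   values below are hypothetical:

    M3D_MATRIX world = M3D_TransformMatrixScaling(2.0f, 2.0f, 2.0f)
                     * M3D_TransformMatrixRotationY(M3D_PIDIV2)
                     * M3D_TransformMatrixTranslate(10.0f, 0.0f, -3.0f);
*/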

inline M3D_MATRIX M3D_TransformMatrixRotationX(float Angle) noexcept {
    float SinAngle;
    float CosAngle;
    M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = 1.0f;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = CosAngle;
    ret.mat[1][2] = SinAngle;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = -SinAngle;
    ret.mat[2][2] = CosAngle;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_VECTOR vSin = _mm_set_ss(SinAngle);
    M3D_VECTOR vCos = _mm_set_ss(CosAngle);
    // x = 0, y = cos, z = sin, w = 0
    vCos = _mm_shuffle_ps(vCos, vSin, _MM_SHUFFLE(3, 0, 0, 3));
    M3D_MATRIX ret;
    ret.rows[0] = M3D_MIdentityR0;
    ret.rows[1] = vCos;
    // x = 0, y = sin, z = cos, w = 0
    vCos = M3D_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 1, 2, 0));
    // x = 0, y = -sin, z = cos, w = 0
    vCos = _mm_mul_ps(vCos, M3D_MNegateY);
    ret.rows[2] = vCos;
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}

inline M3D_MATRIX M3D_TransformMatrixRotationY(float Angle) noexcept {
    float SinAngle;
    float CosAngle;
    M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = CosAngle;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = -SinAngle;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = 1.0f;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = SinAngle;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = CosAngle;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_VECTOR vSin = _mm_set_ss(SinAngle);
    M3D_VECTOR vCos = _mm_set_ss(CosAngle);
    // x = sin, y = 0, z = cos, w = 0
    vSin = _mm_shuffle_ps(vSin, vCos, _MM_SHUFFLE(3, 0, 3, 0));
    M3D_MATRIX ret;
    ret.rows[2] = vSin;
    ret.rows[1] = M3D_MIdentityR1;
    // x = cos, y = 0, z = sin, w = 0
    vSin = M3D_PERMUTE_PS(vSin, _MM_SHUFFLE(3, 0, 1, 2));
    // x = cos, y = 0, z = -sin, w = 0
    vSin = _mm_mul_ps(vSin, M3D_MNegateZ);
    ret.rows[0] = vSin;
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}

inline M3D_MATRIX M3D_TransformMatrixRotationZ(float Angle) noexcept {
    float SinAngle;
    float CosAngle;
    M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = CosAngle;
    ret.mat[0][1] = SinAngle;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = -SinAngle;
    ret.mat[1][1] = CosAngle;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = 1.0f;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_VECTOR vSin = _mm_set_ss(SinAngle);
    M3D_VECTOR vCos = _mm_set_ss(CosAngle);
    // x = cos, y = sin, z = 0, w = 0
    vCos = _mm_unpacklo_ps(vCos, vSin);
    M3D_MATRIX ret;
    ret.rows[0] = vCos;
    // x = sin, y = cos, z = 0, w = 0
    vCos = M3D_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 2, 0, 1));
    // x = -sin, y = cos, z = 0, w = 0
    vCos = _mm_mul_ps(vCos, M3D_MNegateX);
    ret.rows[1] = vCos;
    ret.rows[2] = M3D_MIdentityR2;
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}
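
/* Note: all three rotation matrices follow the row-vector convention used throughout
   this file. For M3D_TransformMatrixRotationZ, the point (1, 0, 0, 1) multiplied on
   the left (v' = v * M) lands on (cos, sin, 0, 1), i.e. the +X axis rotates toward +Y. */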

inline M3D_MATRIX M3D_TransformMatrixViewport(float _w, float _h, float _wOffset, float _hOffset) noexcept {
    const float widthDiv2 = _w / 2;
    const float heightDiv2 = _h / 2;

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = widthDiv2;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = -heightDiv2; // negated so +Y in NDC maps downward in screen space
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = 1.0f; // maxZ-minZ ignored
    ret.mat[2][3] = 0.0f; // minZ ignored

    ret.mat[3][0] = _wOffset + widthDiv2;
    ret.mat[3][1] = _hOffset + heightDiv2;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Set(widthDiv2, 0, 0, 0);
    ret.rows[1] = M3D_V4Set(0, -heightDiv2, 0, 0); // negated so +Y in NDC maps downward in screen space
    ret.rows[2] = M3D_MIdentityR2.v; // maxZ-minZ and minZ are ignored
    ret.rows[3] = M3D_V4Set(_wOffset + widthDiv2, _hOffset + heightDiv2, 0, 1);
    return ret;
#endif
}
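
/* Usage sketch (illustrative only; a hypothetical 1280x720 render target):

    // Maps NDC x,y in [-1, 1] to pixel coordinates, flipping Y so +Y points down
    M3D_MATRIX screen = M3D_TransformMatrixViewport(1280.0f, 720.0f, 0.0f, 0.0f);
*/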