1484 lines
55 KiB
C++
1484 lines
55 KiB
C++
#pragma once
|
|
|
|
|
|
// Builds a matrix from sixteen scalars supplied row by row: the four values
// fR0..fR3 are packed with M3D_V4Set (in that argument order) and stored as
// row R of the matrix.
inline M3D_MATRIX::M3D_MATRIX(
    float f00, float f01, float f02, float f03,
    float f10, float f11, float f12, float f13,
    float f20, float f21, float f22, float f23,
    float f30, float f31, float f32, float f33
) noexcept {
    // Gather the arguments into a table so every row is packed the same way.
    const float elements[4][4] = {
        { f00, f01, f02, f03 },
        { f10, f11, f12, f13 },
        { f20, f21, f22, f23 },
        { f30, f31, f32, f33 }
    };
    for (int r = 0; r < 4; ++r) {
        rows[r] = M3D_V4Set(elements[r][0], elements[r][1], elements[r][2], elements[r][3]);
    }
}
|
|
|
|
// Unary minus: returns a new matrix in which every row is the
// M3D_V4Negate of the corresponding row of this matrix.
inline M3D_MATRIX M3D_MATRIX::operator- () const noexcept {
    M3D_MATRIX negated;
    for (int r = 0; r < 4; ++r) {
        negated.rows[r] = M3D_V4Negate(rows[r]);
    }
    return negated;
}
|
|
|
|
// In-place addition: adds M to this matrix row by row (M3D_V4Add) and
// returns a reference to this matrix.
inline M3D_MATRIX& M3D_MATRIX::operator+= (M3D_MATRIX M) noexcept {
    for (int r = 0; r < 4; ++r) {
        rows[r] = M3D_V4Add(rows[r], M.rows[r]);
    }
    return *this;
}
|
|
// Addition: returns a new matrix whose rows are the M3D_V4Add of the
// matching rows of this matrix and M. Neither operand is modified.
inline M3D_MATRIX M3D_MATRIX::operator+ (M3D_MATRIX M) const noexcept {
    M3D_MATRIX sum;
    for (int r = 0; r < 4; ++r) {
        sum.rows[r] = M3D_V4Add(rows[r], M.rows[r]);
    }
    return sum;
}
|
|
|
|
// In-place subtraction: subtracts M from this matrix row by row
// (M3D_V4Subtract) and returns a reference to this matrix.
inline M3D_MATRIX& M3D_MATRIX::operator-= (M3D_MATRIX M) noexcept {
    for (int r = 0; r < 4; ++r) {
        rows[r] = M3D_V4Subtract(rows[r], M.rows[r]);
    }
    return *this;
}
|
|
// Subtraction: returns a new matrix whose rows are this matrix's rows minus
// the matching rows of M (M3D_V4Subtract). Neither operand is modified.
inline M3D_MATRIX M3D_MATRIX::operator- (M3D_MATRIX M) const noexcept {
    M3D_MATRIX difference;
    for (int r = 0; r < 4; ++r) {
        difference.rows[r] = M3D_V4Subtract(rows[r], M.rows[r]);
    }
    return difference;
}
|
|
|
|
// In-place matrix product: replaces this matrix with
// M3D_MMultiply(*this, M), i.e. this matrix is the left operand.
// Returns a reference to this matrix.
inline M3D_MATRIX& M3D_MATRIX::operator*=(M3D_MATRIX M) noexcept {
    return *this = M3D_MMultiply(*this, M);
}
|
|
// Matrix product: returns M3D_MMultiply(*this, M) with this matrix as the
// left operand. Neither operand is modified.
inline M3D_MATRIX M3D_MATRIX::operator*(M3D_MATRIX M) const noexcept {
    const M3D_MATRIX& lhs = *this;
    return M3D_MMultiply(lhs, M);
}
|
|
|
|
// In-place scalar multiplication: scales every row of this matrix by S
// (M3D_V4Scale) and returns a reference to this matrix.
inline M3D_MATRIX& M3D_MATRIX::operator*= (float S) noexcept {
    for (int r = 0; r < 4; ++r) {
        rows[r] = M3D_V4Scale(rows[r], S);
    }
    return *this;
}
|
|
// Scalar multiplication (matrix * scalar): returns a new matrix whose rows
// are this matrix's rows scaled by S (M3D_V4Scale).
inline M3D_MATRIX M3D_MATRIX::operator* (float S) const noexcept {
    M3D_MATRIX scaled;
    for (int r = 0; r < 4; ++r) {
        scaled.rows[r] = M3D_V4Scale(rows[r], S);
    }
    return scaled;
}
|
|
// Scalar multiplication (scalar * matrix): returns a new matrix whose rows
// are the rows of M scaled by S (M3D_V4Scale).
inline M3D_MATRIX operator* (float S, M3D_MATRIX M) noexcept {
    M3D_MATRIX scaled;
    for (int r = 0; r < 4; ++r) {
        scaled.rows[r] = M3D_V4Scale(M.rows[r], S);
    }
    return scaled;
}
|
|
|
|
// In-place scalar division: divides every element of this matrix by S and
// returns a reference to this matrix. S is replicated across a vector once
// and each row is divided by it. No check is made for S == 0.
inline M3D_MATRIX& M3D_MATRIX::operator/= (float S) noexcept {
#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR divisor = M3D_V4Replicate(S);
    for (int r = 0; r < 4; ++r) {
        rows[r] = M3D_V4Divide(rows[r], divisor);
    }
#else
    const __m128 divisor = _mm_set_ps1(S);
    for (int r = 0; r < 4; ++r) {
        rows[r] = _mm_div_ps(rows[r], divisor);
    }
#endif
    return *this;
}
|
|
// Scalar division: returns a new matrix whose elements are this matrix's
// elements divided by S. S is replicated across a vector once and each row
// is divided by it. No check is made for S == 0.
inline M3D_MATRIX M3D_MATRIX::operator/ (float S) const noexcept {
    M3D_MATRIX quotient;
#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR divisor = M3D_V4Replicate(S);
    for (int r = 0; r < 4; ++r) {
        quotient.rows[r] = M3D_V4Divide(rows[r], divisor);
    }
#else
    const __m128 divisor = _mm_set_ps1(S);
    for (int r = 0; r < 4; ++r) {
        quotient.rows[r] = _mm_div_ps(rows[r], divisor);
    }
#endif
    return quotient;
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Returns a matrix whose rows 0..3 are the library constants
// M3D_MIdentityR0 .. M3D_MIdentityR3 (their .v vector members).
inline M3D_MATRIX M3D_MIdentity() noexcept {
    M3D_MATRIX identity;

    // Upper two rows
    identity.rows[0] = M3D_MIdentityR0.v;
    identity.rows[1] = M3D_MIdentityR1.v;

    // Lower two rows
    identity.rows[2] = M3D_MIdentityR2.v;
    identity.rows[3] = M3D_MIdentityR3.v;

    return identity;
}
|
|
|
|
// Returns the matrix product M1 * M2.
//
// Row r of the result is
//     M1[r][0]*M2.row0 + M1[r][1]*M2.row1 + M1[r][2]*M2.row2 + M1[r][3]*M2.row3
//
// Parameters:
//   M1 - left operand (taken by value).
//   M2 - right operand. It is taken by non-const reference, but this
//        function only reads it.
//
// Three implementations are selected at compile time:
//   DISABLE_INTRINSICS - plain scalar arithmetic on the .mat[][] view.
//   AVX2_INTRINSICS    - two result rows are computed per 256-bit register.
//   otherwise          - one result row at a time with SSE; when
//                        AVX_INTRINSICS is defined the splats use
//                        _mm_broadcast_ss instead of a permute.
inline M3D_MATRIX M3D_MMultiply(M3D_MATRIX M1, M3D_MATRIX& M2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    for (int r = 0; r < 4; ++r) {
        // Cache the invariants of this row in locals
        const float x = M1.mat[r][0];
        const float y = M1.mat[r][1];
        const float z = M1.mat[r][2];
        const float w = M1.mat[r][3];
        // Dot the cached row with each column of M2
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = (M2.mat[0][c] * x) + (M2.mat[1][c] * y) + (M2.mat[2][c] * z) + (M2.mat[3][c] * w);
        }
    }
    return ret;
#elif defined(AVX2_INTRINSICS)
    // t0 = M1 rows {0,1}, t1 = M1 rows {2,3} (one row per 128-bit half)
    __m256 t0 = _mm256_castps128_ps256(M1.rows[0]);
    t0 = _mm256_insertf128_ps(t0, M1.rows[1], 1);
    __m256 t1 = _mm256_castps128_ps256(M1.rows[2]);
    t1 = _mm256_insertf128_ps(t1, M1.rows[3], 1);

    // u0 = M2 rows {0,1}, u1 = M2 rows {2,3}
    __m256 u0 = _mm256_castps128_ps256(M2.rows[0]);
    u0 = _mm256_insertf128_ps(u0, M2.rows[1], 1);
    __m256 u1 = _mm256_castps128_ps256(M2.rows[2]);
    u1 = _mm256_insertf128_ps(u1, M2.rows[3], 1);

    // X components of M1 times M2 row 0 (row 0 duplicated in both halves)
    __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
    __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00);
    __m256 c0 = _mm256_mul_ps(a0, b0);
    __m256 c1 = _mm256_mul_ps(a1, b0);

    // + Y components of M1 times M2 row 1
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    b0 = _mm256_permute2f128_ps(u0, u0, 0x11);
    __m256 c2 = _mm256_fmadd_ps(a0, b0, c0);
    __m256 c3 = _mm256_fmadd_ps(a1, b0, c1);

    // Z components of M1 times M2 row 2
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00);
    __m256 c4 = _mm256_mul_ps(a0, b1);
    __m256 c5 = _mm256_mul_ps(a1, b1);

    // + W components of M1 times M2 row 3
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);

    // Combine the (X,Y) and (Z,W) partial sums
    t0 = _mm256_add_ps(c2, c6);
    t1 = _mm256_add_ps(c3, c7);

    // Split the two 256-bit results back into four rows
    M3D_MATRIX ret;
    ret.rows[0] = _mm256_castps256_ps128(t0);
    ret.rows[1] = _mm256_extractf128_ps(t0, 1);
    ret.rows[2] = _mm256_castps256_ps128(t1);
    ret.rows[3] = _mm256_extractf128_ps(t1, 1);
    return ret;
#else
    M3D_MATRIX ret;
    for (int r = 0; r < 4; ++r) {
        // Splat the component X,Y,Z then W of row r of M1
#ifdef AVX_INTRINSICS
        // Use the library's own vector type here, matching the non-AVX branch
        // (this branch previously used the DirectXMath name XMVECTOR).
        const float* pRow = reinterpret_cast<const float*>(&M1.rows[r]);
        M3D_VECTOR vX = _mm_broadcast_ss(pRow + 0);
        M3D_VECTOR vY = _mm_broadcast_ss(pRow + 1);
        M3D_VECTOR vZ = _mm_broadcast_ss(pRow + 2);
        M3D_VECTOR vW = _mm_broadcast_ss(pRow + 3);
#else
        // Use vW to hold the original row
        M3D_VECTOR vW = M1.rows[r];
        M3D_VECTOR vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
        M3D_VECTOR vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
        M3D_VECTOR vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
        vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
        // Scale each row of M2 by the matching splatted component
        vX = _mm_mul_ps(vX, M2.rows[0]);
        vY = _mm_mul_ps(vY, M2.rows[1]);
        vZ = _mm_mul_ps(vZ, M2.rows[2]);
        vW = _mm_mul_ps(vW, M2.rows[3]);
        // Perform a binary add to reduce cumulative errors
        vX = _mm_add_ps(vX, vZ);
        vY = _mm_add_ps(vY, vW);
        vX = _mm_add_ps(vX, vY);
        ret.rows[r] = vX;
    }
    return ret;
#endif
}
|
|
|
|
// Returns the transpose of M (element [r][c] of the result is element
// [c][r] of M). The implementation is chosen at compile time: generic
// merge helpers (DISABLE_INTRINSICS), 256-bit unpack/permute
// (AVX2_INTRINSICS), or two rounds of SSE shuffles otherwise.
inline M3D_MATRIX M3D_MTranspose(M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    // Source layout:
    //
    //     m00 m01 m02 m03
    //     m10 m11 m12 m13
    //     m20 m21 m22 m23
    //     m30 m31 m32 m33

    // Pass 1: interleave rows 0/2 and rows 1/3.
    M3D_MATRIX interleaved;
    interleaved.rows[0] = M3D_V4MergeXY(M.rows[0], M.rows[2]); // m00 m20 m01 m21
    interleaved.rows[1] = M3D_V4MergeXY(M.rows[1], M.rows[3]); // m10 m30 m11 m31
    interleaved.rows[2] = M3D_V4MergeZW(M.rows[0], M.rows[2]); // m02 m22 m03 m23
    interleaved.rows[3] = M3D_V4MergeZW(M.rows[1], M.rows[3]); // m12 m32 m13 m33

    // Pass 2: interleave the intermediate rows to finish the columns.
    M3D_MATRIX transposed;
    transposed.rows[0] = M3D_V4MergeXY(interleaved.rows[0], interleaved.rows[1]); // m00 m10 m20 m30
    transposed.rows[1] = M3D_V4MergeZW(interleaved.rows[0], interleaved.rows[1]); // m01 m11 m21 m31
    transposed.rows[2] = M3D_V4MergeXY(interleaved.rows[2], interleaved.rows[3]); // m02 m12 m22 m32
    transposed.rows[3] = M3D_V4MergeZW(interleaved.rows[2], interleaved.rows[3]); // m03 m13 m23 m33
    return transposed;
#elif defined(AVX2_INTRINSICS)
    // Pack rows {0,1} and rows {2,3} into one 256-bit register each.
    __m256 rows01 = _mm256_insertf128_ps(_mm256_castps128_ps256(M.rows[0]), M.rows[1], 1);
    __m256 rows23 = _mm256_insertf128_ps(_mm256_castps128_ps256(M.rows[2]), M.rows[3], 1);

    // First unpack / lane-permute round.
    __m256 lo = _mm256_unpacklo_ps(rows01, rows23);
    __m256 hi = _mm256_unpackhi_ps(rows01, rows23);
    const __m256 lowHalves = _mm256_permute2f128_ps(lo, hi, 0x20);
    const __m256 highHalves = _mm256_permute2f128_ps(lo, hi, 0x31);

    // Second unpack / lane-permute round.
    lo = _mm256_unpacklo_ps(lowHalves, highHalves);
    hi = _mm256_unpackhi_ps(lowHalves, highHalves);
    rows01 = _mm256_permute2f128_ps(lo, hi, 0x20);
    rows23 = _mm256_permute2f128_ps(lo, hi, 0x31);

    // Split the two registers back into four rows.
    M3D_MATRIX transposed;
    transposed.rows[0] = _mm256_castps256_ps128(rows01);
    transposed.rows[1] = _mm256_extractf128_ps(rows01, 1);
    transposed.rows[2] = _mm256_castps256_ps128(rows23);
    transposed.rows[3] = _mm256_extractf128_ps(rows23, 1);
    return transposed;
#else
    // Round 1: gather the XY and ZW halves of row pairs (0,1) and (2,3).
    // Below, "x,y,z,w" name the rows 0..3 and ".x/.y/.z/.w" their lanes.
    const M3D_VECTOR xy01 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(1, 0, 1, 0)); // x.x,x.y,y.x,y.y
    const M3D_VECTOR zw01 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(3, 2, 3, 2)); // x.z,x.w,y.z,y.w
    const M3D_VECTOR xy23 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(1, 0, 1, 0)); // z.x,z.y,w.x,w.y
    const M3D_VECTOR zw23 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(3, 2, 3, 2)); // z.z,z.w,w.z,w.w

    // Round 2: pick even / odd lanes to assemble each output row.
    M3D_MATRIX transposed;
    transposed.rows[0] = _mm_shuffle_ps(xy01, xy23, _MM_SHUFFLE(2, 0, 2, 0)); // x.x,y.x,z.x,w.x
    transposed.rows[1] = _mm_shuffle_ps(xy01, xy23, _MM_SHUFFLE(3, 1, 3, 1)); // x.y,y.y,z.y,w.y
    transposed.rows[2] = _mm_shuffle_ps(zw01, zw23, _MM_SHUFFLE(2, 0, 2, 0)); // x.z,y.z,z.z,w.z
    transposed.rows[3] = _mm_shuffle_ps(zw01, zw23, _MM_SHUFFLE(3, 1, 3, 1)); // x.w,y.w,z.w,w.w
    return transposed;
#endif
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Transforms the x, y, z components of V by M and returns
//     V.x * M.row0 + V.y * M.row1 + V.z * M.row2 + M.row3
// Row 3 is added unscaled, so the w component of V is ignored (the input is
// treated as if its w were 1). All four lanes of the result are returned.
inline M3D_VECTOR M3D_V3Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR splatZ = M3D_V4SplatZ(V);
    const M3D_VECTOR splatY = M3D_V4SplatY(V);
    const M3D_VECTOR splatX = M3D_V4SplatX(V);

    // Accumulate from the translation row outwards: Z term, then Y, then X.
    M3D_VECTOR accum = M3D_V4MultiplyAdd(splatZ, M.rows[2], M.rows[3]);
    accum = M3D_V4MultiplyAdd(splatY, M.rows[1], accum);
    accum = M3D_V4MultiplyAdd(splatX, M.rows[0], accum);
    return accum;
#else
    // Z * row2 + row3
    M3D_VECTOR accum = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    accum = M3D_FMADD_PS(accum, M.rows[2], M.rows[3]);

    // + Y * row1
    M3D_VECTOR splat = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    accum = M3D_FMADD_PS(splat, M.rows[1], accum);

    // + X * row0
    splat = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
    accum = M3D_FMADD_PS(splat, M.rows[0], accum);

    return accum;
#endif
}
|
|
|
|
// Stream version of M3D_V3Transform: reads VectorCount 3-component vectors
// and writes one 4-component result per input, each computed as
//     x * M.row0 + y * M.row1 + z * M.row2 + M.row3
//
// Parameters:
//   pOutputStream - first output element (M3D_F4).
//   OutputStride  - distance in bytes between consecutive outputs.
//   pInputStream  - first input element (M3D_F3).
//   InputStride   - distance in bytes between consecutive inputs.
//   VectorCount   - number of vectors to transform.
//   M             - transformation matrix.
//
// In the intrinsics build, tightly packed input
// (InputStride == sizeof(M3D_F3)) is processed four vectors at a time, and
// M3D_STREAM_PS stores are used when both the output pointer and
// OutputStride are multiples of 16 bytes; otherwise unaligned stores are used.
// Any remaining vectors are handled one at a time.
inline void M3D_V3Transform(
    M3D_F4* pOutputStream,
    size_t OutputStride,
    const M3D_F3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    M3D_MATRIX M
) noexcept {
    // Walk both streams as raw bytes so the strides can be applied directly.
    const uint8_t* pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    uint8_t* pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

#ifdef DISABLE_INTRINSICS
    for (size_t n = 0; n < VectorCount; ++n) {
        const M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
        const M3D_VECTOR Z = M3D_V4SplatZ(V);
        const M3D_VECTOR Y = M3D_V4SplatY(V);
        const M3D_VECTOR X = M3D_V4SplatX(V);

        M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, row2, row3);
        Result = M3D_V4MultiplyAdd(Y, row1, Result);
        Result = M3D_V4MultiplyAdd(X, row0, Result);

        M3D_V4StoreF4(reinterpret_cast<M3D_F4*>(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
#else
    // Per-vector kernel shared by every loop below:
    // (Z * row2 + row3) + Y * row1 + X * row0, evaluated in that order.
    const auto transformOne = [&row0, &row1, &row2, &row3](M3D_VECTOR v) noexcept -> M3D_VECTOR {
        const M3D_VECTOR Z = M3D_PERMUTE_PS(v, _MM_SHUFFLE(2, 2, 2, 2));
        const M3D_VECTOR Y = M3D_PERMUTE_PS(v, _MM_SHUFFLE(1, 1, 1, 1));
        const M3D_VECTOR X = M3D_PERMUTE_PS(v, _MM_SHUFFLE(0, 0, 0, 0));

        M3D_VECTOR acc = M3D_FMADD_PS(Z, row2, row3);
        acc = _mm_add_ps(acc, _mm_mul_ps(Y, row1));
        acc = _mm_add_ps(acc, _mm_mul_ps(X, row0));
        return acc;
    };

    // Streaming stores are only used when every output address is 16-byte aligned.
    const bool alignedOutput =
        !(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF);

    size_t i = 0;
    const size_t four = VectorCount >> 2;
    if (four > 0 && InputStride == sizeof(M3D_F3)) {
        if (alignedOutput) {
            // Packed input, aligned output
            for (size_t j = 0; j < four; ++j) {
                // Four packed F3 values occupy exactly three 16-byte loads.
                // NOTE: the names V1/L2/L3 are kept as-is because the unpack
                // macro may refer to them literally.
                __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                pInputVector += sizeof(M3D_F3) * 4;

                // Unpack the 4 vectors (.w components are junk);
                // the macro provides V2, V3 and V4.
                M3D_UNPACK3INTO4(V1, L2, L3);

                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), transformOne(V1));
                pOutputVector += OutputStride;

                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), transformOne(V2));
                pOutputVector += OutputStride;

                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), transformOne(V3));
                pOutputVector += OutputStride;

                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), transformOne(V4));
                pOutputVector += OutputStride;

                i += 4;
            }
        } else {
            // Packed input, unaligned output
            for (size_t j = 0; j < four; ++j) {
                __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                pInputVector += sizeof(M3D_F3) * 4;

                // Unpack the 4 vectors (.w components are junk);
                // the macro provides V2, V3 and V4.
                M3D_UNPACK3INTO4(V1, L2, L3);

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), transformOne(V1));
                pOutputVector += OutputStride;

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), transformOne(V2));
                pOutputVector += OutputStride;

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), transformOne(V3));
                pOutputVector += OutputStride;

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), transformOne(V4));
                pOutputVector += OutputStride;

                i += 4;
            }
        }
    }

    // Remaining vectors (all of them when the input is not tightly packed).
    if (alignedOutput) {
        // Aligned output
        for (; i < VectorCount; ++i) {
            const M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
            pInputVector += InputStride;

            M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), transformOne(V));
            pOutputVector += OutputStride;
        }
    } else {
        // Unaligned output
        for (; i < VectorCount; ++i) {
            const M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
            pInputVector += InputStride;

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), transformOne(V));
            pOutputVector += OutputStride;
        }
    }

    M3D_SFENCE();
#endif
}
|
|
|
|
// Transforms the x, y, z components of V by M exactly as M3D_V3Transform
// does (x * row0 + y * row1 + z * row2 + row3), then divides every lane of
// the result by the result's own w component. No check is made for w == 0.
inline M3D_VECTOR M3D_V3TransformPersDiv(M3D_VECTOR V, M3D_MATRIX M) noexcept {
    const M3D_VECTOR splatZ = M3D_V4SplatZ(V);
    const M3D_VECTOR splatY = M3D_V4SplatY(V);
    const M3D_VECTOR splatX = M3D_V4SplatX(V);

    // Accumulate from the translation row outwards: Z term, then Y, then X.
    M3D_VECTOR projected = M3D_V4MultiplyAdd(splatZ, M.rows[2], M.rows[3]);
    projected = M3D_V4MultiplyAdd(splatY, M.rows[1], projected);
    projected = M3D_V4MultiplyAdd(splatX, M.rows[0], projected);

    // Perspective divide: scale all lanes by 1 / w.
    const M3D_VECTOR splatW = M3D_V4SplatW(projected);
    return M3D_V4Divide(projected, splatW);
}
|
|
|
|
inline void M3D_V3TransformPersDiv(
|
|
M3D_F3* pOutputStream,
|
|
size_t OutputStride,
|
|
const M3D_F3* pInputStream,
|
|
size_t InputStride,
|
|
size_t VectorCount,
|
|
M3D_MATRIX M
|
|
) noexcept {
|
|
auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
|
|
auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
|
|
|
|
const M3D_VECTOR row0 = M.rows[0];
|
|
const M3D_VECTOR row1 = M.rows[1];
|
|
const M3D_VECTOR row2 = M.rows[2];
|
|
const M3D_VECTOR row3 = M.rows[3];
|
|
|
|
#ifdef DISABLE_INTRINSICS
|
|
for (size_t i = 0; i < VectorCount; i++)
|
|
{
|
|
M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
|
|
M3D_VECTOR Z = M3D_V4SplatZ(V);
|
|
M3D_VECTOR Y = M3D_V4SplatY(V);
|
|
M3D_VECTOR X = M3D_V4SplatX(V);
|
|
|
|
M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, row2, row3);
|
|
Result = M3D_V4MultiplyAdd(Y, row1, Result);
|
|
Result = M3D_V4MultiplyAdd(X, row0, Result);
|
|
|
|
M3D_VECTOR W = M3D_V4SplatW(Result);
|
|
|
|
Result = M3D_V4Divide(Result, W);
|
|
|
|
M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), Result);
|
|
|
|
pInputVector += InputStride;
|
|
pOutputVector += OutputStride;
|
|
}
|
|
#else
|
|
size_t i = 0;
|
|
size_t four = VectorCount >> 2;
|
|
if (four > 0) {
|
|
if (InputStride == sizeof(M3D_F3)) {
|
|
if (OutputStride == sizeof(M3D_F3)) {
|
|
if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF)) {
|
|
// Packed input, aligned & packed output
|
|
for (size_t j = 0; j < four; ++j) {
|
|
__m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
|
|
__m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
|
|
__m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
|
|
pInputVector += sizeof(M3D_F3) * 4;
|
|
|
|
// Unpack the 4 vectors (.w components are junk)
|
|
M3D_UNPACK3INTO4(V1, L2, L3);
|
|
|
|
// Result 1
|
|
M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
|
|
M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
|
|
M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
|
|
M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
V1 = _mm_div_ps(vTemp, W);
|
|
|
|
// Result 2
|
|
Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
|
|
Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
|
|
X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
vTemp2 = _mm_mul_ps(Y, row1);
|
|
vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
V2 = _mm_div_ps(vTemp, W);
|
|
|
|
// Result 3
|
|
Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
|
|
Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
|
|
X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
vTemp2 = _mm_mul_ps(Y, row1);
|
|
vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
V3 = _mm_div_ps(vTemp, W);
|
|
|
|
// Result 4
|
|
Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
|
|
Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
|
|
X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
vTemp2 = _mm_mul_ps(Y, row1);
|
|
vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
V4 = _mm_div_ps(vTemp, W);
|
|
|
|
// Pack and store the vectors
|
|
M3D_PACK4INTO3(vTemp);
|
|
M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), V1);
|
|
M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
|
|
M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 32), V3);
|
|
pOutputVector += sizeof(M3D_F3) * 4;
|
|
i += 4;
|
|
}
|
|
} else {
|
|
// Packed input, unaligned & packed output
|
|
for (size_t j = 0; j < four; ++j) {
|
|
__m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
|
|
__m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
|
|
__m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
|
|
pInputVector += sizeof(M3D_F3) * 4;
|
|
|
|
// Unpack the 4 vectors (.w components are junk)
|
|
M3D_UNPACK3INTO4(V1, L2, L3);
|
|
|
|
// Result 1
|
|
M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
|
|
M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
|
|
M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
|
|
M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
V1 = _mm_div_ps(vTemp, W);
|
|
|
|
// Result 2
|
|
Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
|
|
Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
|
|
X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
vTemp2 = _mm_mul_ps(Y, row1);
|
|
vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
V2 = _mm_div_ps(vTemp, W);
|
|
|
|
// Result 3
|
|
Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
|
|
Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
|
|
X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
vTemp2 = _mm_mul_ps(Y, row1);
|
|
vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
V3 = _mm_div_ps(vTemp, W);
|
|
|
|
// Result 4
|
|
Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
|
|
Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
|
|
X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
vTemp2 = _mm_mul_ps(Y, row1);
|
|
vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
V4 = _mm_div_ps(vTemp, W);
|
|
|
|
// Pack and store the vectors
|
|
M3D_PACK4INTO3(vTemp);
|
|
_mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), V1);
|
|
_mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
|
|
_mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 32), V3);
|
|
pOutputVector += sizeof(M3D_F3) * 4;
|
|
i += 4;
|
|
}
|
|
}
|
|
} else {
|
|
// Packed input, unpacked output
|
|
for (size_t j = 0; j < four; ++j)
|
|
{
|
|
__m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
|
|
__m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
|
|
__m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
|
|
pInputVector += sizeof(M3D_F3) * 4;
|
|
|
|
// Unpack the 4 vectors (.w components are junk)
|
|
M3D_UNPACK3INTO4(V1, L2, L3);
|
|
|
|
// Result 1
|
|
M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
|
|
M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
|
|
M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
|
|
M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
vTemp = _mm_div_ps(vTemp, W);
|
|
M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
|
|
pOutputVector += OutputStride;
|
|
|
|
// Result 2
|
|
Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
|
|
Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
|
|
X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
vTemp2 = _mm_mul_ps(Y, row1);
|
|
vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
vTemp = _mm_div_ps(vTemp, W);
|
|
M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
|
|
pOutputVector += OutputStride;
|
|
|
|
// Result 3
|
|
Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
|
|
Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
|
|
X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
vTemp2 = _mm_mul_ps(Y, row1);
|
|
vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
vTemp = _mm_div_ps(vTemp, W);
|
|
M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
|
|
pOutputVector += OutputStride;
|
|
|
|
// Result 4
|
|
Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
|
|
Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
|
|
X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
vTemp2 = _mm_mul_ps(Y, row1);
|
|
vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
vTemp = _mm_div_ps(vTemp, W);
|
|
M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
|
|
pOutputVector += OutputStride;
|
|
|
|
i += 4;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (; i < VectorCount; i++) {
|
|
M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
|
|
pInputVector += InputStride;
|
|
|
|
M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
|
|
M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
|
|
M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
|
|
M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
|
|
M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
|
|
vTemp = _mm_add_ps(vTemp, vTemp2);
|
|
vTemp = _mm_add_ps(vTemp, vTemp3);
|
|
|
|
M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
|
|
|
|
vTemp = _mm_div_ps(vTemp, W);
|
|
|
|
M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), vTemp);
|
|
pOutputVector += OutputStride;
|
|
}
|
|
|
|
M3D_SFENCE();
|
|
#endif
|
|
}
|
|
|
|
// Remaps a vector from normalized device coordinates to viewport coordinates.
//
//   V              : vector to remap.
//   vpX, vpY       : viewport origin; added to X / Y together with half the viewport size.
//   vpW, vpH       : viewport width and height; half of each is the X / Y scale.
//   vpMinZ, vpMaxZ : depth range; Z is scaled by (vpMaxZ - vpMinZ) and offset by vpMinZ.
//
// Returns M3D_V4MultiplyAdd(V, scale, offset). The W lane of both the scale and the
// offset vector is zero.
inline M3D_VECTOR M3D_V3TransformNDCToViewport(M3D_VECTOR V, float vpX, float vpY, float vpW, float vpH, float vpMinZ, float vpMaxZ) noexcept {
    const float halfW = vpW * 0.5f;
    const float halfH = vpH * 0.5f;
    const float depthRange = vpMaxZ - vpMinZ;

    // The Y scale is negated, so the Y axis is flipped by the remap.
    const M3D_VECTOR scale = M3D_V4Set(halfW, -halfH, depthRange, 0.0f);
    const M3D_VECTOR offset = M3D_V4Set(vpX + halfW, vpY + halfH, vpMinZ, 0.0f);

    return M3D_V4MultiplyAdd(V, scale, offset);
}
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Builds a left-handed view matrix for a camera at viewPos aimed at focusPos.
// The view direction (focusPos - viewPos) is forwarded, together with viewPos and
// upDirection, to M3D_TransformMatrixCamLookToLH.
inline M3D_MATRIX M3D_TransformMatrixCamLookAtLH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
    return M3D_TransformMatrixCamLookToLH(viewPos, M3D_V4Subtract(focusPos, viewPos), upDirection);
}
|
|
|
|
// Builds a right-handed view matrix for a camera at viewPos aimed at focusPos.
// Implemented on top of the left-handed builder by passing the reversed view
// direction (viewPos - focusPos).
inline M3D_MATRIX M3D_TransformMatrixCamLookAtRH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
    return M3D_TransformMatrixCamLookToLH(viewPos, M3D_V4Subtract(viewPos, focusPos), upDirection);
}
|
|
|
|
// Builds a left-handed view matrix from a camera position, a view direction and
// an up hint.
//
//   viewPos       : camera position.
//   viewDirection : direction the camera looks along (normalized here).
//   upDirection   : up hint, used only through cross(upDirection, forward).
//
// Three mutually orthogonal unit axes are derived, each is paired with its dot
// product against -viewPos in the W lane, and the result is transposed.
// No validation is done: a zero-length viewDirection or an upDirection parallel
// to it is passed straight into M3D_V3Normalize.
inline M3D_MATRIX M3D_TransformMatrixCamLookToLH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
    // Third basis axis: the normalized view direction.
    const M3D_VECTOR axis2 = M3D_V3Normalize(viewDirection);
    // First basis axis: normalize(upDirection x axis2).
    const M3D_VECTOR axis0 = M3D_V3Normalize(M3D_V3Cross(upDirection, axis2));
    // Second basis axis: axis2 x axis0. Both inputs are unit length and
    // orthogonal, so the cross product needs no further normalization.
    const M3D_VECTOR axis1 = M3D_V3Cross(axis2, axis0);

    // Translation terms: each axis dotted with the negated camera position.
    const M3D_VECTOR negPos = M3D_V4Negate(viewPos);
    const M3D_VECTOR shift0 = M3D_V3Dot(axis0, negPos);
    const M3D_VECTOR shift1 = M3D_V3Dot(axis1, negPos);
    const M3D_VECTOR shift2 = M3D_V3Dot(axis2, negPos);

    // Each row is (axis.x, axis.y, axis.z, shift); the 1110 mask takes XYZ from
    // the second operand and W from the first.
    M3D_MATRIX basis;
    basis.rows[0] = M3D_V4Select(shift0, axis0, M3D_MSelect1110.v);
    basis.rows[1] = M3D_V4Select(shift1, axis1, M3D_MSelect1110.v);
    basis.rows[2] = M3D_V4Select(shift2, axis2, M3D_MSelect1110.v);
    basis.rows[3] = M3D_MIdentityR3.v;

    // Transposing moves the axes into columns and the shifts into the last row.
    return M3D_MTranspose(basis);
}
|
|
|
|
// Builds a right-handed view matrix from a camera position, a view direction and
// an up hint, by handing the negated view direction to the left-handed builder.
inline M3D_MATRIX M3D_TransformMatrixCamLookToRH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
    return M3D_TransformMatrixCamLookToLH(viewPos, M3D_V4Negate(viewDirection), upDirection);
}
|
|
|
|
// Builds a left-handed perspective projection matrix from a field of view.
//
//   fov   : field-of-view angle; the Y scale is cos(fov / 2) / sin(fov / 2).
//           Passed to M3D_ScalarSinCos, so presumably radians - confirm.
//   ratio : divides the Y scale to obtain the X scale (aspect ratio).
//   near  : near clip distance.
//   far   : far clip distance.
//
// Resulting rows:
//   (xScale, 0,      0,             0)
//   (0,      yScale, 0,             0)
//   (0,      0,      zScale,        1)
//   (0,      0,      -zScale * near, 0)     with zScale = far / (far - near)
//
// Arguments are not validated: far == near, ratio == 0 or a zero sine all end in
// a floating-point division by zero.
inline M3D_MATRIX M3D_TransformMatrixFrustrumFovLH(float fov, float ratio, float near, float far) noexcept {
    float sinHalfFov;
    float cosHalfFov;
    M3D_ScalarSinCos(&sinHalfFov, &cosHalfFov, 0.5f * fov);

    const float zScale = far / (far - near);
    const float yScale = cosHalfFov / sinHalfFov;
    const float xScale = yScale / ratio;
    const float zOffset = -zScale * near;

    M3D_MATRIX ret;
#ifdef DISABLE_INTRINSICS
    const float cells[4][4] = {
        { xScale, 0.0f,   0.0f,    0.0f },
        { 0.0f,   yScale, 0.0f,    0.0f },
        { 0.0f,   0.0f,   zScale,  1.0f },
        { 0.0f,   0.0f,   zOffset, 0.0f },
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = cells[r][c];
        }
    }
#else
    // All four non-trivial terms packed into one register: (xScale, yScale, zScale, zOffset).
    const M3D_VECTOR packed = { xScale, yScale, zScale, zOffset };

    // Row 0: keep lane X, zero the rest -> (xScale, 0, 0, 0).
    ret.rows[0] = _mm_move_ss(_mm_setzero_ps(), packed);
    // Row 1: mask out everything but lane Y -> (0, yScale, 0, 0).
    ret.rows[1] = _mm_and_ps(packed, M3D_MMaskY);

    // tail = (zScale, zOffset, 0, 1): upper half of `packed` followed by the
    // upper half of the identity's last row.
    const M3D_VECTOR tail = _mm_shuffle_ps(packed, M3D_MIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
    // Row 2: (0, 0, zScale, 1).
    const M3D_VECTOR row2 = _mm_shuffle_ps(_mm_setzero_ps(), tail, _MM_SHUFFLE(3, 0, 0, 0));
    ret.rows[2] = row2;
    // Row 3: (0, 0, zOffset, 0); the two leading zeros are taken from row2.
    ret.rows[3] = _mm_shuffle_ps(row2, tail, _MM_SHUFFLE(2, 1, 0, 0));
#endif
    return ret;
}
|
|
|
|
// Builds a right-handed perspective projection matrix from a field of view.
//
//   fov   : field-of-view angle; the Y scale is cos(fov / 2) / sin(fov / 2).
//           Passed to M3D_ScalarSinCos, so presumably radians - confirm.
//   ratio : divides the Y scale to obtain the X scale (aspect ratio).
//   near  : near clip distance.
//   far   : far clip distance.
//
// Resulting rows:
//   (xScale, 0,      0,             0)
//   (0,      yScale, 0,             0)
//   (0,      0,      zScale,       -1)
//   (0,      0,      zScale * near, 0)      with zScale = far / (near - far)
//
// Arguments are not validated: far == near, ratio == 0 or a zero sine all end in
// a floating-point division by zero.
inline M3D_MATRIX M3D_TransformMatrixFrustrumFovRH(float fov, float ratio, float near, float far) noexcept {
    float sinHalfFov;
    float cosHalfFov;
    M3D_ScalarSinCos(&sinHalfFov, &cosHalfFov, 0.5f * fov);

    const float zScale = far / (near - far);
    const float yScale = cosHalfFov / sinHalfFov;
    const float xScale = yScale / ratio;
    const float zOffset = zScale * near;

    M3D_MATRIX ret;
#ifdef DISABLE_INTRINSICS
    const float cells[4][4] = {
        { xScale, 0.0f,   0.0f,    0.0f  },
        { 0.0f,   yScale, 0.0f,    0.0f  },
        { 0.0f,   0.0f,   zScale,  -1.0f },
        { 0.0f,   0.0f,   zOffset, 0.0f  },
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = cells[r][c];
        }
    }
#else
    // All four non-trivial terms packed into one register: (xScale, yScale, zScale, zOffset).
    const M3D_VECTOR packed = { xScale, yScale, zScale, zOffset };

    // Row 0: keep lane X, zero the rest -> (xScale, 0, 0, 0).
    ret.rows[0] = _mm_move_ss(_mm_setzero_ps(), packed);
    // Row 1: mask out everything but lane Y -> (0, yScale, 0, 0).
    ret.rows[1] = _mm_and_ps(packed, M3D_MMaskY);

    // tail = (zScale, zOffset, 0, -1): upper half of `packed` followed by the
    // upper half of the negated identity row.
    const M3D_VECTOR tail = _mm_shuffle_ps(packed, M3D_MIdentityR3_n, _MM_SHUFFLE(3, 2, 3, 2));
    // Row 2: (0, 0, zScale, -1).
    const M3D_VECTOR row2 = _mm_shuffle_ps(_mm_setzero_ps(), tail, _MM_SHUFFLE(3, 0, 0, 0));
    ret.rows[2] = row2;
    // Row 3: (0, 0, zOffset, 0); the two leading zeros are taken from row2.
    ret.rows[3] = _mm_shuffle_ps(row2, tail, _MM_SHUFFLE(2, 1, 0, 0));
#endif
    return ret;
}
|
|
|
|
// Builds a translation matrix from a vector.
// The result is the identity matrix with the X, Y and Z lanes of Offset written
// into the last row; the W lane of Offset is ignored (the last row's W stays 1).
inline M3D_MATRIX M3D_TransformMatrixTranslate(M3D_VECTOR Offset) noexcept {
    M3D_MATRIX xform;
#ifdef DISABLE_INTRINSICS
    const float cells[4][4] = {
        { 1.0f,          0.0f,          0.0f,          0.0f },
        { 0.0f,          1.0f,          0.0f,          0.0f },
        { 0.0f,          0.0f,          1.0f,          0.0f },
        { Offset.v4f[0], Offset.v4f[1], Offset.v4f[2], 1.0f },
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            xform.mat[r][c] = cells[r][c];
        }
    }
#else
    xform.rows[0] = M3D_MIdentityR0.v;
    xform.rows[1] = M3D_MIdentityR1.v;
    xform.rows[2] = M3D_MIdentityR2.v;
    // XYZ come from Offset, W from the identity row.
    xform.rows[3] = M3D_V4Select(M3D_MIdentityR3.v, Offset, M3D_MSelect1110.v);
#endif
    return xform;
}
|
|
|
|
// Builds a scaling matrix from three per-axis factors.
// The result is diagonal: (ScaleX, ScaleY, ScaleZ, 1).
inline M3D_MATRIX M3D_TransformMatrixScale(float ScaleX, float ScaleY, float ScaleZ) noexcept {
    M3D_MATRIX xform;
#ifdef DISABLE_INTRINSICS
    const float cells[4][4] = {
        { ScaleX, 0.0f,   0.0f,   0.0f },
        { 0.0f,   ScaleY, 0.0f,   0.0f },
        { 0.0f,   0.0f,   ScaleZ, 0.0f },
        { 0.0f,   0.0f,   0.0f,   1.0f },
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            xform.mat[r][c] = cells[r][c];
        }
    }
#else
    // _mm_set_ps takes its lanes in (w, z, y, x) order.
    xform.rows[0] = _mm_set_ps(0.0f, 0.0f, 0.0f, ScaleX);
    xform.rows[1] = _mm_set_ps(0.0f, 0.0f, ScaleY, 0.0f);
    xform.rows[2] = _mm_set_ps(0.0f, ScaleZ, 0.0f, 0.0f);
    xform.rows[3] = M3D_MIdentityR3.v;
#endif
    return xform;
}
|
|
|
|
// Builds a scaling matrix from a vector of per-axis factors.
// The X, Y and Z lanes of Scale go on the diagonal; the W lane is ignored and
// the last row is the identity row.
inline M3D_MATRIX M3D_TransformMatrixScale(M3D_VECTOR Scale) noexcept {
    M3D_MATRIX xform;
#ifdef DISABLE_INTRINSICS
    const float cells[4][4] = {
        { Scale.v4f[0], 0.0f,         0.0f,         0.0f },
        { 0.0f,         Scale.v4f[1], 0.0f,         0.0f },
        { 0.0f,         0.0f,         Scale.v4f[2], 0.0f },
        { 0.0f,         0.0f,         0.0f,         1.0f },
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            xform.mat[r][c] = cells[r][c];
        }
    }
#else
    // Each mask keeps a single lane of Scale and clears the other three.
    xform.rows[0] = _mm_and_ps(Scale, M3D_MMaskX);
    xform.rows[1] = _mm_and_ps(Scale, M3D_MMaskY);
    xform.rows[2] = _mm_and_ps(Scale, M3D_MMaskZ);
    xform.rows[3] = M3D_MIdentityR3.v;
#endif
    return xform;
}
|
|
|
|
// Builds a translation matrix from three scalar offsets.
// The result is the identity matrix with (OffsetX, OffsetY, OffsetZ, 1) as its
// last row.
inline M3D_MATRIX M3D_TransformMatrixTranslate(float OffsetX, float OffsetY, float OffsetZ) noexcept {
    M3D_MATRIX xform;
#ifdef DISABLE_INTRINSICS
    const float cells[4][4] = {
        { 1.0f,    0.0f,    0.0f,    0.0f },
        { 0.0f,    1.0f,    0.0f,    0.0f },
        { 0.0f,    0.0f,    1.0f,    0.0f },
        { OffsetX, OffsetY, OffsetZ, 1.0f },
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            xform.mat[r][c] = cells[r][c];
        }
    }
#else
    xform.rows[0] = M3D_MIdentityR0.v;
    xform.rows[1] = M3D_MIdentityR1.v;
    xform.rows[2] = M3D_MIdentityR2.v;
    xform.rows[3] = M3D_V4Set(OffsetX, OffsetY, OffsetZ, 1.f);
#endif
    return xform;
}
|
|
|
|
// Builds a rotation matrix about the X axis.
//
//   Angle : rotation angle, forwarded to M3D_ScalarSinCos (presumably radians - confirm).
//
// Resulting rows:
//   (1,  0,    0,   0)
//   (0,  cos,  sin, 0)
//   (0, -sin,  cos, 0)
//   (0,  0,    0,   1)
inline M3D_MATRIX M3D_TransformMatrixRotationX(float Angle) noexcept {
    float sinA;
    float cosA;
    M3D_ScalarSinCos(&sinA, &cosA, Angle);

    M3D_MATRIX ret;
#ifdef DISABLE_INTRINSICS
    const float cells[4][4] = {
        { 1.0f, 0.0f,  0.0f, 0.0f },
        { 0.0f, cosA,  sinA, 0.0f },
        { 0.0f, -sinA, cosA, 0.0f },
        { 0.0f, 0.0f,  0.0f, 1.0f },
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = cells[r][c];
        }
    }
#else
    // _mm_set_ss puts the scalar in lane X and zeroes the other lanes.
    const M3D_VECTOR sinLane = _mm_set_ss(sinA);
    const M3D_VECTOR cosLane = _mm_set_ss(cosA);

    // (0, cos, sin, 0)
    const M3D_VECTOR rowY = _mm_shuffle_ps(cosLane, sinLane, _MM_SHUFFLE(3, 0, 0, 3));
    // (0, sin, cos, 0): lanes Y and Z exchanged.
    const M3D_VECTOR swapped = M3D_PERMUTE_PS(rowY, _MM_SHUFFLE(3, 1, 2, 0));

    ret.rows[0] = M3D_MIdentityR0;
    ret.rows[1] = rowY;
    // (0, -sin, cos, 0)
    ret.rows[2] = _mm_mul_ps(swapped, M3D_MNegateY);
    ret.rows[3] = M3D_MIdentityR3;
#endif
    return ret;
}
|
|
|
|
// Builds a rotation matrix about the Y axis.
//
//   Angle : rotation angle, forwarded to M3D_ScalarSinCos (presumably radians - confirm).
//
// Resulting rows:
//   (cos, 0, -sin, 0)
//   (0,   1,  0,   0)
//   (sin, 0,  cos, 0)
//   (0,   0,  0,   1)
inline M3D_MATRIX M3D_TransformMatrixRotationY(float Angle) noexcept {
    float sinA;
    float cosA;
    M3D_ScalarSinCos(&sinA, &cosA, Angle);

    M3D_MATRIX ret;
#ifdef DISABLE_INTRINSICS
    const float cells[4][4] = {
        { cosA, 0.0f, -sinA, 0.0f },
        { 0.0f, 1.0f, 0.0f,  0.0f },
        { sinA, 0.0f, cosA,  0.0f },
        { 0.0f, 0.0f, 0.0f,  1.0f },
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = cells[r][c];
        }
    }
#else
    // _mm_set_ss puts the scalar in lane X and zeroes the other lanes.
    const M3D_VECTOR sinLane = _mm_set_ss(sinA);
    const M3D_VECTOR cosLane = _mm_set_ss(cosA);

    // (sin, 0, cos, 0)
    const M3D_VECTOR rowZ = _mm_shuffle_ps(sinLane, cosLane, _MM_SHUFFLE(3, 0, 3, 0));
    // (cos, 0, sin, 0): lanes X and Z exchanged.
    const M3D_VECTOR swapped = M3D_PERMUTE_PS(rowZ, _MM_SHUFFLE(3, 0, 1, 2));

    // (cos, 0, -sin, 0)
    ret.rows[0] = _mm_mul_ps(swapped, M3D_MNegateZ);
    ret.rows[1] = M3D_MIdentityR1;
    ret.rows[2] = rowZ;
    ret.rows[3] = M3D_MIdentityR3;
#endif
    return ret;
}
|
|
|
|
// Builds a rotation matrix about the Z axis.
//
//   Angle : rotation angle, forwarded to M3D_ScalarSinCos (presumably radians - confirm).
//
// Resulting rows:
//   ( cos, sin, 0, 0)
//   (-sin, cos, 0, 0)
//   ( 0,   0,   1, 0)
//   ( 0,   0,   0, 1)
inline M3D_MATRIX M3D_TransformMatrixRotationZ(float Angle) noexcept {
    float sinA;
    float cosA;
    M3D_ScalarSinCos(&sinA, &cosA, Angle);

    M3D_MATRIX ret;
#ifdef DISABLE_INTRINSICS
    const float cells[4][4] = {
        { cosA,  sinA, 0.0f, 0.0f },
        { -sinA, cosA, 0.0f, 0.0f },
        { 0.0f,  0.0f, 1.0f, 0.0f },
        { 0.0f,  0.0f, 0.0f, 1.0f },
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = cells[r][c];
        }
    }
#else
    // _mm_set_ss puts the scalar in lane X and zeroes the other lanes.
    const M3D_VECTOR sinLane = _mm_set_ss(sinA);
    const M3D_VECTOR cosLane = _mm_set_ss(cosA);

    // (cos, sin, 0, 0)
    const M3D_VECTOR rowX = _mm_unpacklo_ps(cosLane, sinLane);
    // (sin, cos, 0, 0): lanes X and Y exchanged.
    const M3D_VECTOR swapped = M3D_PERMUTE_PS(rowX, _MM_SHUFFLE(3, 2, 0, 1));

    ret.rows[0] = rowX;
    // (-sin, cos, 0, 0)
    ret.rows[1] = _mm_mul_ps(swapped, M3D_MNegateX);
    ret.rows[2] = M3D_MIdentityR2;
    ret.rows[3] = M3D_MIdentityR3;
#endif
    return ret;
}
|
|
|
|
// Builds a rotation matrix from three angles packed in a vector.
//
//   Angles : lane 0, lane 1 and lane 2 hold the three rotation angles; lane 3 is
//            not used. The names pitch / yaw / roll below follow the original
//            cp / cy / cr locals - confirm the intended axis convention.
//
// With sp/cp, sy/cy, sr/cr the sine/cosine of lanes 0, 1, 2, the result is:
//   (cr*cy + sr*sp*sy,  sr*cp,  sr*sp*cy - cr*sy,  0)
//   (cr*sp*sy - sr*cy,  cr*cp,  sr*sy + cr*sp*cy,  0)
//   (cp*sy,            -sp,     cp*cy,             0)
//   (0,                 0,      0,                 1)
inline M3D_MATRIX M3D_TransformMatrixRotation(M3D_VECTOR Angles) noexcept {
    M3D_MATRIX ret;
#ifdef DISABLE_INTRINSICS
    const float cosPitch = cosf(Angles.v4f[0]);
    const float sinPitch = sinf(Angles.v4f[0]);

    const float cosYaw = cosf(Angles.v4f[1]);
    const float sinYaw = sinf(Angles.v4f[1]);

    const float cosRoll = cosf(Angles.v4f[2]);
    const float sinRoll = sinf(Angles.v4f[2]);

    const float cells[4][4] = {
        { cosRoll * cosYaw + sinRoll * sinPitch * sinYaw,
          sinRoll * cosPitch,
          sinRoll * sinPitch * cosYaw - cosRoll * sinYaw,
          0.0f },
        { cosRoll * sinPitch * sinYaw - sinRoll * cosYaw,
          cosRoll * cosPitch,
          sinRoll * sinYaw + cosRoll * sinPitch * cosYaw,
          0.0f },
        { cosPitch * sinYaw,
          -sinPitch,
          cosPitch * cosYaw,
          0.0f },
        { 0.0f, 0.0f, 0.0f, 1.0f },
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            ret.mat[r][c] = cells[r][c];
        }
    }
#else
    static const M3D_V4F32 Sign = {{{1.0f, -1.0f, -1.0f, 1.0f}}};

    M3D_VECTOR sines, cosines;
    M3D_V4SinCos(&sines, &cosines, Angles);

    // Lane notes use s/c for sin/cos and p/y/r for Angles lanes 0/1/2. They assume
    // M3D_PERMUTE_0* picks from the first operand (sines) and M3D_PERMUTE_1* from
    // the second (cosines), which is what makes this branch agree with the scalar one.
    const M3D_VECTOR lhsA = M3D_V4Permute<M3D_PERMUTE_1X, M3D_PERMUTE_0Z, M3D_PERMUTE_1Z, M3D_PERMUTE_1X>(sines, cosines); // cp, sr, cr, cp
    const M3D_VECTOR rhsA = M3D_V4Permute<M3D_PERMUTE_0Y, M3D_PERMUTE_1X, M3D_PERMUTE_1X, M3D_PERMUTE_1Y>(sines, cosines); // sy, cp, cp, cy
    const M3D_VECTOR lhsB = M3D_V4Permute<M3D_PERMUTE_1Z, M3D_PERMUTE_0Z, M3D_PERMUTE_1Z, M3D_PERMUTE_0Z>(sines, cosines); // cr, sr, cr, sr
    const M3D_VECTOR rhsB = M3D_V4Permute<M3D_PERMUTE_1Y, M3D_PERMUTE_1Y, M3D_PERMUTE_0Y, M3D_PERMUTE_0Y>(sines, cosines); // cy, cy, sy, sy
    const M3D_VECTOR lhsC = M3D_V4Permute<M3D_PERMUTE_0Z, M3D_PERMUTE_1Z, M3D_PERMUTE_0Z, M3D_PERMUTE_1Z>(sines, cosines); // sr, cr, sr, cr
    const M3D_VECTOR rhsC = M3D_V4Permute<M3D_PERMUTE_0Y, M3D_PERMUTE_0Y, M3D_PERMUTE_1Y, M3D_PERMUTE_1Y>(sines, cosines); // sy, sy, cy, cy
    const M3D_VECTOR sinPitchAll = M3D_V4SplatX(sines);                                                                    // sp in every lane
    const M3D_VECTOR negSines = M3D_V4Negate(sines);                                                                        // -sp, -sy, -sr, ...

    // Two-factor terms: cp*sy, sr*cp, cr*cp, cp*cy
    const M3D_VECTOR pairTerms = M3D_V4Multiply(lhsA, rhsA);
    // Signed two-factor terms: cr*cy, -sr*cy, -cr*sy, sr*sy
    const M3D_VECTOR signedTerms = M3D_V4Multiply(M3D_V4Multiply(lhsB, Sign.v), rhsB);
    // Three-factor terms added to the signed ones:
    // sr*sp*sy + cr*cy, cr*sp*sy - sr*cy, sr*sp*cy - cr*sy, cr*sp*cy + sr*sy
    const M3D_VECTOR sumTerms = M3D_V4MultiplyAdd(M3D_V4Multiply(lhsC, sinPitchAll), rhsC, signedTerms);

    // Gather the first three lanes of every row; lane W is overwritten with zero below.
    const M3D_VECTOR row0 = M3D_V4Permute<M3D_PERMUTE_1X, M3D_PERMUTE_0Y, M3D_PERMUTE_1Z, M3D_PERMUTE_0W>(pairTerms, sumTerms);
    const M3D_VECTOR row1 = M3D_V4Permute<M3D_PERMUTE_1Y, M3D_PERMUTE_0Z, M3D_PERMUTE_1W, M3D_PERMUTE_0W>(pairTerms, sumTerms);
    const M3D_VECTOR row2 = M3D_V4Permute<M3D_PERMUTE_0X, M3D_PERMUTE_1X, M3D_PERMUTE_0W, M3D_PERMUTE_0W>(pairTerms, negSines);

    ret.rows[0] = M3D_V4Select(M3D_MZero, row0, M3D_MSelect1110.v);
    ret.rows[1] = M3D_V4Select(M3D_MZero, row1, M3D_MSelect1110.v);
    ret.rows[2] = M3D_V4Select(M3D_MZero, row2, M3D_MSelect1110.v);
    ret.rows[3] = M3D_MIdentityR3;
#endif
    return ret;
}
|
|
|
|
// TODO: the viewport transform matrix is incomplete (Z is currently passed through unchanged).
//v_tri[v_cnt].position.z = ((far+near)/2)+((far-near)/2)*_2dCoord.z;
|
|
// Builds a matrix that scales and offsets X / Y for a viewport rectangle.
//
//   _w, _h             : viewport width and height; half of each is the X / Y scale
//                        (the Y scale is negated).
//   _wOffset, _hOffset : viewport origin; added to the half sizes to form the
//                        X / Y translation in the last row.
//
// Resulting rows:
//   (_w/2,            0,               0, 0)
//   (0,              -_h/2,            0, 0)
//   (0,               0,               1, 0)
//   (_wOffset + _w/2, _hOffset + _h/2, 0, 1)
//
// No depth range is applied: the Z scale is fixed at 1 and the Z offset at 0.
inline M3D_MATRIX M3D_TransformMatrixViewport(float _w, float _h, float _wOffset, float _hOffset) noexcept {
    const float halfW = _w / 2;
    const float halfH = _h / 2;

    M3D_MATRIX xform;
#ifdef DISABLE_INTRINSICS
    const float cells[4][4] = {
        { halfW,            0.0f,             0.0f, 0.0f },
        { 0.0f,             -halfH,           0.0f, 0.0f },
        { 0.0f,             0.0f,             1.0f, 0.0f },  // Z scale fixed at 1
        { _wOffset + halfW, _hOffset + halfH, 0.0f, 1.0f },  // Z offset fixed at 0
    };
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            xform.mat[r][c] = cells[r][c];
        }
    }
#else
    xform.rows[0] = M3D_V4Set(halfW, 0.0f, 0.0f, 0.0f);
    xform.rows[1] = M3D_V4Set(0.0f, -halfH, 0.0f, 0.0f);
    xform.rows[2] = M3D_MIdentityR2.v;  // Z scale fixed at 1, no depth range applied
    xform.rows[3] = M3D_V4Set(_wOffset + halfW, _hOffset + halfH, 0.0f, 1.0f);
#endif
    return xform;
}