2313 lines
90 KiB
C++
2313 lines
90 KiB
C++
#pragma once
|
|
|
|
|
|
// Builds a matrix from sixteen scalars.
// Each group of four arguments fRx (fR0..fR3) is packed with M3D_V4Set and
// stored as rows[R], so the arguments are consumed row by row.
inline M3D_MATRIX::M3D_MATRIX(
    float f00, float f01, float f02, float f03,
    float f10, float f11, float f12, float f13,
    float f20, float f21, float f22, float f23,
    float f30, float f31, float f32, float f33
) noexcept {
    // Pack each row first, then copy the packed rows into the member storage.
    const M3D_VECTOR packedRows[4] = {
        M3D_V4Set(f00, f01, f02, f03),
        M3D_V4Set(f10, f11, f12, f13),
        M3D_V4Set(f20, f21, f22, f23),
        M3D_V4Set(f30, f31, f32, f33)
    };
    for (int r = 0; r < 4; ++r) {
        rows[r] = packedRows[r];
    }
}
|
|
|
|
// Unary minus: returns a new matrix whose rows are M3D_V4Negate of this
// matrix's rows. The receiver is left unchanged.
INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator- () const noexcept {
    M3D_MATRIX negated;
    for (int r = 0; r < 4; ++r) {
        negated.rows[r] = M3D_V4Negate(rows[r]);
    }
    return negated;
}
|
|
|
|
// In-place row-wise addition: rows[r] becomes M3D_V4Add(rows[r], M.rows[r]).
// Returns a reference to this matrix so the operation can be chained.
INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator+= (M3D_MATRIX M) noexcept {
    for (int r = 0; r < 4; ++r) {
        rows[r] = M3D_V4Add(rows[r], M.rows[r]);
    }
    return *this;
}
|
|
// Row-wise addition: returns a new matrix whose row r is
// M3D_V4Add(rows[r], M.rows[r]). Neither operand is modified.
INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator+ (M3D_MATRIX M) const noexcept {
    M3D_MATRIX sum;
    for (int r = 0; r < 4; ++r) {
        sum.rows[r] = M3D_V4Add(rows[r], M.rows[r]);
    }
    return sum;
}
|
|
|
|
// In-place row-wise subtraction: rows[r] becomes
// M3D_V4Subtract(rows[r], M.rows[r]). Returns a reference to this matrix.
INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator-= (M3D_MATRIX M) noexcept {
    for (int r = 0; r < 4; ++r) {
        rows[r] = M3D_V4Subtract(rows[r], M.rows[r]);
    }
    return *this;
}
|
|
// Row-wise subtraction: returns a new matrix whose row r is
// M3D_V4Subtract(rows[r], M.rows[r]). Neither operand is modified.
INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator- (M3D_MATRIX M) const noexcept {
    M3D_MATRIX difference;
    for (int r = 0; r < 4; ++r) {
        difference.rows[r] = M3D_V4Subtract(rows[r], M.rows[r]);
    }
    return difference;
}
|
|
|
|
// In-place matrix product: replaces this matrix with
// M3D_MMultiply(*this, M), i.e. this matrix is the left-hand operand.
// Returns a reference to this matrix.
INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator*=(M3D_MATRIX M) noexcept {
    const M3D_MATRIX product = M3D_MMultiply(*this, M);
    *this = product;
    return *this;
}
|
|
// Matrix product: returns M3D_MMultiply(*this, M), with this matrix as the
// left-hand operand. Neither operand is modified.
INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator*(M3D_MATRIX M) const noexcept {
    M3D_MATRIX product = M3D_MMultiply(*this, M);
    return product;
}
|
|
|
|
// In-place scalar multiplication: every row is replaced by
// M3D_V4Scale(row, S). Returns a reference to this matrix.
INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator*= (float S) noexcept {
    for (int r = 0; r < 4; ++r) {
        rows[r] = M3D_V4Scale(rows[r], S);
    }
    return *this;
}
|
|
// Scalar multiplication (matrix * scalar): returns a new matrix whose row r
// is M3D_V4Scale(rows[r], S). The receiver is left unchanged.
INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator* (float S) const noexcept {
    M3D_MATRIX scaled;
    for (int r = 0; r < 4; ++r) {
        scaled.rows[r] = M3D_V4Scale(rows[r], S);
    }
    return scaled;
}
|
|
// Scalar multiplication (scalar * matrix): free-function counterpart of the
// member operator*(float). Returns a new matrix whose row r is
// M3D_V4Scale(M.rows[r], S).
INLINE_AVX_FIX M3D_MATRIX operator* (float S, M3D_MATRIX M) noexcept {
    M3D_MATRIX scaled;
    for (int r = 0; r < 4; ++r) {
        scaled.rows[r] = M3D_V4Scale(M.rows[r], S);
    }
    return scaled;
}
|
|
|
|
// In-place scalar division: every row is divided component-wise by S.
// S is first broadcast into a vector; no check for S == 0 is performed here.
// Returns a reference to this matrix.
INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator/= (float S) noexcept {
#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR divisor = M3D_V4Replicate(S);
    for (int r = 0; r < 4; ++r) {
        rows[r] = M3D_V4Divide(rows[r], divisor);
    }
#else
    const __m128 divisor = _mm_set_ps1(S);
    for (int r = 0; r < 4; ++r) {
        rows[r] = _mm_div_ps(rows[r], divisor);
    }
#endif
    return *this;
}
|
|
// Scalar division: returns a new matrix whose rows are this matrix's rows
// divided component-wise by S. S is first broadcast into a vector; no check
// for S == 0 is performed here. The receiver is left unchanged.
INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator/ (float S) const noexcept {
    M3D_MATRIX quotient;
#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR divisor = M3D_V4Replicate(S);
    for (int r = 0; r < 4; ++r) {
        quotient.rows[r] = M3D_V4Divide(rows[r], divisor);
    }
#else
    const __m128 divisor = _mm_set_ps1(S);
    for (int r = 0; r < 4; ++r) {
        quotient.rows[r] = _mm_div_ps(rows[r], divisor);
    }
#endif
    return quotient;
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Returns a matrix whose rows are the library constants
// M3D_MIdentityR0 .. M3D_MIdentityR3 (presumably the four identity rows;
// the constants themselves are defined elsewhere).
inline M3D_MATRIX M3D_MIdentity() noexcept {
    const M3D_VECTOR identityRows[4] = {
        M3D_MIdentityR0.v,
        M3D_MIdentityR1.v,
        M3D_MIdentityR2.v,
        M3D_MIdentityR3.v
    };

    M3D_MATRIX identity;
    for (int r = 0; r < 4; ++r) {
        identity.rows[r] = identityRows[r];
    }
    return identity;
}
|
|
|
|
// Returns the 4x4 matrix product M1 * M2:
//   result[r][c] = M1[r][0]*M2[0][c] + M1[r][1]*M2[1][c]
//                + M1[r][2]*M2[2][c] + M1[r][3]*M2[3][c]
// M2 is only read. Three implementations are selected at compile time:
//   - DISABLE_INTRINSICS: plain scalar code;
//   - AVX2_INTRINSICS:    two rows per 256-bit register, using FMA;
//   - otherwise:          one row per 128-bit register (SSE, optionally
//                         with AVX broadcast loads).
// NOTE(review): the summation order differs between the three paths, so
// results may differ in the last bits depending on the build configuration.
INLINE_AVX_FIX M3D_MATRIX M3D_MMultiply(M3D_MATRIX M1, M3D_MATRIX& M2) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX product;
    for (int r = 0; r < 4; ++r) {
        // Cache the four entries of the current left-hand row
        const float x = M1.mat[r][0];
        const float y = M1.mat[r][1];
        const float z = M1.mat[r][2];
        const float w = M1.mat[r][3];
        // Dot the cached row with every column of M2
        for (int c = 0; c < 4; ++c) {
            product.mat[r][c] = (M2.mat[0][c] * x) + (M2.mat[1][c] * y) + (M2.mat[2][c] * z) + (M2.mat[3][c] * w);
        }
    }
    return product;
#elif defined(AVX2_INTRINSICS)
    // Pack two rows per 256-bit register: low lane = first row, high lane = second row
    const __m256 lhs01 = _mm256_insertf128_ps(_mm256_castps128_ps256(M1.rows[0]), M1.rows[1], 1);
    const __m256 lhs23 = _mm256_insertf128_ps(_mm256_castps128_ps256(M1.rows[2]), M1.rows[3], 1);
    const __m256 rhs01 = _mm256_insertf128_ps(_mm256_castps128_ps256(M2.rows[0]), M2.rows[1], 1);
    const __m256 rhs23 = _mm256_insertf128_ps(_mm256_castps128_ps256(M2.rows[2]), M2.rows[3], 1);

    // Duplicate each row of M2 into both 128-bit lanes
    const __m256 rhsRow0 = _mm256_permute2f128_ps(rhs01, rhs01, 0x00);
    const __m256 rhsRow1 = _mm256_permute2f128_ps(rhs01, rhs01, 0x11);
    const __m256 rhsRow2 = _mm256_permute2f128_ps(rhs23, rhs23, 0x00);
    const __m256 rhsRow3 = _mm256_permute2f128_ps(rhs23, rhs23, 0x11);

    // X and Y contributions: x * row0, then fused y * row1 + previous
    __m256 xy01 = _mm256_mul_ps(_mm256_shuffle_ps(lhs01, lhs01, _MM_SHUFFLE(0, 0, 0, 0)), rhsRow0);
    __m256 xy23 = _mm256_mul_ps(_mm256_shuffle_ps(lhs23, lhs23, _MM_SHUFFLE(0, 0, 0, 0)), rhsRow0);
    xy01 = _mm256_fmadd_ps(_mm256_shuffle_ps(lhs01, lhs01, _MM_SHUFFLE(1, 1, 1, 1)), rhsRow1, xy01);
    xy23 = _mm256_fmadd_ps(_mm256_shuffle_ps(lhs23, lhs23, _MM_SHUFFLE(1, 1, 1, 1)), rhsRow1, xy23);

    // Z and W contributions: z * row2, then fused w * row3 + previous
    __m256 zw01 = _mm256_mul_ps(_mm256_shuffle_ps(lhs01, lhs01, _MM_SHUFFLE(2, 2, 2, 2)), rhsRow2);
    __m256 zw23 = _mm256_mul_ps(_mm256_shuffle_ps(lhs23, lhs23, _MM_SHUFFLE(2, 2, 2, 2)), rhsRow2);
    zw01 = _mm256_fmadd_ps(_mm256_shuffle_ps(lhs01, lhs01, _MM_SHUFFLE(3, 3, 3, 3)), rhsRow3, zw01);
    zw23 = _mm256_fmadd_ps(_mm256_shuffle_ps(lhs23, lhs23, _MM_SHUFFLE(3, 3, 3, 3)), rhsRow3, zw23);

    // Combine the two partial sums
    const __m256 out01 = _mm256_add_ps(xy01, zw01);
    const __m256 out23 = _mm256_add_ps(xy23, zw23);

    // Unpack the two 256-bit results back into four 128-bit rows
    M3D_MATRIX product;
    product.rows[0] = _mm256_castps256_ps128(out01);
    product.rows[1] = _mm256_extractf128_ps(out01, 1);
    product.rows[2] = _mm256_castps256_ps128(out23);
    product.rows[3] = _mm256_extractf128_ps(out23, 1);
    return product;
#else
    M3D_MATRIX product;
    for (int r = 0; r < 4; ++r) {
        // Splat the X, Y, Z and W components of row r of M1
#ifdef AVX_INTRINSICS
        const float* lhsRow = reinterpret_cast<const float*>(&M1.rows[r]);
        M3D_VECTOR vX = _mm_broadcast_ss(lhsRow + 0);
        M3D_VECTOR vY = _mm_broadcast_ss(lhsRow + 1);
        M3D_VECTOR vZ = _mm_broadcast_ss(lhsRow + 2);
        M3D_VECTOR vW = _mm_broadcast_ss(lhsRow + 3);
#else
        const M3D_VECTOR lhsRow = M1.rows[r];
        M3D_VECTOR vX = M3D_PERMUTE_PS(lhsRow, _MM_SHUFFLE(0, 0, 0, 0));
        M3D_VECTOR vY = M3D_PERMUTE_PS(lhsRow, _MM_SHUFFLE(1, 1, 1, 1));
        M3D_VECTOR vZ = M3D_PERMUTE_PS(lhsRow, _MM_SHUFFLE(2, 2, 2, 2));
        M3D_VECTOR vW = M3D_PERMUTE_PS(lhsRow, _MM_SHUFFLE(3, 3, 3, 3));
#endif
        // Scale each row of M2 by the matching splatted component
        vX = _mm_mul_ps(vX, M2.rows[0]);
        vY = _mm_mul_ps(vY, M2.rows[1]);
        vZ = _mm_mul_ps(vZ, M2.rows[2]);
        vW = _mm_mul_ps(vW, M2.rows[3]);
        // Pairwise (binary-tree) add to reduce cumulative errors:
        // (X + Z) + (Y + W)
        vX = _mm_add_ps(vX, vZ);
        vY = _mm_add_ps(vY, vW);
        product.rows[r] = _mm_add_ps(vX, vY);
    }
    return product;
#endif
}
|
|
|
|
// Returns the transpose of M (rows become columns).
// Three implementations are selected at compile time: generic merge helpers
// (DISABLE_INTRINSICS), 256-bit unpack/permute (AVX2_INTRINSICS), or two
// rounds of 128-bit shuffles (default).
INLINE_AVX_FIX M3D_MATRIX M3D_MTranspose(M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    // Source layout:
    //   m00 m01 m02 m03
    //   m10 m11 m12 m13
    //   m20 m21 m22 m23
    //   m30 m31 m32 m33

    // First pass: interleave rows 0/2 and rows 1/3
    const M3D_VECTOR xy02 = M3D_V4MergeXY(M.rows[0], M.rows[2]); // m00 m20 m01 m21
    const M3D_VECTOR xy13 = M3D_V4MergeXY(M.rows[1], M.rows[3]); // m10 m30 m11 m31
    const M3D_VECTOR zw02 = M3D_V4MergeZW(M.rows[0], M.rows[2]); // m02 m22 m03 m23
    const M3D_VECTOR zw13 = M3D_V4MergeZW(M.rows[1], M.rows[3]); // m12 m32 m13 m33

    // Second pass: interleave the intermediates to obtain the columns
    M3D_MATRIX transposed;
    transposed.rows[0] = M3D_V4MergeXY(xy02, xy13); // m00 m10 m20 m30
    transposed.rows[1] = M3D_V4MergeZW(xy02, xy13); // m01 m11 m21 m31
    transposed.rows[2] = M3D_V4MergeXY(zw02, zw13); // m02 m12 m22 m32
    transposed.rows[3] = M3D_V4MergeZW(zw02, zw13); // m03 m13 m23 m33
    return transposed;
#elif defined(AVX2_INTRINSICS)
    // Pack two rows per 256-bit register: low lane = first row, high lane = second row
    const __m256 rows01 = _mm256_insertf128_ps(_mm256_castps128_ps256(M.rows[0]), M.rows[1], 1);
    const __m256 rows23 = _mm256_insertf128_ps(_mm256_castps128_ps256(M.rows[2]), M.rows[3], 1);

    // First unpack/permute round
    const __m256 lowPairs = _mm256_unpacklo_ps(rows01, rows23);
    const __m256 highPairs = _mm256_unpackhi_ps(rows01, rows23);
    const __m256 mixedLow = _mm256_permute2f128_ps(lowPairs, highPairs, 0x20);
    const __m256 mixedHigh = _mm256_permute2f128_ps(lowPairs, highPairs, 0x31);

    // Second unpack/permute round
    const __m256 lowQuads = _mm256_unpacklo_ps(mixedLow, mixedHigh);
    const __m256 highQuads = _mm256_unpackhi_ps(mixedLow, mixedHigh);
    const __m256 out01 = _mm256_permute2f128_ps(lowQuads, highQuads, 0x20);
    const __m256 out23 = _mm256_permute2f128_ps(lowQuads, highQuads, 0x31);

    // Unpack the two 256-bit results back into four 128-bit rows
    M3D_MATRIX transposed;
    transposed.rows[0] = _mm256_castps256_ps128(out01);
    transposed.rows[1] = _mm256_extractf128_ps(out01, 1);
    transposed.rows[2] = _mm256_castps256_ps128(out23);
    transposed.rows[3] = _mm256_extractf128_ps(out23, 1);
    return transposed;
#else
    // Gather the low (x,y) and high (z,w) halves of each row pair.
    // Notation: x/y/z/w name rows 0..3, the suffix names the component.
    const M3D_VECTOR xyLow = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(1, 0, 1, 0));  // x.x,x.y,y.x,y.y
    const M3D_VECTOR xyHigh = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(3, 2, 3, 2)); // x.z,x.w,y.z,y.w
    const M3D_VECTOR zwLow = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(1, 0, 1, 0));  // z.x,z.y,w.x,w.y
    const M3D_VECTOR zwHigh = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(3, 2, 3, 2)); // z.z,z.w,w.z,w.w

    // Pick even/odd elements to assemble each output row
    M3D_MATRIX transposed;
    transposed.rows[0] = _mm_shuffle_ps(xyLow, zwLow, _MM_SHUFFLE(2, 0, 2, 0));   // x.x,y.x,z.x,w.x
    transposed.rows[1] = _mm_shuffle_ps(xyLow, zwLow, _MM_SHUFFLE(3, 1, 3, 1));   // x.y,y.y,z.y,w.y
    transposed.rows[2] = _mm_shuffle_ps(xyHigh, zwHigh, _MM_SHUFFLE(2, 0, 2, 0)); // x.z,y.z,z.z,w.z
    transposed.rows[3] = _mm_shuffle_ps(xyHigh, zwHigh, _MM_SHUFFLE(3, 1, 3, 1)); // x.w,y.w,z.w,w.w
    return transposed;
#endif
}
|
|
|
|
// Returns the inverse of M.
// Outline (both code paths): transpose M, build 2x2 sub-determinant products
// (d*), combine them with permuted rows into the un-normalised result rows,
// take the dot product of the first result row with the first transposed row
// as the determinant, and scale every row by its reciprocal.
// The determinant is not tested against zero and is not returned to the
// caller (the pDeterminant output that once existed is disabled), so a
// singular M yields non-finite components rather than an error.
INLINE_AVX_FIX M3D_MATRIX M3D_MInverse(M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX MT = M3D_MTranspose(M);

    // a[]/b[] are scratch operand pairs that are multiplied lane-wise
    M3D_VECTOR a[4], b[4];

    // --- 2x2 sub-determinants: d[k] = a*b (first products) - a*b (second products)
    a[0] = M3D_V4Swizzle<M3D_SWIZZLE_X, M3D_SWIZZLE_X, M3D_SWIZZLE_Y, M3D_SWIZZLE_Y>(MT.rows[2]);
    b[0] = M3D_V4Swizzle<M3D_SWIZZLE_Z, M3D_SWIZZLE_W, M3D_SWIZZLE_Z, M3D_SWIZZLE_W>(MT.rows[3]);
    a[1] = M3D_V4Swizzle<M3D_SWIZZLE_X, M3D_SWIZZLE_X, M3D_SWIZZLE_Y, M3D_SWIZZLE_Y>(MT.rows[0]);
    b[1] = M3D_V4Swizzle<M3D_SWIZZLE_Z, M3D_SWIZZLE_W, M3D_SWIZZLE_Z, M3D_SWIZZLE_W>(MT.rows[1]);
    a[2] = M3D_V4Permute<M3D_PERMUTE_0X, M3D_PERMUTE_0Z, M3D_PERMUTE_1X, M3D_PERMUTE_1Z>(MT.rows[2], MT.rows[0]);
    b[2] = M3D_V4Permute<M3D_PERMUTE_0Y, M3D_PERMUTE_0W, M3D_PERMUTE_1Y, M3D_PERMUTE_1W>(MT.rows[3], MT.rows[1]);

    M3D_VECTOR d[3];
    for (int k = 0; k < 3; ++k) {
        d[k] = M3D_V4Multiply(a[k], b[k]);
    }

    a[0] = M3D_V4Swizzle<M3D_SWIZZLE_Z, M3D_SWIZZLE_W, M3D_SWIZZLE_Z, M3D_SWIZZLE_W>(MT.rows[2]);
    b[0] = M3D_V4Swizzle<M3D_SWIZZLE_X, M3D_SWIZZLE_X, M3D_SWIZZLE_Y, M3D_SWIZZLE_Y>(MT.rows[3]);
    a[1] = M3D_V4Swizzle<M3D_SWIZZLE_Z, M3D_SWIZZLE_W, M3D_SWIZZLE_Z, M3D_SWIZZLE_W>(MT.rows[0]);
    b[1] = M3D_V4Swizzle<M3D_SWIZZLE_X, M3D_SWIZZLE_X, M3D_SWIZZLE_Y, M3D_SWIZZLE_Y>(MT.rows[1]);
    a[2] = M3D_V4Permute<M3D_PERMUTE_0Y, M3D_PERMUTE_0W, M3D_PERMUTE_1Y, M3D_PERMUTE_1W>(MT.rows[2], MT.rows[0]);
    b[2] = M3D_V4Permute<M3D_PERMUTE_0X, M3D_PERMUTE_0Z, M3D_PERMUTE_1X, M3D_PERMUTE_1Z>(MT.rows[3], MT.rows[1]);

    for (int k = 0; k < 3; ++k) {
        d[k] = M3D_V4NegativeMultiplySubtract(a[k], b[k], d[k]);
    }

    // --- First term of each result row
    a[0] = M3D_V4Swizzle<M3D_SWIZZLE_Y, M3D_SWIZZLE_Z, M3D_SWIZZLE_X, M3D_SWIZZLE_Y>(MT.rows[1]);
    b[0] = M3D_V4Permute<M3D_PERMUTE_1Y, M3D_PERMUTE_0Y, M3D_PERMUTE_0W, M3D_PERMUTE_0X>(d[0], d[2]);
    a[1] = M3D_V4Swizzle<M3D_SWIZZLE_Z, M3D_SWIZZLE_X, M3D_SWIZZLE_Y, M3D_SWIZZLE_X>(MT.rows[0]);
    b[1] = M3D_V4Permute<M3D_PERMUTE_0W, M3D_PERMUTE_1Y, M3D_PERMUTE_0Y, M3D_PERMUTE_0Z>(d[0], d[2]);
    a[2] = M3D_V4Swizzle<M3D_SWIZZLE_Y, M3D_SWIZZLE_Z, M3D_SWIZZLE_X, M3D_SWIZZLE_Y>(MT.rows[3]);
    b[2] = M3D_V4Permute<M3D_PERMUTE_1W, M3D_PERMUTE_0Y, M3D_PERMUTE_0W, M3D_PERMUTE_0X>(d[1], d[2]);
    a[3] = M3D_V4Swizzle<M3D_SWIZZLE_Z, M3D_SWIZZLE_X, M3D_SWIZZLE_Y, M3D_SWIZZLE_X>(MT.rows[2]);
    b[3] = M3D_V4Permute<M3D_PERMUTE_0W, M3D_PERMUTE_1W, M3D_PERMUTE_0Y, M3D_PERMUTE_0Z>(d[1], d[2]);

    // even[k] / odd[k] are the two candidate vectors for result row k;
    // the final select picks lanes from each.
    M3D_VECTOR even[4], odd[4];
    for (int k = 0; k < 4; ++k) {
        even[k] = M3D_V4Multiply(a[k], b[k]);
    }

    // --- Second term (subtracted)
    a[0] = M3D_V4Swizzle<M3D_SWIZZLE_Z, M3D_SWIZZLE_W, M3D_SWIZZLE_Y, M3D_SWIZZLE_Z>(MT.rows[1]);
    b[0] = M3D_V4Permute<M3D_PERMUTE_0W, M3D_PERMUTE_0X, M3D_PERMUTE_0Y, M3D_PERMUTE_1X>(d[0], d[2]);
    a[1] = M3D_V4Swizzle<M3D_SWIZZLE_W, M3D_SWIZZLE_Z, M3D_SWIZZLE_W, M3D_SWIZZLE_Y>(MT.rows[0]);
    b[1] = M3D_V4Permute<M3D_PERMUTE_0Z, M3D_PERMUTE_0Y, M3D_PERMUTE_1X, M3D_PERMUTE_0X>(d[0], d[2]);
    a[2] = M3D_V4Swizzle<M3D_SWIZZLE_Z, M3D_SWIZZLE_W, M3D_SWIZZLE_Y, M3D_SWIZZLE_Z>(MT.rows[3]);
    b[2] = M3D_V4Permute<M3D_PERMUTE_0W, M3D_PERMUTE_0X, M3D_PERMUTE_0Y, M3D_PERMUTE_1Z>(d[1], d[2]);
    a[3] = M3D_V4Swizzle<M3D_SWIZZLE_W, M3D_SWIZZLE_Z, M3D_SWIZZLE_W, M3D_SWIZZLE_Y>(MT.rows[2]);
    b[3] = M3D_V4Permute<M3D_PERMUTE_0Z, M3D_PERMUTE_0Y, M3D_PERMUTE_1Z, M3D_PERMUTE_0X>(d[1], d[2]);

    for (int k = 0; k < 4; ++k) {
        even[k] = M3D_V4NegativeMultiplySubtract(a[k], b[k], even[k]);
    }

    // --- Third term (added in one candidate, subtracted in the other)
    a[0] = M3D_V4Swizzle<M3D_SWIZZLE_W, M3D_SWIZZLE_X, M3D_SWIZZLE_W, M3D_SWIZZLE_X>(MT.rows[1]);
    b[0] = M3D_V4Permute<M3D_PERMUTE_0Z, M3D_PERMUTE_1Y, M3D_PERMUTE_1X, M3D_PERMUTE_0Z>(d[0], d[2]);
    a[1] = M3D_V4Swizzle<M3D_SWIZZLE_Y, M3D_SWIZZLE_W, M3D_SWIZZLE_X, M3D_SWIZZLE_Z>(MT.rows[0]);
    b[1] = M3D_V4Permute<M3D_PERMUTE_1Y, M3D_PERMUTE_0X, M3D_PERMUTE_0W, M3D_PERMUTE_1X>(d[0], d[2]);
    a[2] = M3D_V4Swizzle<M3D_SWIZZLE_W, M3D_SWIZZLE_X, M3D_SWIZZLE_W, M3D_SWIZZLE_X>(MT.rows[3]);
    b[2] = M3D_V4Permute<M3D_PERMUTE_0Z, M3D_PERMUTE_1W, M3D_PERMUTE_1Z, M3D_PERMUTE_0Z>(d[1], d[2]);
    a[3] = M3D_V4Swizzle<M3D_SWIZZLE_Y, M3D_SWIZZLE_W, M3D_SWIZZLE_X, M3D_SWIZZLE_Z>(MT.rows[2]);
    b[3] = M3D_V4Permute<M3D_PERMUTE_1W, M3D_PERMUTE_0X, M3D_PERMUTE_0W, M3D_PERMUTE_1Z>(d[1], d[2]);

    // The odd candidate must be derived from the old even value before the
    // even value is overwritten; rows 0/2 and rows 1/3 use opposite signs.
    odd[0] = M3D_V4NegativeMultiplySubtract(a[0], b[0], even[0]);
    even[0] = M3D_V4MultiplyAdd(a[0], b[0], even[0]);
    odd[1] = M3D_V4MultiplyAdd(a[1], b[1], even[1]);
    even[1] = M3D_V4NegativeMultiplySubtract(a[1], b[1], even[1]);
    odd[2] = M3D_V4NegativeMultiplySubtract(a[2], b[2], even[2]);
    even[2] = M3D_V4MultiplyAdd(a[2], b[2], even[2]);
    odd[3] = M3D_V4MultiplyAdd(a[3], b[3], even[3]);
    even[3] = M3D_V4NegativeMultiplySubtract(a[3], b[3], even[3]);

    // Blend the two candidates lane-wise into the un-normalised rows
    M3D_MATRIX unscaled;
    for (int k = 0; k < 4; ++k) {
        unscaled.rows[k] = M3D_V4Select(even[k], odd[k], M3D_MSelect0101.v);
    }

    // Determinant = first un-normalised row . first transposed row
    const M3D_VECTOR determinant = M3D_V4Dot(unscaled.rows[0], MT.rows[0]);

    //if (pDeterminant != nullptr)
    //    *pDeterminant = determinant;

    const M3D_VECTOR invDeterminant = M3D_V4Reciprocal(determinant);

    M3D_MATRIX inverse;
    for (int k = 0; k < 4; ++k) {
        inverse.rows[k] = M3D_V4Multiply(unscaled.rows[k], invDeterminant);
    }
    return inverse;
#else
    // Transpose matrix (same shuffle sequence as the SSE path of M3D_MTranspose)
    const M3D_VECTOR lo01 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(1, 0, 1, 0));
    const M3D_VECTOR hi01 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(3, 2, 3, 2));
    const M3D_VECTOR lo23 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(1, 0, 1, 0));
    const M3D_VECTOR hi23 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(3, 2, 3, 2));

    const M3D_VECTOR mt0 = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(2, 0, 2, 0));
    const M3D_VECTOR mt1 = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 1, 3, 1));
    const M3D_VECTOR mt2 = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(2, 0, 2, 0));
    const M3D_VECTOR mt3 = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 1, 3, 1));

    // --- 2x2 sub-determinants: dK = aK*bK (first products) - aK*bK (second products)
    M3D_VECTOR a0 = M3D_PERMUTE_PS(mt2, _MM_SHUFFLE(1, 1, 0, 0));
    M3D_VECTOR b0 = M3D_PERMUTE_PS(mt3, _MM_SHUFFLE(3, 2, 3, 2));
    M3D_VECTOR a1 = M3D_PERMUTE_PS(mt0, _MM_SHUFFLE(1, 1, 0, 0));
    M3D_VECTOR b1 = M3D_PERMUTE_PS(mt1, _MM_SHUFFLE(3, 2, 3, 2));
    M3D_VECTOR a2 = _mm_shuffle_ps(mt2, mt0, _MM_SHUFFLE(2, 0, 2, 0));
    M3D_VECTOR b2 = _mm_shuffle_ps(mt3, mt1, _MM_SHUFFLE(3, 1, 3, 1));

    M3D_VECTOR d0 = _mm_mul_ps(a0, b0);
    M3D_VECTOR d1 = _mm_mul_ps(a1, b1);
    M3D_VECTOR d2 = _mm_mul_ps(a2, b2);

    a0 = M3D_PERMUTE_PS(mt2, _MM_SHUFFLE(3, 2, 3, 2));
    b0 = M3D_PERMUTE_PS(mt3, _MM_SHUFFLE(1, 1, 0, 0));
    a1 = M3D_PERMUTE_PS(mt0, _MM_SHUFFLE(3, 2, 3, 2));
    b1 = M3D_PERMUTE_PS(mt1, _MM_SHUFFLE(1, 1, 0, 0));
    a2 = _mm_shuffle_ps(mt2, mt0, _MM_SHUFFLE(3, 1, 3, 1));
    b2 = _mm_shuffle_ps(mt3, mt1, _MM_SHUFFLE(2, 0, 2, 0));

    d0 = M3D_FNMADD_PS(a0, b0, d0);
    d1 = M3D_FNMADD_PS(a1, b1, d1);
    d2 = M3D_FNMADD_PS(a2, b2, d2);

    // --- First term of each result row
    // b1 = d0.y,d0.w,d2.y,d2.y
    b1 = _mm_shuffle_ps(d0, d2, _MM_SHUFFLE(1, 1, 3, 1));
    a0 = M3D_PERMUTE_PS(mt1, _MM_SHUFFLE(1, 0, 2, 1));
    b0 = _mm_shuffle_ps(b1, d0, _MM_SHUFFLE(0, 3, 0, 2));
    a1 = M3D_PERMUTE_PS(mt0, _MM_SHUFFLE(0, 1, 0, 2));
    b1 = _mm_shuffle_ps(b1, d0, _MM_SHUFFLE(2, 1, 2, 1));
    // b3 = d1.y,d1.w,d2.w,d2.w
    M3D_VECTOR b3 = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(3, 3, 3, 1));
    a2 = M3D_PERMUTE_PS(mt3, _MM_SHUFFLE(1, 0, 2, 1));
    b2 = _mm_shuffle_ps(b3, d1, _MM_SHUFFLE(0, 3, 0, 2));
    M3D_VECTOR a3 = M3D_PERMUTE_PS(mt2, _MM_SHUFFLE(0, 1, 0, 2));
    b3 = _mm_shuffle_ps(b3, d1, _MM_SHUFFLE(2, 1, 2, 1));

    M3D_VECTOR c0 = _mm_mul_ps(a0, b0);
    M3D_VECTOR c2 = _mm_mul_ps(a1, b1);
    M3D_VECTOR c4 = _mm_mul_ps(a2, b2);
    M3D_VECTOR c6 = _mm_mul_ps(a3, b3);

    // --- Second term (subtracted)
    // b1 = d0.x,d0.y,d2.x,d2.x
    b1 = _mm_shuffle_ps(d0, d2, _MM_SHUFFLE(0, 0, 1, 0));
    a0 = M3D_PERMUTE_PS(mt1, _MM_SHUFFLE(2, 1, 3, 2));
    b0 = _mm_shuffle_ps(d0, b1, _MM_SHUFFLE(2, 1, 0, 3));
    a1 = M3D_PERMUTE_PS(mt0, _MM_SHUFFLE(1, 3, 2, 3));
    b1 = _mm_shuffle_ps(d0, b1, _MM_SHUFFLE(0, 2, 1, 2));
    // b3 = d1.x,d1.y,d2.z,d2.z
    b3 = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 2, 1, 0));
    a2 = M3D_PERMUTE_PS(mt3, _MM_SHUFFLE(2, 1, 3, 2));
    b2 = _mm_shuffle_ps(d1, b3, _MM_SHUFFLE(2, 1, 0, 3));
    a3 = M3D_PERMUTE_PS(mt2, _MM_SHUFFLE(1, 3, 2, 3));
    b3 = _mm_shuffle_ps(d1, b3, _MM_SHUFFLE(0, 2, 1, 2));

    c0 = M3D_FNMADD_PS(a0, b0, c0);
    c2 = M3D_FNMADD_PS(a1, b1, c2);
    c4 = M3D_FNMADD_PS(a2, b2, c4);
    c6 = M3D_FNMADD_PS(a3, b3, c6);

    // --- Third term (added in one candidate, subtracted in the other)
    a0 = M3D_PERMUTE_PS(mt1, _MM_SHUFFLE(0, 3, 0, 3));
    // b0 = d0.z,d0.z,d2.x,d2.y
    b0 = _mm_shuffle_ps(d0, d2, _MM_SHUFFLE(1, 0, 2, 2));
    b0 = M3D_PERMUTE_PS(b0, _MM_SHUFFLE(0, 2, 3, 0));
    a1 = M3D_PERMUTE_PS(mt0, _MM_SHUFFLE(2, 0, 3, 1));
    // b1 = d0.x,d0.w,d2.x,d2.y
    b1 = _mm_shuffle_ps(d0, d2, _MM_SHUFFLE(1, 0, 3, 0));
    b1 = M3D_PERMUTE_PS(b1, _MM_SHUFFLE(2, 1, 0, 3));
    a2 = M3D_PERMUTE_PS(mt3, _MM_SHUFFLE(0, 3, 0, 3));
    // b2 = d1.z,d1.z,d2.z,d2.w
    b2 = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(3, 2, 2, 2));
    b2 = M3D_PERMUTE_PS(b2, _MM_SHUFFLE(0, 2, 3, 0));
    a3 = M3D_PERMUTE_PS(mt2, _MM_SHUFFLE(2, 0, 3, 1));
    // b3 = d1.x,d1.w,d2.z,d2.w
    b3 = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(3, 2, 3, 0));
    b3 = M3D_PERMUTE_PS(b3, _MM_SHUFFLE(2, 1, 0, 3));

    a0 = _mm_mul_ps(a0, b0);
    a1 = _mm_mul_ps(a1, b1);
    a2 = _mm_mul_ps(a2, b2);
    a3 = _mm_mul_ps(a3, b3);

    // Each odd-numbered candidate is computed from the old even value before
    // that value is overwritten; rows 0/2 and rows 1/3 use opposite signs.
    M3D_VECTOR c1 = _mm_sub_ps(c0, a0);
    c0 = _mm_add_ps(c0, a0);
    M3D_VECTOR c3 = _mm_add_ps(c2, a1);
    c2 = _mm_sub_ps(c2, a1);
    M3D_VECTOR c5 = _mm_sub_ps(c4, a2);
    c4 = _mm_add_ps(c4, a2);
    M3D_VECTOR c7 = _mm_add_ps(c6, a3);
    c6 = _mm_sub_ps(c6, a3);

    // Interleave the two candidates of each row, then restore lane order
    c0 = _mm_shuffle_ps(c0, c1, _MM_SHUFFLE(3, 1, 2, 0));
    c2 = _mm_shuffle_ps(c2, c3, _MM_SHUFFLE(3, 1, 2, 0));
    c4 = _mm_shuffle_ps(c4, c5, _MM_SHUFFLE(3, 1, 2, 0));
    c6 = _mm_shuffle_ps(c6, c7, _MM_SHUFFLE(3, 1, 2, 0));
    c0 = M3D_PERMUTE_PS(c0, _MM_SHUFFLE(3, 1, 2, 0));
    c2 = M3D_PERMUTE_PS(c2, _MM_SHUFFLE(3, 1, 2, 0));
    c4 = M3D_PERMUTE_PS(c4, _MM_SHUFFLE(3, 1, 2, 0));
    c6 = M3D_PERMUTE_PS(c6, _MM_SHUFFLE(3, 1, 2, 0));

    // Get the determinant
    M3D_VECTOR invDeterminant = M3D_V4Dot(c0, mt0);

    //if (pDeterminant != nullptr)
    //    *pDeterminant = invDeterminant;

    // Turn the determinant into its reciprocal (no zero check)
    invDeterminant = _mm_div_ps(M3D_MOne, invDeterminant);

    M3D_MATRIX inverse;
    inverse.rows[0] = _mm_mul_ps(c0, invDeterminant);
    inverse.rows[1] = _mm_mul_ps(c2, invDeterminant);
    inverse.rows[2] = _mm_mul_ps(c4, invDeterminant);
    inverse.rows[3] = _mm_mul_ps(c6, invDeterminant);
    return inverse;
#endif
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Converts the upper-left 3x3 block of M into a quaternion (x, y, z, w).
// Only rows/columns 0..2 of M are read. The code picks whichever of the four
// quaternion components is largest (decided from the signs of r22,
// r11 - r00 and r11 + r00) and derives the other three from it.
// NOTE(review): presumably M is expected to be a pure rotation matrix; this
// is not validated here.
INLINE_AVX_FIX M3D_VECTOR M3D_QRotationFromMatrix(M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    const float m00 = M.mat[0][0];
    const float m01 = M.mat[0][1];
    const float m02 = M.mat[0][2];
    const float m10 = M.mat[1][0];
    const float m11 = M.mat[1][1];
    const float m12 = M.mat[1][2];
    const float m20 = M.mat[2][0];
    const float m21 = M.mat[2][1];
    const float m22 = M.mat[2][2];

    M3D_V4F32 q;
    if (m22 <= 0.f) { // x^2 + y^2 >= z^2 + w^2
        const float diag11m00 = m11 - m00;
        const float oneMinusM22 = 1.f - m22;
        if (diag11m00 <= 0.f) { // x^2 >= y^2
            const float fourXSqr = oneMinusM22 - diag11m00;
            const float scale = 0.5f / sqrtf(fourXSqr);
            q.f[0] = fourXSqr * scale;
            q.f[1] = (m01 + m10) * scale;
            q.f[2] = (m02 + m20) * scale;
            q.f[3] = (m12 - m21) * scale;
        } else { // y^2 >= x^2
            const float fourYSqr = oneMinusM22 + diag11m00;
            const float scale = 0.5f / sqrtf(fourYSqr);
            q.f[0] = (m01 + m10) * scale;
            q.f[1] = fourYSqr * scale;
            q.f[2] = (m12 + m21) * scale;
            q.f[3] = (m20 - m02) * scale;
        }
    } else { // z^2 + w^2 >= x^2 + y^2
        const float diag11p00 = m11 + m00;
        const float onePlusM22 = 1.f + m22;
        if (diag11p00 <= 0.f) { // z^2 >= w^2
            const float fourZSqr = onePlusM22 - diag11p00;
            const float scale = 0.5f / sqrtf(fourZSqr);
            q.f[0] = (m02 + m20) * scale;
            q.f[1] = (m12 + m21) * scale;
            q.f[2] = fourZSqr * scale;
            q.f[3] = (m01 - m10) * scale;
        } else { // w^2 >= z^2
            const float fourWSqr = onePlusM22 + diag11p00;
            const float scale = 0.5f / sqrtf(fourWSqr);
            q.f[0] = (m12 - m21) * scale;
            q.f[1] = (m20 - m02) * scale;
            q.f[2] = (m01 - m10) * scale;
            q.f[3] = fourWSqr * scale;
        }
    }
    return q.v;
#else
    // Per-lane sign patterns (P = +1, M = -1)
    static const M3D_V4F32 signPMMP = {{{+1.0f, -1.0f, -1.0f, +1.0f}}};
    static const M3D_V4F32 signMPMP = {{{-1.0f, +1.0f, -1.0f, +1.0f}}};
    static const M3D_V4F32 signMMPP = {{{-1.0f, -1.0f, +1.0f, +1.0f}}};

    const M3D_VECTOR row0 = M.rows[0]; // (r00, r01, r02, 0)
    const M3D_VECTOR row1 = M.rows[1]; // (r10, r11, r12, 0)
    const M3D_VECTOR row2 = M.rows[2]; // (r20, r21, r22, 0)

    // Diagonal entries splatted across all four lanes
    const M3D_VECTOR diag00 = M3D_PERMUTE_PS(row0, _MM_SHUFFLE(0, 0, 0, 0)); // (r00, r00, r00, r00)
    const M3D_VECTOR diag11 = M3D_PERMUTE_PS(row1, _MM_SHUFFLE(1, 1, 1, 1)); // (r11, r11, r11, r11)
    const M3D_VECTOR diag22 = M3D_PERMUTE_PS(row2, _MM_SHUFFLE(2, 2, 2, 2)); // (r22, r22, r22, r22)

    // x^2 >= y^2 equivalent to r11 - r00 <= 0
    const M3D_VECTOR diag11m00 = _mm_sub_ps(diag11, diag00);
    const M3D_VECTOR maskXgeY = _mm_cmple_ps(diag11m00, M3D_MZero);

    // z^2 >= w^2 equivalent to r11 + r00 <= 0
    const M3D_VECTOR diag11p00 = _mm_add_ps(diag11, diag00);
    const M3D_VECTOR maskZgeW = _mm_cmple_ps(diag11p00, M3D_MZero);

    // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
    const M3D_VECTOR maskXYgeZW = _mm_cmple_ps(diag22, M3D_MZero);

    // (4*x^2, 4*y^2, 4*z^2, 4*w^2)
    const M3D_VECTOR onePlusS00 = M3D_FMADD_PS(signPMMP, diag00, M3D_MOne);
    const M3D_VECTOR signed11 = _mm_mul_ps(signMPMP, diag11);
    const M3D_VECTOR plusS22 = M3D_FMADD_PS(signMMPP, diag22, onePlusS00);
    const M3D_VECTOR squares = _mm_add_ps(signed11, plusS22);

    // (r01, r02, r12, r11)
    const M3D_VECTOR upperA = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(1, 2, 2, 1));
    // (r10, r10, r20, r21)
    M3D_VECTOR lowerA = _mm_shuffle_ps(row1, row2, _MM_SHUFFLE(1, 0, 0, 0));
    // (r10, r20, r21, r10)
    lowerA = M3D_PERMUTE_PS(lowerA, _MM_SHUFFLE(1, 3, 2, 0));
    // (4*x*y, 4*x*z, 4*y*z, unused)
    const M3D_VECTOR xyxzyz = _mm_add_ps(upperA, lowerA);

    // (r21, r20, r10, r10)
    const M3D_VECTOR lowerB = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(0, 0, 0, 1));
    // (r12, r12, r02, r01)
    M3D_VECTOR upperB = _mm_shuffle_ps(row1, row0, _MM_SHUFFLE(1, 2, 2, 2));
    // (r12, r02, r01, r12)
    upperB = M3D_PERMUTE_PS(upperB, _MM_SHUFFLE(1, 3, 2, 0));
    // (4*x*w, 4*y*w, 4*z*w, unused)
    M3D_VECTOR xwywzw = _mm_sub_ps(lowerB, upperB);
    xwywzw = _mm_mul_ps(signMPMP, xwywzw);

    // (4*x^2, 4*y^2, 4*x*y, unused)
    const M3D_VECTOR partXY = _mm_shuffle_ps(squares, xyxzyz, _MM_SHUFFLE(0, 0, 1, 0));
    // (4*z^2, 4*w^2, 4*z*w, unused)
    const M3D_VECTOR partZW = _mm_shuffle_ps(squares, xwywzw, _MM_SHUFFLE(0, 2, 3, 2));
    // (4*x*z, 4*y*z, 4*x*w, 4*y*w)
    const M3D_VECTOR partMixed = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1, 0, 2, 1));

    // Rows of the tensor-product matrix 4 * q * q^T
    // (4*x*x, 4*x*y, 4*x*z, 4*x*w)
    const M3D_VECTOR tensorX = _mm_shuffle_ps(partXY, partMixed, _MM_SHUFFLE(2, 0, 2, 0));
    // (4*y*x, 4*y*y, 4*y*z, 4*y*w)
    const M3D_VECTOR tensorY = _mm_shuffle_ps(partXY, partMixed, _MM_SHUFFLE(3, 1, 1, 2));
    // (4*z*x, 4*z*y, 4*z*z, 4*z*w)
    const M3D_VECTOR tensorZ = _mm_shuffle_ps(partMixed, partZW, _MM_SHUFFLE(2, 0, 1, 0));
    // (4*w*x, 4*w*y, 4*w*z, 4*w*w)
    const M3D_VECTOR tensorW = _mm_shuffle_ps(partMixed, partZW, _MM_SHUFFLE(1, 2, 3, 2));

    // Select the row of the tensor-product matrix that has the largest
    // magnitude, using branch-free mask blends.
    M3D_VECTOR pickXY = _mm_and_ps(maskXgeY, tensorX);
    pickXY = _mm_or_ps(pickXY, _mm_andnot_ps(maskXgeY, tensorY));
    M3D_VECTOR pickZW = _mm_and_ps(maskZgeW, tensorZ);
    pickZW = _mm_or_ps(pickZW, _mm_andnot_ps(maskZgeW, tensorW));
    pickXY = _mm_and_ps(maskXYgeZW, pickXY);
    pickZW = _mm_andnot_ps(maskXYgeZW, pickZW);
    const M3D_VECTOR chosenRow = _mm_or_ps(pickXY, pickZW);

    // Normalize the row. No division by zero is possible because the
    // quaternion is unit-length (and the row is a nonzero multiple of
    // the quaternion).
    const M3D_VECTOR rowLength = M3D_V4Length(chosenRow);
    return _mm_div_ps(chosenRow, rowLength);
#endif
}
|
|
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Rotates V by RotationQuaternion.
// The result is M3D_QMultiply(M3D_QMultiply(conjugate(Q), A), Q), where A is
// built from V through the 1110 select mask (presumably V's xyz with the mask's
// own w lane -- confirm against M3D_V4Select / M3D_MSelect1110).
inline M3D_VECTOR M3D_V3Rotate(M3D_VECTOR V, M3D_VECTOR RotationQuaternion) noexcept {
    const M3D_VECTOR operand = M3D_V4Select(M3D_MSelect1110.v, V, M3D_MSelect1110.v);
    const M3D_VECTOR conjugate = M3D_QConjugate(RotationQuaternion);
    const M3D_VECTOR halfRotated = M3D_QMultiply(conjugate, operand);
    return M3D_QMultiply(halfRotated, RotationQuaternion);
}
|
|
|
|
// Transforms V by M, returning x*rows[0] + y*rows[1] + z*rows[2] + rows[3].
// Only the x, y and z lanes of V are read; rows[3] is always added unscaled.
INLINE_AVX_FIX M3D_VECTOR M3D_V3Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR zzzz = M3D_V4SplatZ(V);
    const M3D_VECTOR yyyy = M3D_V4SplatY(V);
    const M3D_VECTOR xxxx = M3D_V4SplatX(V);

    // Accumulate from z down to x so every step is one multiply-add.
    M3D_VECTOR acc = M3D_V4MultiplyAdd(zzzz, M.rows[2], M.rows[3]);
    acc = M3D_V4MultiplyAdd(yyyy, M.rows[1], acc);
    acc = M3D_V4MultiplyAdd(xxxx, M.rows[0], acc);
    return acc;
#else
    // Accumulate from z down to x so every step is one multiply-add.
    M3D_VECTOR lane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // z
    M3D_VECTOR acc = M3D_FMADD_PS(lane, M.rows[2], M.rows[3]);

    lane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // y
    acc = M3D_FMADD_PS(lane, M.rows[1], acc);

    lane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // x
    acc = M3D_FMADD_PS(lane, M.rows[0], acc);
    return acc;
#endif
}
|
|
|
|
// Stream variant: transforms VectorCount M3D_F3 inputs by M and writes one
// M3D_F4 per input.
//   pOutputStream / OutputStride : first output and byte step between outputs
//   pInputStream  / InputStride  : first input and byte step between inputs
// Every result is x*rows[0] + y*rows[1] + z*rows[2] + rows[3].
INLINE_AVX_FIX void M3D_V3Transform(
    M3D_F4* pOutputStream,
    size_t OutputStride,
    const M3D_F3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    M3D_MATRIX M
) noexcept {
    // Byte cursors, so the strides can be applied with plain pointer arithmetic.
    const uint8_t* pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    uint8_t* pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

#ifdef DISABLE_INTRINSICS
    for (size_t n = 0; n != VectorCount; ++n) {
        const M3D_VECTOR src = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
        const M3D_VECTOR zzzz = M3D_V4SplatZ(src);
        const M3D_VECTOR yyyy = M3D_V4SplatY(src);
        const M3D_VECTOR xxxx = M3D_V4SplatX(src);

        M3D_VECTOR acc = M3D_V4MultiplyAdd(zzzz, row2, row3);
        acc = M3D_V4MultiplyAdd(yyyy, row1, acc);
        acc = M3D_V4MultiplyAdd(xxxx, row0, acc);

        M3D_V4StoreF4(reinterpret_cast<M3D_F4*>(pOutputVector), acc);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
#else
    // Shared per-vector kernel: ((z*row2 + row3) + y*row1) + x*row0.
    // The w lane of the argument is never read.
    const auto transformPoint = [&](M3D_VECTOR point) -> M3D_VECTOR {
        const M3D_VECTOR zzzz = M3D_PERMUTE_PS(point, _MM_SHUFFLE(2, 2, 2, 2));
        const M3D_VECTOR yyyy = M3D_PERMUTE_PS(point, _MM_SHUFFLE(1, 1, 1, 1));
        const M3D_VECTOR xxxx = M3D_PERMUTE_PS(point, _MM_SHUFFLE(0, 0, 0, 0));

        const M3D_VECTOR zTerm = M3D_FMADD_PS(zzzz, row2, row3);
        const M3D_VECTOR yTerm = _mm_mul_ps(yyyy, row1);
        const M3D_VECTOR xTerm = _mm_mul_ps(xxxx, row0);
        return _mm_add_ps(_mm_add_ps(zTerm, yTerm), xTerm);
    };

    // M3D_STREAM_PS is used only when the first output address and the output
    // stride are both multiples of 16; otherwise results go through _mm_storeu_ps.
    const bool alignedOutput =
        !(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF);

    size_t i = 0;
    const size_t four = VectorCount >> 2;
    if (four > 0 && InputStride == sizeof(M3D_F3)) {
        // Tightly packed input: consume four vectors (three 16-byte loads) per pass.
        // V1/L2/L3 keep these exact names because M3D_UNPACK3INTO4 operates on them
        // and supplies V2, V3 and V4.
        if (alignedOutput) {
            // Packed input, aligned output
            for (size_t j = 0; j < four; ++j) {
                __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                pInputVector += sizeof(M3D_F3) * 4;

                // Unpack the 4 vectors (.w components are junk)
                M3D_UNPACK3INTO4(V1, L2, L3);

                M3D_VECTOR result = transformPoint(V1);
                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), result);
                pOutputVector += OutputStride;

                result = transformPoint(V2);
                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), result);
                pOutputVector += OutputStride;

                result = transformPoint(V3);
                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), result);
                pOutputVector += OutputStride;

                result = transformPoint(V4);
                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), result);
                pOutputVector += OutputStride;
            }
        } else {
            // Packed input, unaligned output
            for (size_t j = 0; j < four; ++j) {
                __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                pInputVector += sizeof(M3D_F3) * 4;

                // Unpack the 4 vectors (.w components are junk)
                M3D_UNPACK3INTO4(V1, L2, L3);

                M3D_VECTOR result = transformPoint(V1);
                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), result);
                pOutputVector += OutputStride;

                result = transformPoint(V2);
                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), result);
                pOutputVector += OutputStride;

                result = transformPoint(V3);
                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), result);
                pOutputVector += OutputStride;

                result = transformPoint(V4);
                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), result);
                pOutputVector += OutputStride;
            }
        }

        // Every pass above handled exactly four vectors.
        i = four << 2;
    }

    // Remaining vectors (all of them when the input is not tightly packed).
    if (alignedOutput) {
        // Aligned output
        for (; i < VectorCount; ++i) {
            const M3D_VECTOR src = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
            pInputVector += InputStride;

            const M3D_VECTOR result = transformPoint(src);

            M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), result);
            pOutputVector += OutputStride;
        }
    } else {
        // Unaligned output
        for (; i < VectorCount; ++i) {
            const M3D_VECTOR src = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
            pInputVector += InputStride;

            const M3D_VECTOR result = transformPoint(src);

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), result);
            pOutputVector += OutputStride;
        }
    }

    M3D_SFENCE();
#endif
}
|
|
|
|
// Transforms V by the first three rows of M only:
// x*rows[0] + y*rows[1] + z*rows[2]. rows[3] is never read.
INLINE_AVX_FIX M3D_VECTOR M3D_V3TransformNormal(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR zzzz = M3D_V4SplatZ(V);
    const M3D_VECTOR yyyy = M3D_V4SplatY(V);
    const M3D_VECTOR xxxx = M3D_V4SplatX(V);

    // Start from the z term, then fold in y and x with multiply-adds.
    M3D_VECTOR acc = M3D_V4Multiply(zzzz, M.rows[2]);
    acc = M3D_V4MultiplyAdd(yyyy, M.rows[1], acc);
    acc = M3D_V4MultiplyAdd(xxxx, M.rows[0], acc);
    return acc;
#else
    // Start from the z term, then fold in y and x with multiply-adds.
    M3D_VECTOR lane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // z
    M3D_VECTOR acc = _mm_mul_ps(lane, M.rows[2]);

    lane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // y
    acc = M3D_FMADD_PS(lane, M.rows[1], acc);

    lane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // x
    acc = M3D_FMADD_PS(lane, M.rows[0], acc);
    return acc;
#endif
}
|
|
|
|
// Transforms V by M (x*rows[0] + y*rows[1] + z*rows[2] + rows[3]) and then
// divides the whole result by its own w lane. No check is made for w == 0.
INLINE_AVX_FIX M3D_VECTOR M3D_V3TransformPersDiv(M3D_VECTOR V, M3D_MATRIX M) noexcept {
    const M3D_VECTOR zzzz = M3D_V4SplatZ(V);
    const M3D_VECTOR yyyy = M3D_V4SplatY(V);
    const M3D_VECTOR xxxx = M3D_V4SplatX(V);

    M3D_VECTOR transformed = M3D_V4MultiplyAdd(zzzz, M.rows[2], M.rows[3]);
    transformed = M3D_V4MultiplyAdd(yyyy, M.rows[1], transformed);
    transformed = M3D_V4MultiplyAdd(xxxx, M.rows[0], transformed);

    const M3D_VECTOR wwww = M3D_V4SplatW(transformed);
    return M3D_V4Divide(transformed, wwww);
}
|
|
|
|
// Stream variant: transforms VectorCount M3D_F3 inputs by M, divides each
// result by its own w lane and writes one M3D_F3 per input.
//   pOutputStream / OutputStride : first output and byte step between outputs
//   pInputStream  / InputStride  : first input and byte step between inputs
// No check is made for a zero w.
INLINE_AVX_FIX void M3D_V3TransformPersDiv(
    M3D_F3* pOutputStream,
    size_t OutputStride,
    const M3D_F3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    M3D_MATRIX M
) noexcept {
    // Byte cursors, so the strides can be applied with plain pointer arithmetic.
    const uint8_t* pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    uint8_t* pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

#ifdef DISABLE_INTRINSICS
    for (size_t n = 0; n != VectorCount; ++n) {
        const M3D_VECTOR src = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
        const M3D_VECTOR zzzz = M3D_V4SplatZ(src);
        const M3D_VECTOR yyyy = M3D_V4SplatY(src);
        const M3D_VECTOR xxxx = M3D_V4SplatX(src);

        M3D_VECTOR acc = M3D_V4MultiplyAdd(zzzz, row2, row3);
        acc = M3D_V4MultiplyAdd(yyyy, row1, acc);
        acc = M3D_V4MultiplyAdd(xxxx, row0, acc);

        const M3D_VECTOR wwww = M3D_V4SplatW(acc);
        acc = M3D_V4Divide(acc, wwww);

        M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), acc);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
#else
    // Shared per-vector kernel: ((z*row2 + row3) + y*row1) + x*row0.
    // The w lane of the argument is never read.
    const auto transformPoint = [&](M3D_VECTOR point) -> M3D_VECTOR {
        const M3D_VECTOR zzzz = M3D_PERMUTE_PS(point, _MM_SHUFFLE(2, 2, 2, 2));
        const M3D_VECTOR yyyy = M3D_PERMUTE_PS(point, _MM_SHUFFLE(1, 1, 1, 1));
        const M3D_VECTOR xxxx = M3D_PERMUTE_PS(point, _MM_SHUFFLE(0, 0, 0, 0));

        const M3D_VECTOR zTerm = M3D_FMADD_PS(zzzz, row2, row3);
        const M3D_VECTOR yTerm = _mm_mul_ps(yyyy, row1);
        const M3D_VECTOR xTerm = _mm_mul_ps(xxxx, row0);
        return _mm_add_ps(_mm_add_ps(zTerm, yTerm), xTerm);
    };

    // Divides all four lanes by the w lane.
    const auto divideByW = [](M3D_VECTOR value) -> M3D_VECTOR {
        const M3D_VECTOR wwww = M3D_PERMUTE_PS(value, _MM_SHUFFLE(3, 3, 3, 3));
        return _mm_div_ps(value, wwww);
    };

    size_t i = 0;
    const size_t four = VectorCount >> 2;
    if (four > 0 && InputStride == sizeof(M3D_F3)) {
        // Tightly packed input: consume four vectors (three 16-byte loads) per pass.
        // V1/L2/L3 keep these exact names because M3D_UNPACK3INTO4 operates on them
        // and supplies V2, V3 and V4; M3D_PACK4INTO3 works on the same identifiers.
        if (OutputStride != sizeof(M3D_F3)) {
            // Packed input, unpacked output
            for (size_t j = 0; j < four; ++j) {
                __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                pInputVector += sizeof(M3D_F3) * 4;

                // Unpack the 4 vectors (.w components are junk)
                M3D_UNPACK3INTO4(V1, L2, L3);

                M3D_VECTOR result = divideByW(transformPoint(V1));
                M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), result);
                pOutputVector += OutputStride;

                result = divideByW(transformPoint(V2));
                M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), result);
                pOutputVector += OutputStride;

                result = divideByW(transformPoint(V3));
                M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), result);
                pOutputVector += OutputStride;

                result = divideByW(transformPoint(V4));
                M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), result);
                pOutputVector += OutputStride;
            }
        } else if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF)) {
            // Packed input, aligned & packed output
            for (size_t j = 0; j < four; ++j) {
                __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                pInputVector += sizeof(M3D_F3) * 4;

                // Unpack the 4 vectors (.w components are junk)
                M3D_UNPACK3INTO4(V1, L2, L3);

                // Each Vn is overwritten with its divided result; vTemp ends up
                // holding the undivided fourth result when the pack macro runs.
                M3D_VECTOR vTemp = transformPoint(V1);
                V1 = divideByW(vTemp);
                vTemp = transformPoint(V2);
                V2 = divideByW(vTemp);
                vTemp = transformPoint(V3);
                V3 = divideByW(vTemp);
                vTemp = transformPoint(V4);
                V4 = divideByW(vTemp);

                // Pack and store the vectors
                M3D_PACK4INTO3(vTemp);
                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), V1);
                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 32), V3);
                pOutputVector += sizeof(M3D_F3) * 4;
            }
        } else {
            // Packed input, unaligned & packed output
            for (size_t j = 0; j < four; ++j) {
                __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                pInputVector += sizeof(M3D_F3) * 4;

                // Unpack the 4 vectors (.w components are junk)
                M3D_UNPACK3INTO4(V1, L2, L3);

                // Each Vn is overwritten with its divided result; vTemp ends up
                // holding the undivided fourth result when the pack macro runs.
                M3D_VECTOR vTemp = transformPoint(V1);
                V1 = divideByW(vTemp);
                vTemp = transformPoint(V2);
                V2 = divideByW(vTemp);
                vTemp = transformPoint(V3);
                V3 = divideByW(vTemp);
                vTemp = transformPoint(V4);
                V4 = divideByW(vTemp);

                // Pack and store the vectors
                M3D_PACK4INTO3(vTemp);
                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), V1);
                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 32), V3);
                pOutputVector += sizeof(M3D_F3) * 4;
            }
        }

        // Every pass above handled exactly four vectors.
        i = four << 2;
    }

    // Remaining vectors (all of them when the input is not tightly packed).
    for (; i < VectorCount; ++i) {
        const M3D_VECTOR src = M3D_V4LoadF3(reinterpret_cast<const M3D_F3*>(pInputVector));
        pInputVector += InputStride;

        const M3D_VECTOR result = divideByW(transformPoint(src));

        M3D_V4StoreF3(reinterpret_cast<M3D_F3*>(pOutputVector), result);
        pOutputVector += OutputStride;
    }

    M3D_SFENCE();
#endif
}
|
|
|
|
// Transforms the full 4-component vector V by M:
// x*rows[0] + y*rows[1] + z*rows[2] + w*rows[3].
INLINE_AVX_FIX M3D_VECTOR M3D_V4Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR wwww = M3D_V4SplatW(V);
    const M3D_VECTOR zzzz = M3D_V4SplatZ(V);
    const M3D_VECTOR yyyy = M3D_V4SplatY(V);
    const M3D_VECTOR xxxx = M3D_V4SplatX(V);

    // Start from the w term, then fold in z, y and x with multiply-adds.
    M3D_VECTOR acc = M3D_V4Multiply(wwww, M.rows[3]);
    acc = M3D_V4MultiplyAdd(zzzz, M.rows[2], acc);
    acc = M3D_V4MultiplyAdd(yyyy, M.rows[1], acc);
    acc = M3D_V4MultiplyAdd(xxxx, M.rows[0], acc);
    return acc;
#else
    // Start from the w term, then fold in z, y and x with multiply-adds.
    M3D_VECTOR lane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); // w
    M3D_VECTOR acc = _mm_mul_ps(lane, M.rows[3]);

    lane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // z
    acc = M3D_FMADD_PS(lane, M.rows[2], acc);

    lane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // y
    acc = M3D_FMADD_PS(lane, M.rows[1], acc);

    lane = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // x
    acc = M3D_FMADD_PS(lane, M.rows[0], acc);
    return acc;
#endif
}
|
|
|
|
// Stream variant: transforms VectorCount M3D_F4 inputs by M and writes one
// M3D_F4 per input.
//   pOutputStream / OutputStride : first output and byte step between outputs
//   pInputStream  / InputStride  : first input and byte step between inputs
// Every result is x*rows[0] + y*rows[1] + z*rows[2] + w*rows[3].
INLINE_AVX_FIX void M3D_V4Transform(M3D_F4* pOutputStream, size_t OutputStride, const M3D_F4* pInputStream, size_t InputStride, size_t VectorCount, M3D_MATRIX M) noexcept {
    // Byte cursors, so the strides can be applied with plain pointer arithmetic.
    const uint8_t* pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    uint8_t* pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

#ifdef DISABLE_INTRINSICS
    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

    for (size_t n = 0; n != VectorCount; ++n) {
        const M3D_VECTOR src = M3D_V4LoadF4(reinterpret_cast<const M3D_F4*>(pInputVector));
        const M3D_VECTOR wwww = M3D_V4SplatW(src);
        const M3D_VECTOR zzzz = M3D_V4SplatZ(src);
        const M3D_VECTOR yyyy = M3D_V4SplatY(src);
        const M3D_VECTOR xxxx = M3D_V4SplatX(src);

        M3D_VECTOR acc = M3D_V4Multiply(wwww, row3);
        acc = M3D_V4MultiplyAdd(zzzz, row2, acc);
        acc = M3D_V4MultiplyAdd(yyyy, row1, acc);
        acc = M3D_V4MultiplyAdd(xxxx, row0, acc);

        M3D_V4StoreF4(reinterpret_cast<M3D_F4*>(pOutputVector), acc);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
#elif defined(AVX2_INTRINSICS)
    size_t i = 0;
    const size_t two = VectorCount >> 1;
    if (two > 0 && InputStride == sizeof(M3D_F4)) {
        // Tightly packed input: each 256-bit load carries two vectors, so every
        // matrix row is duplicated into both 128-bit halves.
        const __m256 row0 = _mm256_broadcast_ps(&M.rows[0]);
        const __m256 row1 = _mm256_broadcast_ps(&M.rows[1]);
        const __m256 row2 = _mm256_broadcast_ps(&M.rows[2]);
        const __m256 row3 = _mm256_broadcast_ps(&M.rows[3]);

        // Shared kernel for a pair of vectors: (z*row2 + x*row0) + (w*row3 + y*row1).
        const auto transformPair = [&](__m256 pair) -> __m256 {
            const __m256 xxxx = _mm256_shuffle_ps(pair, pair, _MM_SHUFFLE(0, 0, 0, 0));
            const __m256 yyyy = _mm256_shuffle_ps(pair, pair, _MM_SHUFFLE(1, 1, 1, 1));
            const __m256 zzzz = _mm256_shuffle_ps(pair, pair, _MM_SHUFFLE(2, 2, 2, 2));
            const __m256 wwww = _mm256_shuffle_ps(pair, pair, _MM_SHUFFLE(3, 3, 3, 3));

            const __m256 xTerm = _mm256_mul_ps(xxxx, row0);
            const __m256 yTerm = _mm256_mul_ps(yyyy, row1);
            const __m256 zxSum = _mm256_fmadd_ps(zzzz, row2, xTerm);
            const __m256 wySum = _mm256_fmadd_ps(wwww, row3, yTerm);
            return _mm256_add_ps(zxSum, wySum);
        };

        if (OutputStride != sizeof(M3D_F4)) {
            // Packed input, unpacked output: store the two halves separately.
            for (size_t j = 0; j < two; ++j) {
                const __m256 loaded = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(M3D_F4) * 2;

                const __m256 result = transformPair(loaded);

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_castps256_ps128(result));
                pOutputVector += OutputStride;

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_extractf128_ps(result, 1));
                pOutputVector += OutputStride;
            }
        } else if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F)) {
            // Packed input, aligned & packed output
            for (size_t j = 0; j < two; ++j) {
                const __m256 loaded = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(M3D_F4) * 2;

                const __m256 result = transformPair(loaded);

                M3D_STREAM_256b_PS(reinterpret_cast<float*>(pOutputVector), result);
                pOutputVector += sizeof(M3D_F4) * 2;
            }
        } else {
            // Packed input, packed output
            for (size_t j = 0; j < two; ++j) {
                const __m256 loaded = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(M3D_F4) * 2;

                const __m256 result = transformPair(loaded);

                _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), result);
                pOutputVector += sizeof(M3D_F4) * 2;
            }
        }

        // Every pass above handled exactly two vectors.
        i = two << 1;
    }

    // Remaining vectors (all of them when the input is not tightly packed),
    // one at a time with 128-bit operations.
    if (i < VectorCount) {
        const M3D_VECTOR row0 = M.rows[0];
        const M3D_VECTOR row1 = M.rows[1];
        const M3D_VECTOR row2 = M.rows[2];
        const M3D_VECTOR row3 = M.rows[3];

        for (; i < VectorCount; ++i) {
            const __m128 src = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
            pInputVector += InputStride;

            const M3D_VECTOR xxxx = M3D_PERMUTE_PS(src, _MM_SHUFFLE(0, 0, 0, 0));
            const M3D_VECTOR yyyy = M3D_PERMUTE_PS(src, _MM_SHUFFLE(1, 1, 1, 1));
            const M3D_VECTOR zzzz = M3D_PERMUTE_PS(src, _MM_SHUFFLE(2, 2, 2, 2));
            const M3D_VECTOR wwww = M3D_PERMUTE_PS(src, _MM_SHUFFLE(3, 3, 3, 3));

            const M3D_VECTOR xTerm = _mm_mul_ps(xxxx, row0);
            const M3D_VECTOR yTerm = _mm_mul_ps(yyyy, row1);
            const M3D_VECTOR zxSum = M3D_FMADD_PS(zzzz, row2, xTerm);
            const M3D_VECTOR wySum = M3D_FMADD_PS(wwww, row3, yTerm);
            const M3D_VECTOR result = _mm_add_ps(zxSum, wySum);

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), result);
            pOutputVector += OutputStride;
        }
    }

    M3D_SFENCE();
#else
    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

    // Shared per-vector kernel: (z*row2 + x*row0) + (w*row3 + y*row1).
    const auto transformVec = [&](M3D_VECTOR vec) -> M3D_VECTOR {
        const M3D_VECTOR xxxx = M3D_PERMUTE_PS(vec, _MM_SHUFFLE(0, 0, 0, 0));
        const M3D_VECTOR yyyy = M3D_PERMUTE_PS(vec, _MM_SHUFFLE(1, 1, 1, 1));
        const M3D_VECTOR zzzz = M3D_PERMUTE_PS(vec, _MM_SHUFFLE(2, 2, 2, 2));
        const M3D_VECTOR wwww = M3D_PERMUTE_PS(vec, _MM_SHUFFLE(3, 3, 3, 3));

        const M3D_VECTOR xTerm = _mm_mul_ps(xxxx, row0);
        const M3D_VECTOR yTerm = _mm_mul_ps(yyyy, row1);
        const M3D_VECTOR zxSum = M3D_FMADD_PS(zzzz, row2, xTerm);
        const M3D_VECTOR wySum = M3D_FMADD_PS(wwww, row3, yTerm);
        return _mm_add_ps(zxSum, wySum);
    };

    // A stream counts as aligned when both its base address and its stride are
    // multiples of 16; this selects _mm_load_ps / M3D_STREAM_PS over the
    // unaligned load and store.
    const bool alignedOutput =
        !(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF);
    const bool alignedInput =
        !(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF);

    if (alignedOutput && alignedInput) {
        // Aligned input, aligned output
        for (size_t n = 0; n < VectorCount; ++n) {
            const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
            pInputVector += InputStride;

            const M3D_VECTOR result = transformVec(src);

            M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), result);
            pOutputVector += OutputStride;
        }
    } else if (alignedOutput) {
        // Unaligned input, aligned output
        for (size_t n = 0; n < VectorCount; ++n) {
            const __m128 src = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
            pInputVector += InputStride;

            const M3D_VECTOR result = transformVec(src);

            M3D_STREAM_PS(reinterpret_cast<float*>(pOutputVector), result);
            pOutputVector += OutputStride;
        }
    } else if (alignedInput) {
        // Aligned input, unaligned output
        for (size_t n = 0; n < VectorCount; ++n) {
            const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
            pInputVector += InputStride;

            const M3D_VECTOR result = transformVec(src);

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), result);
            pOutputVector += OutputStride;
        }
    } else {
        // Unaligned input, unaligned output
        for (size_t n = 0; n < VectorCount; ++n) {
            const __m128 src = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
            pInputVector += InputStride;

            const M3D_VECTOR result = transformVec(src);

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), result);
            pOutputVector += OutputStride;
        }
    }

    M3D_SFENCE();
#endif
}
|
|
|
|
// Maps V into a viewport rectangle with a single multiply-add, V * scale + offset:
//   scale  = ( vpW/2,       -vpH/2,       vpMaxZ - vpMinZ, 0 )
//   offset = ( vpX + vpW/2,  vpY + vpH/2, vpMinZ,          0 )
// The y scale is negative, so increasing input y gives decreasing output y.
inline M3D_VECTOR M3D_V3TransformNDCToViewport(M3D_VECTOR V, float vpX, float vpY, float vpW, float vpH, float vpMinZ, float vpMaxZ) noexcept {
    const float halfW = vpW * 0.5f;
    const float halfH = vpH * 0.5f;

    const M3D_VECTOR scale = M3D_V4Set(halfW, -halfH, vpMaxZ - vpMinZ, 0.0f);
    const M3D_VECTOR offset = M3D_V4Set(vpX + halfW, vpY + halfH, vpMinZ, 0.0f);

    return M3D_V4MultiplyAdd(V, scale, offset);
}
|
|
|
|
/* -------------------------------------------------------------------------------------------------------------------------- */
|
|
|
|
// Look-at wrapper: derives the view direction as focusPos - viewPos and
// forwards to M3D_TransformMatrixCamLookToLH.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookAtLH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
    const M3D_VECTOR towardFocus = M3D_V4Subtract(focusPos, viewPos);
    return M3D_TransformMatrixCamLookToLH(viewPos, towardFocus, upDirection);
}
|
|
|
|
// Look-at wrapper for the RH variant: uses the reversed direction
// viewPos - focusPos and reuses M3D_TransformMatrixCamLookToLH.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookAtRH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
    const M3D_VECTOR awayFromFocus = M3D_V4Subtract(viewPos, focusPos);
    return M3D_TransformMatrixCamLookToLH(viewPos, awayFromFocus, upDirection);
}
|
|
|
|
// Builds a camera (view) matrix from a position, a view direction and an up hint.
// Three basis vectors are derived:
//   axis2 = normalize(viewDirection)
//   axis0 = normalize(cross(upDirection, axis2))
//   axis1 = cross(axis2, axis0)
// Each is combined with its dot product against -viewPos through the 1110
// select mask (presumably xyz from the axis and w from the dot product --
// confirm against M3D_V4Select), and the assembled matrix is transposed.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookToLH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
    // Keep the camera axes orthogonal to each other and of unit length.
    const M3D_VECTOR axis2 = M3D_V3Normalize(viewDirection);
    const M3D_VECTOR axis0 = M3D_V3Normalize(M3D_V3Cross(upDirection, axis2));

    // axis2 and axis0 are already orthonormal, so their cross product is not re-normalized.
    const M3D_VECTOR axis1 = M3D_V3Cross(axis2, axis0);

    const M3D_VECTOR negViewPos = M3D_V4Negate(viewPos);

    const M3D_VECTOR dot0 = M3D_V3Dot(axis0, negViewPos);
    const M3D_VECTOR dot1 = M3D_V3Dot(axis1, negViewPos);
    const M3D_VECTOR dot2 = M3D_V3Dot(axis2, negViewPos);

    M3D_MATRIX basis;
    basis.rows[0] = M3D_V4Select(dot0, axis0, M3D_MSelect1110.v);
    basis.rows[1] = M3D_V4Select(dot1, axis1, M3D_MSelect1110.v);
    basis.rows[2] = M3D_V4Select(dot2, axis2, M3D_MSelect1110.v);
    basis.rows[3] = M3D_MIdentityR3.v;

    return M3D_MTranspose(basis);
}
|
|
|
|
// Right-handed "look to" view matrix.
// Negates the viewing direction and reuses the left-handed builder.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookToRH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
    return M3D_TransformMatrixCamLookToLH(viewPos, M3D_V4Negate(viewDirection), upDirection);
}
|
|
|
|
// Left-handed perspective projection matrix built from a field of view.
//
//   fov       - field-of-view angle; half of it is passed to M3D_ScalarSinCos
//   ratio     - divisor turning the Y scale into the X scale (aspect ratio)
//   near, far - clip plane distances
//
// With yScale = cos(fov/2) / sin(fov/2), xScale = yScale / ratio and
// depthScale = far / (far - near) the result is
//   | xScale  0       0                  0 |
//   | 0       yScale  0                  0 |
//   | 0       0       depthScale         1 |
//   | 0       0      -depthScale * near  0 |
// Inputs are not validated: far == near, ratio == 0 or sin(fov/2) == 0 divide by zero.
// NOTE(review): `near`/`far` are macros in some Windows headers - confirm this is not an issue for the build.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixFrustrumFovLH(float fov, float ratio, float near, float far) noexcept {
    float halfSin;
    float halfCos;
    M3D_ScalarSinCos(&halfSin, &halfCos, 0.5f * fov);

    const float depthScale = far / (far - near);
    const float yScale = halfCos / halfSin;
    const float xScale = yScale / ratio;

#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        xScale, 0.0f,   0.0f,               0.0f,
        0.0f,   yScale, 0.0f,               0.0f,
        0.0f,   0.0f,   depthScale,         1.0f,
        0.0f,   0.0f,   -depthScale * near, 0.0f
    );
#else
    // Lanes: x = xScale, y = yScale, z = depthScale, w = -depthScale * near
    M3D_VECTOR packed = {
        xScale,
        yScale,
        depthScale,
        -depthScale * near
    };

    M3D_MATRIX proj;
    // Row 0 keeps only lane x: xScale, 0, 0, 0
    proj.rows[0] = _mm_move_ss(_mm_setzero_ps(), packed);
    // Row 1 keeps only lane y: 0, yScale, 0, 0
    proj.rows[1] = _mm_and_ps(packed, M3D_MMaskY);

    // depthLanes = depthScale, -depthScale * near, followed by lanes z and w of M3D_MIdentityR3
    const M3D_VECTOR depthLanes = _mm_shuffle_ps(packed, M3D_MIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));

    M3D_VECTOR row = _mm_shuffle_ps(_mm_setzero_ps(), depthLanes, _MM_SHUFFLE(3, 0, 0, 0));
    proj.rows[2] = row; // 0, 0, depthScale, 1.0f

    // The two leading zeros are taken from the row built just above.
    row = _mm_shuffle_ps(row, depthLanes, _MM_SHUFFLE(2, 1, 0, 0));
    proj.rows[3] = row; // 0, 0, -depthScale * near, 0.0f
    return proj;
#endif
}
|
|
|
|
// Right-handed perspective projection matrix built from a field of view.
//
//   fov       - field-of-view angle; half of it is passed to M3D_ScalarSinCos
//   ratio     - divisor turning the Y scale into the X scale (aspect ratio)
//   near, far - clip plane distances
//
// With yScale = cos(fov/2) / sin(fov/2), xScale = yScale / ratio and
// depthScale = far / (near - far) the result is
//   | xScale  0       0                  0 |
//   | 0       yScale  0                  0 |
//   | 0       0       depthScale        -1 |
//   | 0       0       depthScale * near  0 |
// Inputs are not validated: far == near, ratio == 0 or sin(fov/2) == 0 divide by zero.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixFrustrumFovRH(float fov, float ratio, float near, float far) noexcept {
    float halfSin;
    float halfCos;
    M3D_ScalarSinCos(&halfSin, &halfCos, 0.5f * fov);

    const float depthScale = far / (near - far);
    const float yScale = halfCos / halfSin;
    const float xScale = yScale / ratio;

#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        xScale, 0.0f,   0.0f,              0.0f,
        0.0f,   yScale, 0.0f,              0.0f,
        0.0f,   0.0f,   depthScale,        -1.0f,
        0.0f,   0.0f,   depthScale * near, 0.0f
    );
#else
    // Lanes: x = xScale, y = yScale, z = depthScale, w = depthScale * near
    M3D_VECTOR packed = {
        xScale,
        yScale,
        depthScale,
        depthScale * near
    };

    M3D_MATRIX proj;
    // Row 0 keeps only lane x: xScale, 0, 0, 0
    proj.rows[0] = _mm_move_ss(_mm_setzero_ps(), packed);
    // Row 1 keeps only lane y: 0, yScale, 0, 0
    proj.rows[1] = _mm_and_ps(packed, M3D_MMaskY);

    // depthLanes = depthScale, depthScale * near, followed by lanes z and w of M3D_MIdentityR3_n
    const M3D_VECTOR depthLanes = _mm_shuffle_ps(packed, M3D_MIdentityR3_n, _MM_SHUFFLE(3, 2, 3, 2));

    M3D_VECTOR row = _mm_shuffle_ps(_mm_setzero_ps(), depthLanes, _MM_SHUFFLE(3, 0, 0, 0));
    proj.rows[2] = row; // 0, 0, depthScale, -1.0f

    // The two leading zeros are taken from the row built just above.
    row = _mm_shuffle_ps(row, depthLanes, _MM_SHUFFLE(2, 1, 0, 0));
    proj.rows[3] = row; // 0, 0, depthScale * near, 0.0f
    return proj;
#endif
}
|
|
|
|
// Left-handed orthographic projection matrix.
//
//   w, h      - width and height of the view volume
//   near, far - clip plane distances
//
// With depthScale = 1 / (far - near) the result is
//   | 2/w  0    0                  0 |
//   | 0    2/h  0                  0 |
//   | 0    0    depthScale         0 |
//   | 0    0   -depthScale * near  1 |
// Inputs are not validated: w == 0, h == 0 or far == near divide by zero.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixOrthographicLH(float w, float h, float near, float far) noexcept {
    const float depthScale = 1.0f / (far - near);

#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        2.0f / w, 0.0f,     0.0f,               0.0f,
        0.0f,     2.0f / h, 0.0f,               0.0f,
        0.0f,     0.0f,     depthScale,         0.0f,
        0.0f,     0.0f,     -depthScale * near, 1.0f
    );
#else
    // Lanes: x = 2/w, y = 2/h, z = depthScale, w = -depthScale * near
    M3D_VECTOR packed = {
        2.0f / w,
        2.0f / h,
        depthScale,
        -depthScale * near
    };

    M3D_MATRIX proj;
    // Row 0 keeps only lane x: 2.0f / w, 0, 0, 0
    proj.rows[0] = _mm_move_ss(_mm_setzero_ps(), packed);
    // Row 1 keeps only lane y: 0, 2.0f / h, 0, 0
    proj.rows[1] = _mm_and_ps(packed, M3D_MMaskY);

    // depthLanes = depthScale, -depthScale * near, followed by lanes z and w of M3D_MIdentityR3
    const M3D_VECTOR depthLanes = _mm_shuffle_ps(packed, M3D_MIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));

    M3D_VECTOR row = _mm_shuffle_ps(_mm_setzero_ps(), depthLanes, _MM_SHUFFLE(2, 0, 0, 0));
    proj.rows[2] = row; // 0, 0, depthScale, 0.f

    // The two leading zeros are taken from the row built just above.
    row = _mm_shuffle_ps(row, depthLanes, _MM_SHUFFLE(3, 1, 0, 0));
    proj.rows[3] = row; // 0, 0, -depthScale * near, 1.0f
    return proj;
#endif
}
|
|
|
|
// Right-handed orthographic projection matrix.
//
//   w, h      - width and height of the view volume
//   near, far - clip plane distances
//
// With depthScale = 1 / (near - far) the result is
//   | 2/w  0    0                  0 |
//   | 0    2/h  0                  0 |
//   | 0    0    depthScale         0 |
//   | 0    0    depthScale * near  1 |
// Inputs are not validated: w == 0, h == 0 or far == near divide by zero.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixOrthographicRH(float w, float h, float near, float far) noexcept {
    const float depthScale = 1.0f / (near - far);

#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        2.0f / w, 0.0f,     0.0f,              0.0f,
        0.0f,     2.0f / h, 0.0f,              0.0f,
        0.0f,     0.0f,     depthScale,        0.0f,
        0.0f,     0.0f,     depthScale * near, 1.0f
    );
#else
    // Lanes: x = 2/w, y = 2/h, z = depthScale, w = depthScale * near
    M3D_VECTOR packed = {
        2.0f / w,
        2.0f / h,
        depthScale,
        depthScale * near
    };

    M3D_MATRIX proj;
    // Row 0 keeps only lane x: 2.0f / w, 0, 0, 0
    proj.rows[0] = _mm_move_ss(_mm_setzero_ps(), packed);
    // Row 1 keeps only lane y: 0, 2.0f / h, 0, 0
    proj.rows[1] = _mm_and_ps(packed, M3D_MMaskY);

    // depthLanes = depthScale, depthScale * near, followed by lanes z and w of M3D_MIdentityR3
    const M3D_VECTOR depthLanes = _mm_shuffle_ps(packed, M3D_MIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));

    M3D_VECTOR row = _mm_shuffle_ps(_mm_setzero_ps(), depthLanes, _MM_SHUFFLE(2, 0, 0, 0));
    proj.rows[2] = row; // 0, 0, depthScale, 0.f

    // The two leading zeros are taken from the row built just above.
    row = _mm_shuffle_ps(row, depthLanes, _MM_SHUFFLE(3, 1, 0, 0));
    proj.rows[3] = row; // 0, 0, depthScale * near, 1.0f
    return proj;
#endif
}
|
|
|
|
// Translation matrix from a vector offset.
// Rows 0..2 are the identity rows; row 3 is (Offset.x, Offset.y, Offset.z, 1).
// The w component of Offset is not used.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixTranslate(M3D_VECTOR Offset) noexcept {
#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        1.0f,          0.0f,          0.0f,          0.0f,
        0.0f,          1.0f,          0.0f,          0.0f,
        0.0f,          0.0f,          1.0f,          0.0f,
        Offset.v4f[0], Offset.v4f[1], Offset.v4f[2], 1.0f
    );
#else
    M3D_MATRIX translation;
    // Row 3 mixes Offset with the identity row through M3D_MSelect1110
    // (presumably x/y/z from Offset, w from the identity row).
    translation.rows[3] = M3D_V4Select(M3D_MIdentityR3.v, Offset, M3D_MSelect1110.v);
    translation.rows[2] = M3D_MIdentityR2.v;
    translation.rows[1] = M3D_MIdentityR1.v;
    translation.rows[0] = M3D_MIdentityR0.v;
    return translation;
#endif
}
|
|
|
|
// Scaling matrix from three scalar factors:
//   | ScaleX  0       0       0 |
//   | 0       ScaleY  0       0 |
//   | 0       0       ScaleZ  0 |
//   | 0       0       0       1 |
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixScale(float ScaleX, float ScaleY, float ScaleZ) noexcept {
#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        ScaleX, 0.0f,   0.0f,   0.0f,
        0.0f,   ScaleY, 0.0f,   0.0f,
        0.0f,   0.0f,   ScaleZ, 0.0f,
        0.0f,   0.0f,   0.0f,   1.0f
    );
#else
    M3D_MATRIX scaling;
    // _mm_set_ps takes its arguments from lane w down to lane x.
    scaling.rows[0] = _mm_set_ps(0.0f, 0.0f, 0.0f, ScaleX);
    scaling.rows[1] = _mm_set_ps(0.0f, 0.0f, ScaleY, 0.0f);
    scaling.rows[2] = _mm_set_ps(0.0f, ScaleZ, 0.0f, 0.0f);
    scaling.rows[3] = M3D_MIdentityR3.v;
    return scaling;
#endif
}
|
|
|
|
// Scaling matrix from a vector: the x, y and z components of Scale become the
// diagonal, row 3 is the identity row. The w component of Scale is not used.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixScale(M3D_VECTOR Scale) noexcept {
#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        Scale.v4f[0], 0.0f,         0.0f,         0.0f,
        0.0f,         Scale.v4f[1], 0.0f,         0.0f,
        0.0f,         0.0f,         Scale.v4f[2], 0.0f,
        0.0f,         0.0f,         0.0f,         1.0f
    );
#else
    M3D_MATRIX scaling;
    // Each row masks Scale down to a single lane (X, Y, Z respectively).
    scaling.rows[3] = M3D_MIdentityR3.v;
    scaling.rows[2] = _mm_and_ps(Scale, M3D_MMaskZ);
    scaling.rows[1] = _mm_and_ps(Scale, M3D_MMaskY);
    scaling.rows[0] = _mm_and_ps(Scale, M3D_MMaskX);
    return scaling;
#endif
}
|
|
|
|
// Translation matrix from three scalar offsets:
//   | 1        0        0        0 |
//   | 0        1        0        0 |
//   | 0        0        1        0 |
//   | OffsetX  OffsetY  OffsetZ  1 |
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixTranslate(float OffsetX, float OffsetY, float OffsetZ) noexcept {
#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        1.0f,    0.0f,    0.0f,    0.0f,
        0.0f,    1.0f,    0.0f,    0.0f,
        0.0f,    0.0f,    1.0f,    0.0f,
        OffsetX, OffsetY, OffsetZ, 1.0f
    );
#else
    M3D_MATRIX translation;
    translation.rows[3] = M3D_V4Set(OffsetX, OffsetY, OffsetZ, 1.f);
    translation.rows[2] = M3D_MIdentityR2.v;
    translation.rows[1] = M3D_MIdentityR1.v;
    translation.rows[0] = M3D_MIdentityR0.v;
    return translation;
#endif
}
|
|
|
|
// Rotation matrix around the X axis; Angle is passed to M3D_ScalarSinCos.
//   | 1   0    0    0 |
//   | 0   cos  sin  0 |
//   | 0  -sin  cos  0 |
//   | 0   0    0    1 |
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationX(float Angle) noexcept {
    float sinA;
    float cosA;
    M3D_ScalarSinCos(&sinA, &cosA, Angle);

#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        1.0f, 0.0f,  0.0f, 0.0f,
        0.0f, cosA,  sinA, 0.0f,
        0.0f, -sinA, cosA, 0.0f,
        0.0f, 0.0f,  0.0f, 1.0f
    );
#else
    // _mm_set_ss places the scalar in lane x and zeroes the other lanes.
    const M3D_VECTOR sinLane = _mm_set_ss(sinA);
    const M3D_VECTOR cosLane = _mm_set_ss(cosA);

    // x = 0, y = cos, z = sin, w = 0
    const M3D_VECTOR yRow = _mm_shuffle_ps(cosLane, sinLane, _MM_SHUFFLE(3, 0, 0, 3));
    // Swap y and z: x = 0, y = sin, z = cos, w = 0
    const M3D_VECTOR swapped = M3D_PERMUTE_PS(yRow, _MM_SHUFFLE(3, 1, 2, 0));
    // Flip the sign of y: x = 0, y = -sin, z = cos, w = 0
    const M3D_VECTOR zRow = _mm_mul_ps(swapped, M3D_MNegateY);

    M3D_MATRIX rotation;
    rotation.rows[0] = M3D_MIdentityR0;
    rotation.rows[1] = yRow;
    rotation.rows[2] = zRow;
    rotation.rows[3] = M3D_MIdentityR3;
    return rotation;
#endif
}
|
|
|
|
// Rotation matrix around the Y axis; Angle is passed to M3D_ScalarSinCos.
//   | cos  0  -sin  0 |
//   | 0    1   0    0 |
//   | sin  0   cos  0 |
//   | 0    0   0    1 |
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationY(float Angle) noexcept {
    float sinA;
    float cosA;
    M3D_ScalarSinCos(&sinA, &cosA, Angle);

#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        cosA, 0.0f, -sinA, 0.0f,
        0.0f, 1.0f, 0.0f,  0.0f,
        sinA, 0.0f, cosA,  0.0f,
        0.0f, 0.0f, 0.0f,  1.0f
    );
#else
    // _mm_set_ss places the scalar in lane x and zeroes the other lanes.
    const M3D_VECTOR sinLane = _mm_set_ss(sinA);
    const M3D_VECTOR cosLane = _mm_set_ss(cosA);

    // x = sin, y = 0, z = cos, w = 0
    const M3D_VECTOR zRow = _mm_shuffle_ps(sinLane, cosLane, _MM_SHUFFLE(3, 0, 3, 0));
    // Swap x and z: x = cos, y = 0, z = sin, w = 0
    const M3D_VECTOR swapped = M3D_PERMUTE_PS(zRow, _MM_SHUFFLE(3, 0, 1, 2));
    // Flip the sign of z: x = cos, y = 0, z = -sin, w = 0
    const M3D_VECTOR xRow = _mm_mul_ps(swapped, M3D_MNegateZ);

    M3D_MATRIX rotation;
    rotation.rows[0] = xRow;
    rotation.rows[1] = M3D_MIdentityR1;
    rotation.rows[2] = zRow;
    rotation.rows[3] = M3D_MIdentityR3;
    return rotation;
#endif
}
|
|
|
|
// Rotation matrix around the Z axis; Angle is passed to M3D_ScalarSinCos.
//   |  cos  sin  0  0 |
//   | -sin  cos  0  0 |
//   |  0    0    1  0 |
//   |  0    0    0  1 |
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationZ(float Angle) noexcept {
    float sinA;
    float cosA;
    M3D_ScalarSinCos(&sinA, &cosA, Angle);

#ifdef DISABLE_INTRINSICS
    return M3D_MATRIX(
        cosA,  sinA, 0.0f, 0.0f,
        -sinA, cosA, 0.0f, 0.0f,
        0.0f,  0.0f, 1.0f, 0.0f,
        0.0f,  0.0f, 0.0f, 1.0f
    );
#else
    // _mm_set_ss places the scalar in lane x and zeroes the other lanes.
    const M3D_VECTOR sinLane = _mm_set_ss(sinA);
    const M3D_VECTOR cosLane = _mm_set_ss(cosA);

    // Interleave the low lanes: x = cos, y = sin, z = 0, w = 0
    const M3D_VECTOR xRow = _mm_unpacklo_ps(cosLane, sinLane);
    // Swap x and y: x = sin, y = cos, z = 0, w = 0
    const M3D_VECTOR swapped = M3D_PERMUTE_PS(xRow, _MM_SHUFFLE(3, 2, 0, 1));
    // Flip the sign of x: x = -sin, y = cos, z = 0, w = 0
    const M3D_VECTOR yRow = _mm_mul_ps(swapped, M3D_MNegateX);

    M3D_MATRIX rotation;
    rotation.rows[0] = xRow;
    rotation.rows[1] = yRow;
    rotation.rows[2] = M3D_MIdentityR2;
    rotation.rows[3] = M3D_MIdentityR3;
    return rotation;
#endif
}
|
|
|
|
// Rotation matrix from three angles packed in a vector.
// Component 0 of Angles feeds cp/sp, component 1 feeds cy/sy and component 2
// feeds cr/sr (presumably pitch, yaw and roll - confirm against callers).
// The w component of Angles does not reach the result.
//
// Result (c* = cosine, s* = sine of the respective angle):
//   | cr*cy + sr*sp*sy   sr*cp   sr*sp*cy - cr*sy   0 |
//   | cr*sp*sy - sr*cy   cr*cp   sr*sy + cr*sp*cy   0 |
//   | cp*sy             -sp      cp*cy              0 |
//   | 0                  0       0                  1 |
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotation(M3D_VECTOR Angles) noexcept {
#ifdef DISABLE_INTRINSICS
    const float cp = cosf(Angles.v4f[0]);
    const float sp = sinf(Angles.v4f[0]);

    const float cy = cosf(Angles.v4f[1]);
    const float sy = sinf(Angles.v4f[1]);

    const float cr = cosf(Angles.v4f[2]);
    const float sr = sinf(Angles.v4f[2]);

    return M3D_MATRIX(
        cr * cy + sr * sp * sy, sr * cp, sr * sp * cy - cr * sy, 0.0f,
        cr * sp * sy - sr * cy, cr * cp, sr * sy + cr * sp * cy, 0.0f,
        cp * sy,                -sp,     cp * cy,                0.0f,
        0.0f,                   0.0f,    0.0f,                   1.0f
    );
#else
    // Per-lane signs applied to the second product term.
    static const M3D_V4F32 termSigns = {{{1.0f, -1.0f, -1.0f, 1.0f}}};

    M3D_VECTOR sines, cosines;
    M3D_V4SinCos(&sines, &cosines, Angles);

    // In the permutes below, 0* picks a lane of `sines` and 1* a lane of `cosines`
    // (lane contents in the comments assume that convention; they match the scalar path).
    const M3D_VECTOR negSines = M3D_V4Negate(sines);
    const M3D_VECTOR sinPitch = M3D_V4SplatX(sines);

    // simple = ( cp*sy, sr*cp, cr*cp, cp*cy )
    const M3D_VECTOR simpleA = M3D_V4Permute<M3D_PERMUTE_1X, M3D_PERMUTE_0Z, M3D_PERMUTE_1Z, M3D_PERMUTE_1X>(sines, cosines);
    const M3D_VECTOR simpleB = M3D_V4Permute<M3D_PERMUTE_0Y, M3D_PERMUTE_1X, M3D_PERMUTE_1X, M3D_PERMUTE_1Y>(sines, cosines);
    const M3D_VECTOR simple = M3D_V4Multiply(simpleA, simpleB);

    // signedTerm = ( cr*cy, -sr*cy, -cr*sy, sr*sy )
    const M3D_VECTOR rollMix = M3D_V4Permute<M3D_PERMUTE_1Z, M3D_PERMUTE_0Z, M3D_PERMUTE_1Z, M3D_PERMUTE_0Z>(sines, cosines);
    const M3D_VECTOR yawMix = M3D_V4Permute<M3D_PERMUTE_1Y, M3D_PERMUTE_1Y, M3D_PERMUTE_0Y, M3D_PERMUTE_0Y>(sines, cosines);
    M3D_VECTOR signedTerm = M3D_V4Multiply(rollMix, termSigns.v);
    signedTerm = M3D_V4Multiply(signedTerm, yawMix);

    // compound = ( sr*sp*sy + cr*cy, cr*sp*sy - sr*cy, sr*sp*cy - cr*sy, cr*sp*cy + sr*sy )
    const M3D_VECTOR rollSwap = M3D_V4Permute<M3D_PERMUTE_0Z, M3D_PERMUTE_1Z, M3D_PERMUTE_0Z, M3D_PERMUTE_1Z>(sines, cosines);
    const M3D_VECTOR yawSwap = M3D_V4Permute<M3D_PERMUTE_0Y, M3D_PERMUTE_0Y, M3D_PERMUTE_1Y, M3D_PERMUTE_1Y>(sines, cosines);
    M3D_VECTOR compound = M3D_V4Multiply(rollSwap, sinPitch);
    compound = M3D_V4MultiplyAdd(compound, yawSwap, signedTerm);

    // Gather the three rows; the w lanes are replaced by the selects below.
    const M3D_VECTOR row0 = M3D_V4Permute<M3D_PERMUTE_1X, M3D_PERMUTE_0Y, M3D_PERMUTE_1Z, M3D_PERMUTE_0W>(simple, compound);
    const M3D_VECTOR row1 = M3D_V4Permute<M3D_PERMUTE_1Y, M3D_PERMUTE_0Z, M3D_PERMUTE_1W, M3D_PERMUTE_0W>(simple, compound);
    const M3D_VECTOR row2 = M3D_V4Permute<M3D_PERMUTE_0X, M3D_PERMUTE_1X, M3D_PERMUTE_0W, M3D_PERMUTE_0W>(simple, negSines);

    M3D_MATRIX rotation;
    rotation.rows[0] = M3D_V4Select(M3D_MZero, row0, M3D_MSelect1110.v);
    rotation.rows[1] = M3D_V4Select(M3D_MZero, row1, M3D_MSelect1110.v);
    rotation.rows[2] = M3D_V4Select(M3D_MZero, row2, M3D_MSelect1110.v);
    rotation.rows[3] = M3D_MIdentityR3;
    return rotation;
#endif
}
|
|
|
|
// Rotation matrix around an axis that is used as given.
// NormalAxis is not normalized here; M3D_TransformMatrixRotationAxis normalizes before calling this.
//
//   NormalAxis - rotation axis (x, y, z are used)
//   Angle      - rotation angle, passed to M3D_ScalarSinCos
//
// With n = NormalAxis, s = sin, c = cos and t = 1 - c, the building blocks are
//   diag  = t*n*n + c                     (diagonal entries)
//   cross = t * (n.y*n.z, n.z*n.x, n.x*n.y)
//   plus  = s*n + cross,  minus = cross - s*n
// and the rows are
//   row0 = ( diag.x,  plus.z,  minus.y, 0 )
//   row1 = ( minus.z, diag.y,  plus.x,  0 )
//   row2 = ( plus.y,  minus.x, diag.z,  0 )
//   row3 = M3D_MIdentityR3
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationNormal(M3D_VECTOR NormalAxis, float Angle) noexcept {
    float sinA;
    float cosA;
    M3D_ScalarSinCos(&sinA, &cosA, Angle);

#ifdef DISABLE_INTRINSICS
    // packedTrig = ( sin, cos, 1 - cos, 0 ); its w lane supplies the zero in each row's w.
    const M3D_VECTOR packedTrig = M3D_V4Set(sinA, cosA, 1.0f - cosA, 0.0f);

    const M3D_VECTOR sinAll = M3D_V4SplatX(packedTrig);
    const M3D_VECTOR cosAll = M3D_V4SplatY(packedTrig);
    const M3D_VECTOR versineAll = M3D_V4SplatZ(packedTrig);

    // Rotated copies of the axis: (y, z, x, w) and (z, x, y, w).
    const M3D_VECTOR axisYZX = M3D_V4Swizzle<M3D_SWIZZLE_Y, M3D_SWIZZLE_Z, M3D_SWIZZLE_X, M3D_SWIZZLE_W>(NormalAxis);
    const M3D_VECTOR axisZXY = M3D_V4Swizzle<M3D_SWIZZLE_Z, M3D_SWIZZLE_X, M3D_SWIZZLE_Y, M3D_SWIZZLE_W>(NormalAxis);

    M3D_VECTOR cross = M3D_V4Multiply(versineAll, axisYZX);
    cross = M3D_V4Multiply(cross, axisZXY);

    M3D_VECTOR diag = M3D_V4Multiply(versineAll, NormalAxis);
    diag = M3D_V4MultiplyAdd(diag, NormalAxis, cosAll);

    const M3D_VECTOR plus = M3D_V4MultiplyAdd(sinAll, NormalAxis, cross);
    const M3D_VECTOR minus = M3D_V4NegativeMultiplySubtract(sinAll, NormalAxis, cross);

    // diagW0: diagonal entries in x/y/z, w taken from packedTrig (0).
    const M3D_VECTOR diagW0 = M3D_V4Select(packedTrig, diag, M3D_MSelect1110.v);
    // offA = ( plus.z, minus.y, minus.z, plus.x ), offB = ( plus.y, minus.x, plus.y, minus.x )
    const M3D_VECTOR offA = M3D_V4Permute<M3D_PERMUTE_0Z, M3D_PERMUTE_1Y, M3D_PERMUTE_1Z, M3D_PERMUTE_0X>(plus, minus);
    const M3D_VECTOR offB = M3D_V4Permute<M3D_PERMUTE_0Y, M3D_PERMUTE_1X, M3D_PERMUTE_0Y, M3D_PERMUTE_1X>(plus, minus);

    M3D_MATRIX rotation;
    rotation.rows[0] = M3D_V4Permute<M3D_PERMUTE_0X, M3D_PERMUTE_1X, M3D_PERMUTE_1Y, M3D_PERMUTE_0W>(diagW0, offA);
    rotation.rows[1] = M3D_V4Permute<M3D_PERMUTE_1Z, M3D_PERMUTE_0Y, M3D_PERMUTE_1W, M3D_PERMUTE_0W>(diagW0, offA);
    rotation.rows[2] = M3D_V4Permute<M3D_PERMUTE_1X, M3D_PERMUTE_1Y, M3D_PERMUTE_0Z, M3D_PERMUTE_0W>(diagW0, offB);
    rotation.rows[3] = M3D_MIdentityR3.v;
    return rotation;
#else
    const M3D_VECTOR versineAll = _mm_set_ps1(1.0f - cosA);
    const M3D_VECTOR cosAll = _mm_set_ps1(cosA);
    const M3D_VECTOR sinAll = _mm_set_ps1(sinA);

    // Rotated copies of the axis: (y, z, x, w) and (z, x, y, w).
    const M3D_VECTOR axisYZX = M3D_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 0, 2, 1));
    const M3D_VECTOR axisZXY = M3D_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 1, 0, 2));

    M3D_VECTOR cross = _mm_mul_ps(versineAll, axisYZX);
    cross = _mm_mul_ps(cross, axisZXY);

    M3D_VECTOR diag = _mm_mul_ps(versineAll, NormalAxis);
    diag = _mm_mul_ps(diag, NormalAxis);
    diag = _mm_add_ps(diag, cosAll);

    // sin * axis feeds both the sum and the difference.
    const M3D_VECTOR sinAxis = _mm_mul_ps(sinAll, NormalAxis);
    const M3D_VECTOR plus = _mm_add_ps(sinAxis, cross);
    const M3D_VECTOR minus = _mm_sub_ps(cross, sinAxis);

    // Diagonal entries masked by M3D_MMask3 (presumably clears w).
    const M3D_VECTOR diagW0 = _mm_and_ps(diag, M3D_MMask3);

    // offA = ( plus.z, minus.y, minus.z, plus.x )
    M3D_VECTOR offA = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(2, 1, 2, 0));
    offA = M3D_PERMUTE_PS(offA, _MM_SHUFFLE(0, 3, 2, 1));
    // offB = ( plus.y, minus.x, plus.y, minus.x )
    M3D_VECTOR offB = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0, 0, 1, 1));
    offB = M3D_PERMUTE_PS(offB, _MM_SHUFFLE(2, 0, 2, 0));

    M3D_MATRIX rotation;

    // row0 = ( diag.x, plus.z, minus.y, 0 )
    M3D_VECTOR row = _mm_shuffle_ps(diagW0, offA, _MM_SHUFFLE(1, 0, 3, 0));
    row = M3D_PERMUTE_PS(row, _MM_SHUFFLE(1, 3, 2, 0));
    rotation.rows[0] = row;

    // row1 = ( minus.z, diag.y, plus.x, 0 )
    row = _mm_shuffle_ps(diagW0, offA, _MM_SHUFFLE(3, 2, 3, 1));
    row = M3D_PERMUTE_PS(row, _MM_SHUFFLE(1, 3, 0, 2));
    rotation.rows[1] = row;

    // row2 = ( plus.y, minus.x, diag.z, 0 )
    rotation.rows[2] = _mm_shuffle_ps(offB, diagW0, _MM_SHUFFLE(3, 2, 1, 0));
    rotation.rows[3] = M3D_MIdentityR3.v;
    return rotation;
#endif
}
|
|
|
|
// Rotation matrix around an arbitrary axis.
// The axis is normalized with M3D_V3Normalize and then handed to
// M3D_TransformMatrixRotationNormal together with the angle.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationAxis(M3D_VECTOR axis, float angle) noexcept {
    return M3D_TransformMatrixRotationNormal(M3D_V3Normalize(axis), angle);
}
|
|
|
|
//TODO: transform matrix is incomplete
|
|
//v_tri[v_cnt].position.z = ((far+near)/2)+((far-near)/2)*_2dCoord.z;
|
|
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixViewport(float _w, float _h, float _wOffset, float _hOffset) noexcept {
|
|
const float widthDiv2 = _w / 2;
|
|
const float heightDiv2 = _h / 2;
|
|
|
|
#ifdef DISABLE_INTRINSICS
|
|
M3D_MATRIX ret;
|
|
ret.mat[0][0] = widthDiv2;
|
|
ret.mat[0][1] = 0.0f;
|
|
ret.mat[0][2] = 0.0f;
|
|
ret.mat[0][3] = 0.0f;
|
|
|
|
ret.mat[1][0] = 0.0f;
|
|
ret.mat[1][1] = -heightDiv2;
|
|
ret.mat[1][2] = 0.0f;
|
|
ret.mat[1][3] = 0.0f;
|
|
|
|
ret.mat[2][0] = 0.0f;
|
|
ret.mat[2][1] = 0.0f;
|
|
ret.mat[2][2] = 1.0f; // maxZ-minZ ignored
|
|
ret.mat[2][3] = 0.0f; // minZ ignored
|
|
|
|
ret.mat[3][0] = _wOffset + widthDiv2;
|
|
ret.mat[3][1] = _hOffset + heightDiv2;
|
|
ret.mat[3][2] = 0.0f;
|
|
ret.mat[3][3] = 1.0f;
|
|
return ret;
|
|
#else
|
|
M3D_MATRIX ret;
|
|
ret.rows[0] = M3D_V4Set(widthDiv2, 0, 0, 0);
|
|
ret.rows[1] = M3D_V4Set(0, -heightDiv2, 0, 0);
|
|
ret.rows[2] = M3D_MIdentityR2.v; // maxZ-minZ and minZ are ignored
|
|
ret.rows[3] = M3D_V4Set(_wOffset + widthDiv2, _hOffset + heightDiv2, 0, 1);
|
|
return ret;
|
|
#endif
|
|
} |