GCC issue 54412 fix

AVX data is not correctly aligned on the stack in the following cases:
- when using inline functions (GCC does not always inline a function),
- when AVX data is passed by value instead of by reference (the data is copied onto the stack dynamically),
- when the O2 or O3 optimization flags are not used (this affects the behaviors described in the previous points).
This commit is contained in:
JackCarterSmith 2024-11-01 16:43:19 +01:00
parent aa871b8b76
commit a8e396d08f
Signed by: JackCarterSmith
GPG Key ID: 832E52F4E23F8F24
5 changed files with 129 additions and 49 deletions

View File

@ -11,7 +11,7 @@ if(NOT DEFINED PROJECT_BINARY_DIR)
endif() endif()
if(NOT MSVC) if(NOT MSVC)
add_compile_options(-Wall -march=native -mavx2 -mfma -msse4.2) add_compile_options(-Wall)
else() else()
add_compile_options(/Wall) add_compile_options(/Wall)
endif() endif()
@ -26,6 +26,16 @@ project(ProtoTank VERSION 0.1.0 DESCRIPTION "Arcade 80s-style game with tanks" L
# Compilation option # Compilation option
option(DISABLE_CPU_OPTI "Disable CPU optimizations" OFF) option(DISABLE_CPU_OPTI "Disable CPU optimizations" OFF)
# CPU-specific optimization flags; applied unless DISABLE_CPU_OPTI is set to ON.
if(NOT DISABLE_CPU_OPTI)
if(NOT MSVC)
# Alternative kept for reference: same architecture selection, but generic tuning.
#add_compile_options(-march=native -mtune=generic)
# Generate and tune code for the CPU of the build machine.
add_compile_options(-march=native -mtune=native)
# Explicitly enable the SIMD instruction sets on top of -march=native.
add_compile_options(-msse2 -msse4.2 -mavx -mavx2 -mfma)
else()
# NOTE(review): several /arch switches are passed at once; presumably only one takes effect
# with MSVC -- confirm against the /arch documentation and keep a single switch if so.
add_compile_options(/fp:fast /arch:SSE2 /arch:SSE4.2 /arch:AVX /arch:AVX2 /GL)
endif()
endif()
include(FindPkgConfig) include(FindPkgConfig)
include(CheckIncludeFile) include(CheckIncludeFile)
include(CheckCSourceCompiles) include(CheckCSourceCompiles)

View File

@ -15,6 +15,12 @@
#error This header requires C++ #error This header requires C++
#endif #endif
// INLINE_AVX_FIX replaces the plain `inline` specifier on header-defined functions
// (see the definitions prefixed with it in this project).
// - GCC-compatible compilers: force inlining, as a workaround for GCC issue 54412.
// - MSVC: fall back to a plain `inline`. The macro must never expand to nothing there,
//   otherwise the header-defined functions lose their `inline` specifier and are
//   emitted as duplicate symbols in every translation unit including the header.
#ifdef _MSC_VER
#define INLINE_AVX_FIX inline
#else
#define INLINE_AVX_FIX __attribute__((always_inline)) inline
#endif
#ifndef DISABLE_INTRINSICS #ifndef DISABLE_INTRINSICS
#ifdef NO_MOVNT #ifdef NO_MOVNT
@ -85,6 +91,11 @@ constexpr uint32_t M3D_PERMUTE_1Y = 5;
constexpr uint32_t M3D_PERMUTE_1Z = 6; constexpr uint32_t M3D_PERMUTE_1Z = 6;
constexpr uint32_t M3D_PERMUTE_1W = 7; constexpr uint32_t M3D_PERMUTE_1W = 7;
// Component indices 0..3 named after the X, Y, Z and W components.
// Presumably meant to be passed as the selectors of M3D_V4Swizzle -- confirm against callers.
constexpr uint32_t M3D_SWIZZLE_X = 0;
constexpr uint32_t M3D_SWIZZLE_Y = 1;
constexpr uint32_t M3D_SWIZZLE_Z = 2;
constexpr uint32_t M3D_SWIZZLE_W = 3;
constexpr float M3D_Deg2Rad(float a) noexcept { return a * (M3D_PI / 180.0f); } constexpr float M3D_Deg2Rad(float a) noexcept { return a * (M3D_PI / 180.0f); }
constexpr float M3D_Rad2Deg(float a) noexcept { return a * (180.0f / M3D_PI); } constexpr float M3D_Rad2Deg(float a) noexcept { return a * (180.0f / M3D_PI); }
@ -358,6 +369,7 @@ M3D_VECTOR M3D_V4SetX(M3D_VECTOR V, float x) noexcept;
M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept; M3D_VECTOR M3D_V4SetY(M3D_VECTOR V, float y) noexcept;
M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept; M3D_VECTOR M3D_V4SetZ(M3D_VECTOR V, float z) noexcept;
M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept; M3D_VECTOR M3D_V4SetW(M3D_VECTOR V, float w) noexcept;
M3D_VECTOR M3D_V4Swizzle(M3D_VECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3) noexcept;
M3D_VECTOR M3D_V4Permute(M3D_VECTOR V1, M3D_VECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW) noexcept; M3D_VECTOR M3D_V4Permute(M3D_VECTOR V1, M3D_VECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW) noexcept;
M3D_VECTOR M3D_V4SplatX(M3D_VECTOR V) noexcept; M3D_VECTOR M3D_V4SplatX(M3D_VECTOR V) noexcept;
M3D_VECTOR M3D_V4SplatY(M3D_VECTOR V) noexcept; M3D_VECTOR M3D_V4SplatY(M3D_VECTOR V) noexcept;
@ -376,6 +388,7 @@ M3D_VECTOR M3D_V4NegativeMultiplySubtract(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECT
bool M3D_V4EqualInt(M3D_VECTOR V1, M3D_VECTOR V2) noexcept; bool M3D_V4EqualInt(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
M3D_VECTOR M3D_V4Abs(M3D_VECTOR V) noexcept; M3D_VECTOR M3D_V4Abs(M3D_VECTOR V) noexcept;
M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept; M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept;
M3D_VECTOR M3D_V4LengthSq(M3D_VECTOR V) noexcept;
M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept; M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept;
M3D_VECTOR M3D_V4Scale(M3D_VECTOR V, float scale) noexcept; M3D_VECTOR M3D_V4Scale(M3D_VECTOR V, float scale) noexcept;
M3D_VECTOR M3D_V4Select(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECTOR Control) noexcept; M3D_VECTOR M3D_V4Select(M3D_VECTOR V1, M3D_VECTOR V2, M3D_VECTOR Control) noexcept;
@ -483,6 +496,31 @@ template<> inline M3D_VECTOR M3D_V4Permute<4, 1, 6, 7>(M3D_VECTOR V1, M3D_VECTOR
template<> inline M3D_VECTOR M3D_V4Permute<0, 5, 6, 7>(M3D_VECTOR V1, M3D_VECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xE); } template<> inline M3D_VECTOR M3D_V4Permute<0, 5, 6, 7>(M3D_VECTOR V1, M3D_VECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xE); }
#endif #endif
// Compile-time swizzle: builds a vector whose components 0..3 are taken from
// components SwizzleX, SwizzleY, SwizzleZ and SwizzleW of V respectively.
// Each selector must be in the range 0..3; this is enforced at compile time because an
// out-of-range value would silently produce a wrong shuffle immediate (intrinsics build)
// or an out-of-range component index (DISABLE_INTRINSICS build).
template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
inline M3D_VECTOR M3D_V4Swizzle(M3D_VECTOR V) noexcept {
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#ifndef DISABLE_INTRINSICS
    // _MM_SHUFFLE takes its selectors from the highest component down to the lowest.
    return M3D_PERMUTE_PS(V, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#else
    // No intrinsics available: defer to the runtime-indexed overload.
    return M3D_V4Swizzle(V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#endif
}
// Explicit specializations of the compile-time swizzle: each listed selector pattern is
// implemented with a single dedicated intrinsic instead of the generic permute.
#if !defined(DISABLE_INTRINSICS)
// (x, y, x, y)
template<> inline M3D_VECTOR M3D_V4Swizzle<0, 1, 0, 1>(M3D_VECTOR V) noexcept { return _mm_movelh_ps(V, V); }
// (z, w, z, w)
template<> inline M3D_VECTOR M3D_V4Swizzle<2, 3, 2, 3>(M3D_VECTOR V) noexcept { return _mm_movehl_ps(V, V); }
// (x, x, y, y)
template<> inline M3D_VECTOR M3D_V4Swizzle<0, 0, 1, 1>(M3D_VECTOR V) noexcept { return _mm_unpacklo_ps(V, V); }
// (z, z, w, w)
template<> inline M3D_VECTOR M3D_V4Swizzle<2, 2, 3, 3>(M3D_VECTOR V) noexcept { return _mm_unpackhi_ps(V, V); }
#endif
// Only compiled when SSE3_INTRINSICS is defined.
#if defined(SSE3_INTRINSICS) && !defined(DISABLE_INTRINSICS)
// (x, x, z, z)
template<> inline M3D_VECTOR M3D_V4Swizzle<0, 0, 2, 2>(M3D_VECTOR V) noexcept { return _mm_moveldup_ps(V); }
// (y, y, w, w)
template<> inline M3D_VECTOR M3D_V4Swizzle<1, 1, 3, 3>(M3D_VECTOR V) noexcept { return _mm_movehdup_ps(V); }
#endif
// Only compiled when AVX2_INTRINSICS is defined.
#if defined(AVX2_INTRINSICS) && !defined(DISABLE_INTRINSICS)
// (x, x, x, x)
template<> inline M3D_VECTOR M3D_V4Swizzle<0, 0, 0, 0>(M3D_VECTOR V) noexcept { return _mm_broadcastss_ps(V); }
#endif
M3D_VECTOR M3D_QMultiply(M3D_VECTOR Q1, M3D_VECTOR Q2) noexcept; M3D_VECTOR M3D_QMultiply(M3D_VECTOR Q1, M3D_VECTOR Q2) noexcept;
M3D_VECTOR M3D_QConjugate(M3D_VECTOR Q) noexcept; M3D_VECTOR M3D_QConjugate(M3D_VECTOR Q) noexcept;

View File

@ -128,7 +128,7 @@ inline void M3D_BoundingBox::CreateFromPoints(M3D_BoundingBox& Out, size_t Count
M3D_V4StoreF3(&Out.Extents, M3D_V4Scale(M3D_V4Subtract(vMax, vMin), 0.5f)); M3D_V4StoreF3(&Out.Extents, M3D_V4Scale(M3D_V4Subtract(vMax, vMin), 0.5f));
} }
inline void M3D_BoundingBox::Transform(M3D_BoundingBox& Out, M3D_MATRIX M) const noexcept { INLINE_AVX_FIX void M3D_BoundingBox::Transform(M3D_BoundingBox& Out, M3D_MATRIX M) const noexcept {
// Load center and extents. // Load center and extents.
M3D_VECTOR vCenter = M3D_V4LoadF3(&Center); M3D_VECTOR vCenter = M3D_V4LoadF3(&Center);
M3D_VECTOR vExtents = M3D_V4LoadF3(&Extents); M3D_VECTOR vExtents = M3D_V4LoadF3(&Extents);
@ -163,11 +163,11 @@ inline void M3D_BoundingBox::GetCorners(M3D_F3* Corners) const noexcept {
} }
} }
// Constructs the frustum by delegating to CreateFromMatrix, which fills *this from the
// given projection matrix and the rhcoords flag.
inline M3D_BoundingFrustum::M3D_BoundingFrustum(M3D_MATRIX Projection, bool rhcoords) noexcept { INLINE_AVX_FIX M3D_BoundingFrustum::M3D_BoundingFrustum(M3D_MATRIX Projection, bool rhcoords) noexcept {
CreateFromMatrix(*this, Projection, rhcoords); CreateFromMatrix(*this, Projection, rhcoords);
} }
inline void M3D_BoundingFrustum::Transform(M3D_BoundingFrustum& Out, M3D_MATRIX M) const noexcept { INLINE_AVX_FIX void M3D_BoundingFrustum::Transform(M3D_BoundingFrustum& Out, M3D_MATRIX M) const noexcept {
// Load the frustum. // Load the frustum.
M3D_VECTOR vOrigin = M3D_V4LoadF3(&Origin); M3D_VECTOR vOrigin = M3D_V4LoadF3(&Origin);
M3D_VECTOR vOrientation = M3D_V4LoadF4(&Orientation); M3D_VECTOR vOrientation = M3D_V4LoadF4(&Orientation);
@ -318,7 +318,7 @@ inline void M3D_BoundingFrustum::GetPlanes(M3D_VECTOR* NearPlane, M3D_VECTOR* Fa
} }
} }
inline void M3D_BoundingFrustum::CreateFromMatrix(M3D_BoundingFrustum& Out, M3D_MATRIX Projection, bool rhcoords) noexcept { INLINE_AVX_FIX void M3D_BoundingFrustum::CreateFromMatrix(M3D_BoundingFrustum& Out, M3D_MATRIX Projection, bool rhcoords) noexcept {
// Corners of the projection frustum in NDC space. // Corners of the projection frustum in NDC space.
static M3D_V4F32 NDCPoints[6] = { static M3D_V4F32 NDCPoints[6] = {
{{{1.0f, 0.0f, 1.0f, 1.0f}}}, // right (at far plane) {{{1.0f, 0.0f, 1.0f, 1.0f}}}, // right (at far plane)

View File

@ -13,7 +13,7 @@ inline M3D_MATRIX::M3D_MATRIX(
rows[3] = M3D_V4Set(f30, f31, f32, f33); rows[3] = M3D_V4Set(f30, f31, f32, f33);
} }
inline M3D_MATRIX M3D_MATRIX::operator- () const noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator- () const noexcept {
M3D_MATRIX ret; M3D_MATRIX ret;
ret.rows[0] = M3D_V4Negate(rows[0]); ret.rows[0] = M3D_V4Negate(rows[0]);
ret.rows[1] = M3D_V4Negate(rows[1]); ret.rows[1] = M3D_V4Negate(rows[1]);
@ -22,14 +22,14 @@ inline M3D_MATRIX M3D_MATRIX::operator- () const noexcept {
return ret; return ret;
} }
// In-place matrix addition: adds each row of M to the matching row of this matrix
// with M3D_V4Add and returns a reference to this matrix.
inline M3D_MATRIX& M3D_MATRIX::operator+= (M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator+= (M3D_MATRIX M) noexcept {
rows[0] = M3D_V4Add(rows[0], M.rows[0]); rows[0] = M3D_V4Add(rows[0], M.rows[0]);
rows[1] = M3D_V4Add(rows[1], M.rows[1]); rows[1] = M3D_V4Add(rows[1], M.rows[1]);
rows[2] = M3D_V4Add(rows[2], M.rows[2]); rows[2] = M3D_V4Add(rows[2], M.rows[2]);
rows[3] = M3D_V4Add(rows[3], M.rows[3]); rows[3] = M3D_V4Add(rows[3], M.rows[3]);
return *this; return *this;
} }
inline M3D_MATRIX M3D_MATRIX::operator+ (M3D_MATRIX M) const noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator+ (M3D_MATRIX M) const noexcept {
M3D_MATRIX ret; M3D_MATRIX ret;
ret.rows[0] = M3D_V4Add(rows[0], M.rows[0]); ret.rows[0] = M3D_V4Add(rows[0], M.rows[0]);
ret.rows[1] = M3D_V4Add(rows[1], M.rows[1]); ret.rows[1] = M3D_V4Add(rows[1], M.rows[1]);
@ -38,14 +38,14 @@ inline M3D_MATRIX M3D_MATRIX::operator+ (M3D_MATRIX M) const noexcept {
return ret; return ret;
} }
// In-place matrix subtraction: subtracts each row of M from the matching row of this
// matrix with M3D_V4Subtract and returns a reference to this matrix.
inline M3D_MATRIX& M3D_MATRIX::operator-= (M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator-= (M3D_MATRIX M) noexcept {
rows[0] = M3D_V4Subtract(rows[0], M.rows[0]); rows[0] = M3D_V4Subtract(rows[0], M.rows[0]);
rows[1] = M3D_V4Subtract(rows[1], M.rows[1]); rows[1] = M3D_V4Subtract(rows[1], M.rows[1]);
rows[2] = M3D_V4Subtract(rows[2], M.rows[2]); rows[2] = M3D_V4Subtract(rows[2], M.rows[2]);
rows[3] = M3D_V4Subtract(rows[3], M.rows[3]); rows[3] = M3D_V4Subtract(rows[3], M.rows[3]);
return *this; return *this;
} }
inline M3D_MATRIX M3D_MATRIX::operator- (M3D_MATRIX M) const noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator- (M3D_MATRIX M) const noexcept {
M3D_MATRIX ret; M3D_MATRIX ret;
ret.rows[0] = M3D_V4Subtract(rows[0], M.rows[0]); ret.rows[0] = M3D_V4Subtract(rows[0], M.rows[0]);
ret.rows[1] = M3D_V4Subtract(rows[1], M.rows[1]); ret.rows[1] = M3D_V4Subtract(rows[1], M.rows[1]);
@ -54,22 +54,22 @@ inline M3D_MATRIX M3D_MATRIX::operator- (M3D_MATRIX M) const noexcept {
return ret; return ret;
} }
// In-place matrix product: replaces this matrix with M3D_MMultiply(*this, M)
// (this matrix is the left operand) and returns a reference to it.
inline M3D_MATRIX& M3D_MATRIX::operator*=(M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator*=(M3D_MATRIX M) noexcept {
*this = M3D_MMultiply(*this, M); *this = M3D_MMultiply(*this, M);
return *this; return *this;
} }
// Matrix product: returns M3D_MMultiply(*this, M) without modifying this matrix.
inline M3D_MATRIX M3D_MATRIX::operator*(M3D_MATRIX M) const noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator*(M3D_MATRIX M) const noexcept {
return M3D_MMultiply(*this, M); return M3D_MMultiply(*this, M);
} }
// In-place scalar scaling: scales each of the four rows by S with M3D_V4Scale
// and returns a reference to this matrix.
inline M3D_MATRIX& M3D_MATRIX::operator*= (float S) noexcept { INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator*= (float S) noexcept {
rows[0] = M3D_V4Scale(rows[0], S); rows[0] = M3D_V4Scale(rows[0], S);
rows[1] = M3D_V4Scale(rows[1], S); rows[1] = M3D_V4Scale(rows[1], S);
rows[2] = M3D_V4Scale(rows[2], S); rows[2] = M3D_V4Scale(rows[2], S);
rows[3] = M3D_V4Scale(rows[3], S); rows[3] = M3D_V4Scale(rows[3], S);
return *this; return *this;
} }
inline M3D_MATRIX M3D_MATRIX::operator* (float S) const noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator* (float S) const noexcept {
M3D_MATRIX ret; M3D_MATRIX ret;
ret.rows[0] = M3D_V4Scale(rows[0], S); ret.rows[0] = M3D_V4Scale(rows[0], S);
ret.rows[1] = M3D_V4Scale(rows[1], S); ret.rows[1] = M3D_V4Scale(rows[1], S);
@ -77,7 +77,7 @@ inline M3D_MATRIX M3D_MATRIX::operator* (float S) const noexcept {
ret.rows[3] = M3D_V4Scale(rows[3], S); ret.rows[3] = M3D_V4Scale(rows[3], S);
return ret; return ret;
} }
inline M3D_MATRIX operator* (float S, M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_MATRIX operator* (float S, M3D_MATRIX M) noexcept {
M3D_MATRIX ret; M3D_MATRIX ret;
ret.rows[0] = M3D_V4Scale(M.rows[0], S); ret.rows[0] = M3D_V4Scale(M.rows[0], S);
ret.rows[1] = M3D_V4Scale(M.rows[1], S); ret.rows[1] = M3D_V4Scale(M.rows[1], S);
@ -86,7 +86,7 @@ inline M3D_MATRIX operator* (float S, M3D_MATRIX M) noexcept {
return ret; return ret;
} }
inline M3D_MATRIX& M3D_MATRIX::operator/= (float S) noexcept { INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator/= (float S) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_VECTOR vS = M3D_V4Replicate(S); M3D_VECTOR vS = M3D_V4Replicate(S);
rows[0] = M3D_V4Divide(rows[0], vS); rows[0] = M3D_V4Divide(rows[0], vS);
@ -103,7 +103,7 @@ inline M3D_MATRIX& M3D_MATRIX::operator/= (float S) noexcept {
return *this; return *this;
#endif #endif
} }
inline M3D_MATRIX M3D_MATRIX::operator/ (float S) const noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator/ (float S) const noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_VECTOR vS = M3D_V4Replicate(S); M3D_VECTOR vS = M3D_V4Replicate(S);
M3D_MATRIX ret; M3D_MATRIX ret;
@ -135,7 +135,7 @@ inline M3D_MATRIX M3D_MIdentity() noexcept {
return ret; return ret;
} }
inline M3D_MATRIX M3D_MMultiply(M3D_MATRIX M1, M3D_MATRIX& M2) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_MMultiply(M3D_MATRIX M1, M3D_MATRIX& M2) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_MATRIX ret; M3D_MATRIX ret;
// Cache the invariants in registers // Cache the invariants in registers
@ -309,7 +309,7 @@ inline M3D_MATRIX M3D_MMultiply(M3D_MATRIX M1, M3D_MATRIX& M2) noexcept {
#endif #endif
} }
inline M3D_MATRIX M3D_MTranspose(M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_MTranspose(M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
// Original matrix: // Original matrix:
// //
@ -374,7 +374,7 @@ inline M3D_MATRIX M3D_MTranspose(M3D_MATRIX M) noexcept {
#endif #endif
} }
inline M3D_MATRIX M3D_MInverse(M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_MInverse(M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_MATRIX MT = M3D_MTranspose(M); M3D_MATRIX MT = M3D_MTranspose(M);
@ -591,7 +591,7 @@ inline M3D_MATRIX M3D_MInverse(M3D_MATRIX M) noexcept {
/* -------------------------------------------------------------------------------------------------------------------------- */ /* -------------------------------------------------------------------------------------------------------------------------- */
inline M3D_VECTOR M3D_QRotationMatrix(M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_VECTOR M3D_QRotationMatrix(M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_V4F32 q; M3D_V4F32 q;
float r22 = M.mat[2][2]; float r22 = M.mat[2][2];
@ -733,7 +733,7 @@ inline M3D_VECTOR M3D_V3Rotate(M3D_VECTOR V, M3D_VECTOR RotationQuaternion) noex
return M3D_QMultiply(Result, RotationQuaternion); return M3D_QMultiply(Result, RotationQuaternion);
} }
inline M3D_VECTOR M3D_V3Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_VECTOR M3D_V3Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_VECTOR Z = M3D_V4SplatZ(V); M3D_VECTOR Z = M3D_V4SplatZ(V);
M3D_VECTOR Y = M3D_V4SplatY(V); M3D_VECTOR Y = M3D_V4SplatY(V);
@ -755,7 +755,7 @@ inline M3D_VECTOR M3D_V3Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#endif #endif
} }
inline void M3D_V3Transform( INLINE_AVX_FIX void M3D_V3Transform(
M3D_F4* pOutputStream, M3D_F4* pOutputStream,
size_t OutputStride, size_t OutputStride,
const M3D_F3* pInputStream, const M3D_F3* pInputStream,
@ -972,7 +972,7 @@ inline void M3D_V3Transform(
#endif #endif
} }
inline M3D_VECTOR M3D_V3TransformNormal(M3D_VECTOR V, M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_VECTOR M3D_V3TransformNormal(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_VECTOR Z = M3D_V4SplatZ(V); M3D_VECTOR Z = M3D_V4SplatZ(V);
M3D_VECTOR Y = M3D_V4SplatY(V); M3D_VECTOR Y = M3D_V4SplatY(V);
@ -994,7 +994,7 @@ inline M3D_VECTOR M3D_V3TransformNormal(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#endif #endif
} }
inline M3D_VECTOR M3D_V3TransformPersDiv(M3D_VECTOR V, M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_VECTOR M3D_V3TransformPersDiv(M3D_VECTOR V, M3D_MATRIX M) noexcept {
M3D_VECTOR Z = M3D_V4SplatZ(V); M3D_VECTOR Z = M3D_V4SplatZ(V);
M3D_VECTOR Y = M3D_V4SplatY(V); M3D_VECTOR Y = M3D_V4SplatY(V);
M3D_VECTOR X = M3D_V4SplatX(V); M3D_VECTOR X = M3D_V4SplatX(V);
@ -1007,7 +1007,7 @@ inline M3D_VECTOR M3D_V3TransformPersDiv(M3D_VECTOR V, M3D_MATRIX M) noexcept {
return M3D_V4Divide(Result, W); return M3D_V4Divide(Result, W);
} }
inline void M3D_V3TransformPersDiv( INLINE_AVX_FIX void M3D_V3TransformPersDiv(
M3D_F3* pOutputStream, M3D_F3* pOutputStream,
size_t OutputStride, size_t OutputStride,
const M3D_F3* pInputStream, const M3D_F3* pInputStream,
@ -1321,7 +1321,7 @@ inline void M3D_V3TransformPersDiv(
#endif #endif
} }
inline M3D_VECTOR M3D_V4Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept { INLINE_AVX_FIX M3D_VECTOR M3D_V4Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_VECTOR W = M3D_V4SplatW(V); M3D_VECTOR W = M3D_V4SplatW(V);
M3D_VECTOR Z = M3D_V4SplatZ(V); M3D_VECTOR Z = M3D_V4SplatZ(V);
@ -1346,7 +1346,7 @@ inline M3D_VECTOR M3D_V4Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#endif #endif
} }
inline void M3D_V4Transform(M3D_F4* pOutputStream, size_t OutputStride, const M3D_F4* pInputStream, size_t InputStride, size_t VectorCount, M3D_MATRIX M) noexcept { INLINE_AVX_FIX void M3D_V4Transform(M3D_F4* pOutputStream, size_t OutputStride, const M3D_F4* pInputStream, size_t InputStride, size_t VectorCount, M3D_MATRIX M) noexcept {
auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream); auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream); auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
@ -1593,17 +1593,17 @@ inline M3D_VECTOR M3D_V3TransformNDCToViewport(M3D_VECTOR V, float vpX, float vp
/* -------------------------------------------------------------------------------------------------------------------------- */ /* -------------------------------------------------------------------------------------------------------------------------- */
// Look-at (LH variant): derives the view direction as focusPos - viewPos and delegates
// to M3D_TransformMatrixCamLookToLH with the same position and up direction.
inline M3D_MATRIX M3D_TransformMatrixCamLookAtLH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookAtLH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
M3D_VECTOR dir = M3D_V4Subtract(focusPos, viewPos); M3D_VECTOR dir = M3D_V4Subtract(focusPos, viewPos);
return M3D_TransformMatrixCamLookToLH(viewPos, dir, upDirection); return M3D_TransformMatrixCamLookToLH(viewPos, dir, upDirection);
} }
// Look-at (RH variant): computes viewPos - focusPos, i.e. the opposite of the direction
// used by the LH variant, and reuses M3D_TransformMatrixCamLookToLH with that vector.
inline M3D_MATRIX M3D_TransformMatrixCamLookAtRH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookAtRH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
M3D_VECTOR dir_n = M3D_V4Subtract(viewPos, focusPos); M3D_VECTOR dir_n = M3D_V4Subtract(viewPos, focusPos);
return M3D_TransformMatrixCamLookToLH(viewPos, dir_n, upDirection); return M3D_TransformMatrixCamLookToLH(viewPos, dir_n, upDirection);
} }
inline M3D_MATRIX M3D_TransformMatrixCamLookToLH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookToLH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
// Keep viewer's axes orthogonal to each other and of unit length // Keep viewer's axes orthogonal to each other and of unit length
M3D_VECTOR look_normal = M3D_V3Normalize(viewDirection); M3D_VECTOR look_normal = M3D_V3Normalize(viewDirection);
M3D_VECTOR up_norm = M3D_V3Normalize(M3D_V3Cross(upDirection, look_normal)); M3D_VECTOR up_norm = M3D_V3Normalize(M3D_V3Cross(upDirection, look_normal));
@ -1628,12 +1628,12 @@ inline M3D_MATRIX M3D_TransformMatrixCamLookToLH(M3D_VECTOR viewPos, M3D_VECTOR
return ret; return ret;
} }
// Look-to (RH variant): negates viewDirection with M3D_V4Negate and reuses
// M3D_TransformMatrixCamLookToLH with the negated direction.
inline M3D_MATRIX M3D_TransformMatrixCamLookToRH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookToRH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
M3D_VECTOR viewDirection_n = M3D_V4Negate(viewDirection); M3D_VECTOR viewDirection_n = M3D_V4Negate(viewDirection);
return M3D_TransformMatrixCamLookToLH(viewPos, viewDirection_n, upDirection); return M3D_TransformMatrixCamLookToLH(viewPos, viewDirection_n, upDirection);
} }
inline M3D_MATRIX M3D_TransformMatrixFrustrumFovLH(float fov, float ratio, float near, float far) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixFrustrumFovLH(float fov, float ratio, float near, float far) noexcept {
float SinFov; float SinFov;
float CosFov; float CosFov;
M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov); M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov);
@ -1691,7 +1691,7 @@ inline M3D_MATRIX M3D_TransformMatrixFrustrumFovLH(float fov, float ratio, float
#endif #endif
} }
inline M3D_MATRIX M3D_TransformMatrixFrustrumFovRH(float fov, float ratio, float near, float far) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixFrustrumFovRH(float fov, float ratio, float near, float far) noexcept {
float SinFov; float SinFov;
float CosFov; float CosFov;
M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov); M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov);
@ -1749,7 +1749,7 @@ inline M3D_MATRIX M3D_TransformMatrixFrustrumFovRH(float fov, float ratio, float
#endif #endif
} }
inline M3D_MATRIX M3D_TransformMatrixTranslate(M3D_VECTOR Offset) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixTranslate(M3D_VECTOR Offset) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_MATRIX ret; M3D_MATRIX ret;
ret.mat[0][0] = 1.0f; ret.mat[0][0] = 1.0f;
@ -1782,7 +1782,7 @@ inline M3D_MATRIX M3D_TransformMatrixTranslate(M3D_VECTOR Offset) noexcept {
#endif #endif
} }
inline M3D_MATRIX M3D_TransformMatrixScale(float ScaleX, float ScaleY, float ScaleZ) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixScale(float ScaleX, float ScaleY, float ScaleZ) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_MATRIX ret; M3D_MATRIX ret;
ret.mat[0][0] = ScaleX; ret.mat[0][0] = ScaleX;
@ -1815,7 +1815,7 @@ inline M3D_MATRIX M3D_TransformMatrixScale(float ScaleX, float ScaleY, float Sca
#endif #endif
} }
inline M3D_MATRIX M3D_TransformMatrixScale(M3D_VECTOR Scale) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixScale(M3D_VECTOR Scale) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_MATRIX ret; M3D_MATRIX ret;
ret.mat[0][0] = Scale.v4f[0]; ret.mat[0][0] = Scale.v4f[0];
@ -1848,7 +1848,7 @@ inline M3D_MATRIX M3D_TransformMatrixScale(M3D_VECTOR Scale) noexcept {
#endif #endif
} }
inline M3D_MATRIX M3D_TransformMatrixTranslate(float OffsetX, float OffsetY, float OffsetZ) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixTranslate(float OffsetX, float OffsetY, float OffsetZ) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_MATRIX ret; M3D_MATRIX ret;
ret.mat[0][0] = 1.0f; ret.mat[0][0] = 1.0f;
@ -1881,7 +1881,7 @@ inline M3D_MATRIX M3D_TransformMatrixTranslate(float OffsetX, float OffsetY, flo
#endif #endif
} }
inline M3D_MATRIX M3D_TransformMatrixRotationX(float Angle) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationX(float Angle) noexcept {
float SinAngle; float SinAngle;
float CosAngle; float CosAngle;
M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle); M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);
@ -1926,7 +1926,7 @@ inline M3D_MATRIX M3D_TransformMatrixRotationX(float Angle) noexcept {
#endif #endif
} }
inline M3D_MATRIX M3D_TransformMatrixRotationY(float Angle) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationY(float Angle) noexcept {
float SinAngle; float SinAngle;
float CosAngle; float CosAngle;
M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle); M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);
@ -1971,7 +1971,7 @@ inline M3D_MATRIX M3D_TransformMatrixRotationY(float Angle) noexcept {
#endif #endif
} }
inline M3D_MATRIX M3D_TransformMatrixRotationZ(float Angle) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationZ(float Angle) noexcept {
float SinAngle; float SinAngle;
float CosAngle; float CosAngle;
M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle); M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);
@ -2016,7 +2016,7 @@ inline M3D_MATRIX M3D_TransformMatrixRotationZ(float Angle) noexcept {
#endif #endif
} }
inline M3D_MATRIX M3D_TransformMatrixRotation(M3D_VECTOR Angles) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotation(M3D_VECTOR Angles) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
float cp = cosf(Angles.v4f[0]); float cp = cosf(Angles.v4f[0]);
float sp = sinf(Angles.v4f[0]); float sp = sinf(Angles.v4f[0]);
@ -2082,7 +2082,7 @@ inline M3D_MATRIX M3D_TransformMatrixRotation(M3D_VECTOR Angles) noexcept {
#endif #endif
} }
inline M3D_MATRIX M3D_TransformMatrixRotationNormal(M3D_VECTOR NormalAxis, float Angle) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationNormal(M3D_VECTOR NormalAxis, float Angle) noexcept {
float fSinAngle; float fSinAngle;
float fCosAngle; float fCosAngle;
M3D_ScalarSinCos(&fSinAngle, &fCosAngle, Angle); M3D_ScalarSinCos(&fSinAngle, &fCosAngle, Angle);
@ -2159,14 +2159,14 @@ inline M3D_MATRIX M3D_TransformMatrixRotationNormal(M3D_VECTOR NormalAxis, float
#endif #endif
} }
// Rotation about an arbitrary axis: normalizes the axis with M3D_V3Normalize, then
// delegates to M3D_TransformMatrixRotationNormal with the normalized axis and the angle.
inline M3D_MATRIX M3D_TransformMatrixRotationAxis(M3D_VECTOR axis, float angle) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationAxis(M3D_VECTOR axis, float angle) noexcept {
M3D_VECTOR nv = M3D_V3Normalize(axis); M3D_VECTOR nv = M3D_V3Normalize(axis);
return M3D_TransformMatrixRotationNormal(nv, angle); return M3D_TransformMatrixRotationNormal(nv, angle);
} }
//TODO: transform matrix is incomplete //TODO: transform matrix is incomplete
//v_tri[v_cnt].position.z = ((far+near)/2)+((far-near)/2)*_2dCoord.z; //v_tri[v_cnt].position.z = ((far+near)/2)+((far-near)/2)*_2dCoord.z;
inline M3D_MATRIX M3D_TransformMatrixViewport(float _w, float _h, float _wOffset, float _hOffset) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixViewport(float _w, float _h, float _wOffset, float _hOffset) noexcept {
const float widthDiv2 = _w / 2; const float widthDiv2 = _w / 2;
const float heightDiv2 = _h / 2; const float heightDiv2 = _h / 2;

View File

@ -175,7 +175,7 @@ inline void M3D_V4StoreF4A(M3D_F4A* dst, M3D_VECTOR V) noexcept {
#endif #endif
} }
inline M3D_MATRIX M3D_V4LoadF4x4(const M3D_F4X4* src) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_V4LoadF4x4(const M3D_F4X4* src) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_MATRIX ret; M3D_MATRIX ret;
ret.rows[0].v4f[0] = src->mat[0][0]; ret.rows[0].v4f[0] = src->mat[0][0];
@ -208,7 +208,7 @@ inline M3D_MATRIX M3D_V4LoadF4x4(const M3D_F4X4* src) noexcept {
#endif #endif
} }
inline M3D_MATRIX M3D_V4LoadF4x4A(const M3D_F4X4A* src) noexcept { INLINE_AVX_FIX M3D_MATRIX M3D_V4LoadF4x4A(const M3D_F4X4A* src) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_MATRIX ret; M3D_MATRIX ret;
ret.rows[0].v4f[0] = src->mat[0][0]; ret.rows[0].v4f[0] = src->mat[0][0];
@ -241,7 +241,7 @@ inline M3D_MATRIX M3D_V4LoadF4x4A(const M3D_F4X4A* src) noexcept {
#endif #endif
} }
inline void M3D_V4StoreF4x4(M3D_F4X4* dst, M3D_MATRIX M) noexcept { INLINE_AVX_FIX void M3D_V4StoreF4x4(M3D_F4X4* dst, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
dst->mat[0][0] = M.rows[0].v4f[0]; dst->mat[0][0] = M.rows[0].v4f[0];
dst->mat[0][1] = M.rows[0].v4f[1]; dst->mat[0][1] = M.rows[0].v4f[1];
@ -270,7 +270,7 @@ inline void M3D_V4StoreF4x4(M3D_F4X4* dst, M3D_MATRIX M) noexcept {
#endif #endif
} }
inline void M3D_V4StoreF4x4A(M3D_F4X4A* dst, M3D_MATRIX M) noexcept { INLINE_AVX_FIX void M3D_V4StoreF4x4A(M3D_F4X4A* dst, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
dst->mat[0][0] = M.rows[0].v4f[0]; dst->mat[0][0] = M.rows[0].v4f[0];
dst->mat[0][1] = M.rows[0].v4f[1]; dst->mat[0][1] = M.rows[0].v4f[1];
@ -548,6 +548,34 @@ inline M3D_VECTOR M3D_V4Permute(M3D_VECTOR V1, M3D_VECTOR V2, uint32_t PermuteX,
#endif #endif
} }
// Runtime swizzle: returns a vector whose components 0..3 are copied from components
// E0, E1, E2 and E3 of V respectively. Three implementations are selected at compile time.
inline M3D_VECTOR M3D_V4Swizzle(M3D_VECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3) noexcept {
#ifdef DISABLE_INTRINSICS
    // Scalar build: index the float components directly.
    M3D_V4F32 Picked = {{{ V.v4f[E0], V.v4f[E1], V.v4f[E2], V.v4f[E3] }}};
    return Picked.v;
#elif defined(AVX_INTRINSICS)
    // AVX build: load the four selectors into an integer register and let
    // _mm_permutevar_ps perform the variable permute.
    unsigned int selectors[4] = { E0, E1, E2, E3 };
    __m128i vSelectors = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&selectors[0]));
    return _mm_permutevar_ps(V, vSelectors);
#else
    // Remaining builds: copy the selected 32-bit components one at a time through
    // uint32_t views of the source and destination vectors.
    const uint32_t selectors[4] = { E0, E1, E2, E3 };
    auto srcWords = reinterpret_cast<const uint32_t*>(&V);
    M3D_VECTOR Shuffled;
    auto dstWords = reinterpret_cast<uint32_t*>(&Shuffled);
    for (uint32_t component = 0; component < 4; ++component) {
        dstWords[component] = srcWords[selectors[component]];
    }
    return Shuffled;
#endif
}
inline M3D_VECTOR M3D_V4SplatOne() noexcept { inline M3D_VECTOR M3D_V4SplatOne() noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_V4F32 vResult; M3D_V4F32 vResult;
@ -816,6 +844,10 @@ inline M3D_VECTOR M3D_V4Dot(M3D_VECTOR V1, M3D_VECTOR V2) noexcept {
#endif #endif
} }
// Squared length of V, computed as the dot product of V with itself.
// The result is returned exactly as M3D_V4Dot produces it.
inline M3D_VECTOR M3D_V4LengthSq(M3D_VECTOR V) noexcept {
    const M3D_VECTOR SquaredLength = M3D_V4Dot(V, V);
    return SquaredLength;
}
inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept { inline M3D_VECTOR M3D_V4Length(M3D_VECTOR V) noexcept {
#ifdef DISABLE_INTRINSICS #ifdef DISABLE_INTRINSICS
M3D_VECTOR Result; M3D_VECTOR Result;