#pragma once inline M3D_MATRIX::M3D_MATRIX( float f00, float f01, float f02, float f03, float f10, float f11, float f12, float f13, float f20, float f21, float f22, float f23, float f30, float f31, float f32, float f33 ) noexcept { rows[0] = M3D_V4Set(f00, f01, f02, f03); rows[1] = M3D_V4Set(f10, f11, f12, f13); rows[2] = M3D_V4Set(f20, f21, f22, f23); rows[3] = M3D_V4Set(f30, f31, f32, f33); } INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator- () const noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_V4Negate(rows[0]); ret.rows[1] = M3D_V4Negate(rows[1]); ret.rows[2] = M3D_V4Negate(rows[2]); ret.rows[3] = M3D_V4Negate(rows[3]); return ret; } INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator+= (M3D_MATRIX M) noexcept { rows[0] = M3D_V4Add(rows[0], M.rows[0]); rows[1] = M3D_V4Add(rows[1], M.rows[1]); rows[2] = M3D_V4Add(rows[2], M.rows[2]); rows[3] = M3D_V4Add(rows[3], M.rows[3]); return *this; } INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator+ (M3D_MATRIX M) const noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_V4Add(rows[0], M.rows[0]); ret.rows[1] = M3D_V4Add(rows[1], M.rows[1]); ret.rows[2] = M3D_V4Add(rows[2], M.rows[2]); ret.rows[3] = M3D_V4Add(rows[3], M.rows[3]); return ret; } INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator-= (M3D_MATRIX M) noexcept { rows[0] = M3D_V4Subtract(rows[0], M.rows[0]); rows[1] = M3D_V4Subtract(rows[1], M.rows[1]); rows[2] = M3D_V4Subtract(rows[2], M.rows[2]); rows[3] = M3D_V4Subtract(rows[3], M.rows[3]); return *this; } INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator- (M3D_MATRIX M) const noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_V4Subtract(rows[0], M.rows[0]); ret.rows[1] = M3D_V4Subtract(rows[1], M.rows[1]); ret.rows[2] = M3D_V4Subtract(rows[2], M.rows[2]); ret.rows[3] = M3D_V4Subtract(rows[3], M.rows[3]); return ret; } INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator*=(M3D_MATRIX M) noexcept { *this = M3D_MMultiply(*this, M); return *this; } INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator*(M3D_MATRIX M) const noexcept { return 
M3D_MMultiply(*this, M); } INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator*= (float S) noexcept { rows[0] = M3D_V4Scale(rows[0], S); rows[1] = M3D_V4Scale(rows[1], S); rows[2] = M3D_V4Scale(rows[2], S); rows[3] = M3D_V4Scale(rows[3], S); return *this; } INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator* (float S) const noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_V4Scale(rows[0], S); ret.rows[1] = M3D_V4Scale(rows[1], S); ret.rows[2] = M3D_V4Scale(rows[2], S); ret.rows[3] = M3D_V4Scale(rows[3], S); return ret; } INLINE_AVX_FIX M3D_MATRIX operator* (float S, M3D_MATRIX M) noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_V4Scale(M.rows[0], S); ret.rows[1] = M3D_V4Scale(M.rows[1], S); ret.rows[2] = M3D_V4Scale(M.rows[2], S); ret.rows[3] = M3D_V4Scale(M.rows[3], S); return ret; } INLINE_AVX_FIX M3D_MATRIX& M3D_MATRIX::operator/= (float S) noexcept { #ifdef DISABLE_INTRINSICS M3D_VECTOR vS = M3D_V4Replicate(S); rows[0] = M3D_V4Divide(rows[0], vS); rows[1] = M3D_V4Divide(rows[1], vS); rows[2] = M3D_V4Divide(rows[2], vS); rows[3] = M3D_V4Divide(rows[3], vS); return *this; #else __m128 vS = _mm_set_ps1(S); rows[0] = _mm_div_ps(rows[0], vS); rows[1] = _mm_div_ps(rows[1], vS); rows[2] = _mm_div_ps(rows[2], vS); rows[3] = _mm_div_ps(rows[3], vS); return *this; #endif } INLINE_AVX_FIX M3D_MATRIX M3D_MATRIX::operator/ (float S) const noexcept { #ifdef DISABLE_INTRINSICS M3D_VECTOR vS = M3D_V4Replicate(S); M3D_MATRIX ret; ret.rows[0] = M3D_V4Divide(rows[0], vS); ret.rows[1] = M3D_V4Divide(rows[1], vS); ret.rows[2] = M3D_V4Divide(rows[2], vS); ret.rows[3] = M3D_V4Divide(rows[3], vS); return ret; #else __m128 vS = _mm_set_ps1(S); M3D_MATRIX ret; ret.rows[0] = _mm_div_ps(rows[0], vS); ret.rows[1] = _mm_div_ps(rows[1], vS); ret.rows[2] = _mm_div_ps(rows[2], vS); ret.rows[3] = _mm_div_ps(rows[3], vS); return ret; #endif } /* -------------------------------------------------------------------------------------------------------------------------- */ inline M3D_MATRIX M3D_MIdentity() noexcept 
{ M3D_MATRIX ret; ret.rows[0] = M3D_MIdentityR0.v; ret.rows[1] = M3D_MIdentityR1.v; ret.rows[2] = M3D_MIdentityR2.v; ret.rows[3] = M3D_MIdentityR3.v; return ret; } INLINE_AVX_FIX M3D_MATRIX M3D_MMultiply(M3D_MATRIX M1, M3D_MATRIX& M2) noexcept { #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; // Cache the invariants in registers float x = M1.mat[0][0]; float y = M1.mat[0][1]; float z = M1.mat[0][2]; float w = M1.mat[0][3]; // Perform the operation on the first row ret.mat[0][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w); ret.mat[0][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w); ret.mat[0][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w); ret.mat[0][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w); // Repeat for all the other rows x = M1.mat[1][0]; y = M1.mat[1][1]; z = M1.mat[1][2]; w = M1.mat[1][3]; ret.mat[1][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w); ret.mat[1][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w); ret.mat[1][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w); ret.mat[1][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w); x = M1.mat[2][0]; y = M1.mat[2][1]; z = M1.mat[2][2]; w = M1.mat[2][3]; ret.mat[2][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w); ret.mat[2][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w); ret.mat[2][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w); ret.mat[2][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w); x = M1.mat[3][0]; y = M1.mat[3][1]; z = M1.mat[3][2]; w = M1.mat[3][3]; ret.mat[3][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w); 
ret.mat[3][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w); ret.mat[3][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w); ret.mat[3][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w); return ret; #elif defined(AVX2_INTRINSICS) __m256 t0 = _mm256_castps128_ps256(M1.rows[0]); t0 = _mm256_insertf128_ps(t0, M1.rows[1], 1); __m256 t1 = _mm256_castps128_ps256(M1.rows[2]); t1 = _mm256_insertf128_ps(t1, M1.rows[3], 1); __m256 u0 = _mm256_castps128_ps256(M2.rows[0]); u0 = _mm256_insertf128_ps(u0, M2.rows[1], 1); __m256 u1 = _mm256_castps128_ps256(M2.rows[2]); u1 = _mm256_insertf128_ps(u1, M2.rows[3], 1); __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0)); __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0)); __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00); __m256 c0 = _mm256_mul_ps(a0, b0); __m256 c1 = _mm256_mul_ps(a1, b0); a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1)); a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1)); b0 = _mm256_permute2f128_ps(u0, u0, 0x11); __m256 c2 = _mm256_fmadd_ps(a0, b0, c0); __m256 c3 = _mm256_fmadd_ps(a1, b0, c1); a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2)); a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2)); __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00); __m256 c4 = _mm256_mul_ps(a0, b1); __m256 c5 = _mm256_mul_ps(a1, b1); a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3)); a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3)); b1 = _mm256_permute2f128_ps(u1, u1, 0x11); __m256 c6 = _mm256_fmadd_ps(a0, b1, c4); __m256 c7 = _mm256_fmadd_ps(a1, b1, c5); t0 = _mm256_add_ps(c2, c6); t1 = _mm256_add_ps(c3, c7); M3D_MATRIX ret; ret.rows[0] = _mm256_castps256_ps128(t0); ret.rows[1] = _mm256_extractf128_ps(t0, 1); ret.rows[2] = _mm256_castps256_ps128(t1); ret.rows[3] = _mm256_extractf128_ps(t1, 1); return ret; #else M3D_MATRIX ret; // Splat the component X,Y,Z then W #ifdef 
AVX_INTRINSICS M3D_VECTOR vX = _mm_broadcast_ss(reinterpret_cast(&M1.rows[0]) + 0); M3D_VECTOR vY = _mm_broadcast_ss(reinterpret_cast(&M1.rows[0]) + 1); M3D_VECTOR vZ = _mm_broadcast_ss(reinterpret_cast(&M1.rows[0]) + 2); M3D_VECTOR vW = _mm_broadcast_ss(reinterpret_cast(&M1.rows[0]) + 3); #else // Use vW to hold the original row M3D_VECTOR vW = M1.rows[0]; M3D_VECTOR vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); #endif // Perform the operation on the first row vX = _mm_mul_ps(vX, M2.rows[0]); vY = _mm_mul_ps(vY, M2.rows[1]); vZ = _mm_mul_ps(vZ, M2.rows[2]); vW = _mm_mul_ps(vW, M2.rows[3]); // Perform a binary add to reduce cumulative errors vX = _mm_add_ps(vX, vZ); vY = _mm_add_ps(vY, vW); vX = _mm_add_ps(vX, vY); ret.rows[0] = vX; // Repeat for the other 3 rows #ifdef AVX_INTRINSICS vX = _mm_broadcast_ss(reinterpret_cast(&M1.rows[1]) + 0); vY = _mm_broadcast_ss(reinterpret_cast(&M1.rows[1]) + 1); vZ = _mm_broadcast_ss(reinterpret_cast(&M1.rows[1]) + 2); vW = _mm_broadcast_ss(reinterpret_cast(&M1.rows[1]) + 3); #else vW = M1.rows[1]; vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); #endif vX = _mm_mul_ps(vX, M2.rows[0]); vY = _mm_mul_ps(vY, M2.rows[1]); vZ = _mm_mul_ps(vZ, M2.rows[2]); vW = _mm_mul_ps(vW, M2.rows[3]); vX = _mm_add_ps(vX, vZ); vY = _mm_add_ps(vY, vW); vX = _mm_add_ps(vX, vY); ret.rows[1] = vX; #ifdef AVX_INTRINSICS vX = _mm_broadcast_ss(reinterpret_cast(&M1.rows[2]) + 0); vY = _mm_broadcast_ss(reinterpret_cast(&M1.rows[2]) + 1); vZ = _mm_broadcast_ss(reinterpret_cast(&M1.rows[2]) + 2); vW = _mm_broadcast_ss(reinterpret_cast(&M1.rows[2]) + 3); #else vW = M1.rows[2]; vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); vY = 
M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); #endif vX = _mm_mul_ps(vX, M2.rows[0]); vY = _mm_mul_ps(vY, M2.rows[1]); vZ = _mm_mul_ps(vZ, M2.rows[2]); vW = _mm_mul_ps(vW, M2.rows[3]); vX = _mm_add_ps(vX, vZ); vY = _mm_add_ps(vY, vW); vX = _mm_add_ps(vX, vY); ret.rows[2] = vX; #ifdef AVX_INTRINSICS vX = _mm_broadcast_ss(reinterpret_cast(&M1.rows[3]) + 0); vY = _mm_broadcast_ss(reinterpret_cast(&M1.rows[3]) + 1); vZ = _mm_broadcast_ss(reinterpret_cast(&M1.rows[3]) + 2); vW = _mm_broadcast_ss(reinterpret_cast(&M1.rows[3]) + 3); #else vW = M1.rows[3]; vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); #endif vX = _mm_mul_ps(vX, M2.rows[0]); vY = _mm_mul_ps(vY, M2.rows[1]); vZ = _mm_mul_ps(vZ, M2.rows[2]); vW = _mm_mul_ps(vW, M2.rows[3]); vX = _mm_add_ps(vX, vZ); vY = _mm_add_ps(vY, vW); vX = _mm_add_ps(vX, vY); ret.rows[3] = vX; return ret; #endif } INLINE_AVX_FIX M3D_MATRIX M3D_MTranspose(M3D_MATRIX M) noexcept { #ifdef DISABLE_INTRINSICS // Original matrix: // // m00m01m02m03 // m10m11m12m13 // m20m21m22m23 // m30m31m32m33 M3D_MATRIX P; P.rows[0] = M3D_V4MergeXY(M.rows[0], M.rows[2]); // m00m20m01m21 P.rows[1] = M3D_V4MergeXY(M.rows[1], M.rows[3]); // m10m30m11m31 P.rows[2] = M3D_V4MergeZW(M.rows[0], M.rows[2]); // m02m22m03m23 P.rows[3] = M3D_V4MergeZW(M.rows[1], M.rows[3]); // m12m32m13m33 M3D_MATRIX MT; MT.rows[0] = M3D_V4MergeXY(P.rows[0], P.rows[1]); // m00m10m20m30 MT.rows[1] = M3D_V4MergeZW(P.rows[0], P.rows[1]); // m01m11m21m31 MT.rows[2] = M3D_V4MergeXY(P.rows[2], P.rows[3]); // m02m12m22m32 MT.rows[3] = M3D_V4MergeZW(P.rows[2], P.rows[3]); // m03m13m23m33 return MT; #elif defined(AVX2_INTRINSICS) __m256 t0 = _mm256_castps128_ps256(M.rows[0]); t0 = _mm256_insertf128_ps(t0, M.rows[1], 1); __m256 t1 = 
_mm256_castps128_ps256(M.rows[2]); t1 = _mm256_insertf128_ps(t1, M.rows[3], 1); __m256 vTemp = _mm256_unpacklo_ps(t0, t1); __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1); __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4); vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4); t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); M3D_MATRIX ret; ret.rows[0] = _mm256_castps256_ps128(t0); ret.rows[1] = _mm256_extractf128_ps(t0, 1); ret.rows[2] = _mm256_castps256_ps128(t1); ret.rows[3] = _mm256_extractf128_ps(t1, 1); return ret; #else // x.x,x.y,y.x,y.y M3D_VECTOR vTemp1 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(1, 0, 1, 0)); // x.z,x.w,y.z,y.w M3D_VECTOR vTemp3 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(3, 2, 3, 2)); // z.x,z.y,w.x,w.y M3D_VECTOR vTemp2 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(1, 0, 1, 0)); // z.z,z.w,w.z,w.w M3D_VECTOR vTemp4 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(3, 2, 3, 2)); M3D_MATRIX ret; // x.x,y.x,z.x,w.x ret.rows[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); // x.y,y.y,z.y,w.y ret.rows[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); // x.z,y.z,z.z,w.z ret.rows[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); // x.w,y.w,z.w,w.w ret.rows[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); return ret; #endif } INLINE_AVX_FIX M3D_MATRIX M3D_MInverse(M3D_MATRIX M) noexcept { #ifdef DISABLE_INTRINSICS M3D_MATRIX MT = M3D_MTranspose(M); M3D_VECTOR V0[4], V1[4]; V0[0] = M3D_V4Swizzle(MT.rows[2]); V1[0] = M3D_V4Swizzle(MT.rows[3]); V0[1] = M3D_V4Swizzle(MT.rows[0]); V1[1] = M3D_V4Swizzle(MT.rows[1]); V0[2] = M3D_V4Permute(MT.rows[2], MT.rows[0]); V1[2] = M3D_V4Permute(MT.rows[3], MT.rows[1]); M3D_VECTOR D0 = M3D_V4Multiply(V0[0], V1[0]); M3D_VECTOR D1 = M3D_V4Multiply(V0[1], V1[1]); M3D_VECTOR D2 = 
M3D_V4Multiply(V0[2], V1[2]); V0[0] = M3D_V4Swizzle(MT.rows[2]); V1[0] = M3D_V4Swizzle(MT.rows[3]); V0[1] = M3D_V4Swizzle(MT.rows[0]); V1[1] = M3D_V4Swizzle(MT.rows[1]); V0[2] = M3D_V4Permute(MT.rows[2], MT.rows[0]); V1[2] = M3D_V4Permute(MT.rows[3], MT.rows[1]); D0 = M3D_V4NegativeMultiplySubtract(V0[0], V1[0], D0); D1 = M3D_V4NegativeMultiplySubtract(V0[1], V1[1], D1); D2 = M3D_V4NegativeMultiplySubtract(V0[2], V1[2], D2); V0[0] = M3D_V4Swizzle(MT.rows[1]); V1[0] = M3D_V4Permute(D0, D2); V0[1] = M3D_V4Swizzle(MT.rows[0]); V1[1] = M3D_V4Permute(D0, D2); V0[2] = M3D_V4Swizzle(MT.rows[3]); V1[2] = M3D_V4Permute(D1, D2); V0[3] = M3D_V4Swizzle(MT.rows[2]); V1[3] = M3D_V4Permute(D1, D2); M3D_VECTOR C0 = M3D_V4Multiply(V0[0], V1[0]); M3D_VECTOR C2 = M3D_V4Multiply(V0[1], V1[1]); M3D_VECTOR C4 = M3D_V4Multiply(V0[2], V1[2]); M3D_VECTOR C6 = M3D_V4Multiply(V0[3], V1[3]); V0[0] = M3D_V4Swizzle(MT.rows[1]); V1[0] = M3D_V4Permute(D0, D2); V0[1] = M3D_V4Swizzle(MT.rows[0]); V1[1] = M3D_V4Permute(D0, D2); V0[2] = M3D_V4Swizzle(MT.rows[3]); V1[2] = M3D_V4Permute(D1, D2); V0[3] = M3D_V4Swizzle(MT.rows[2]); V1[3] = M3D_V4Permute(D1, D2); C0 = M3D_V4NegativeMultiplySubtract(V0[0], V1[0], C0); C2 = M3D_V4NegativeMultiplySubtract(V0[1], V1[1], C2); C4 = M3D_V4NegativeMultiplySubtract(V0[2], V1[2], C4); C6 = M3D_V4NegativeMultiplySubtract(V0[3], V1[3], C6); V0[0] = M3D_V4Swizzle(MT.rows[1]); V1[0] = M3D_V4Permute(D0, D2); V0[1] = M3D_V4Swizzle(MT.rows[0]); V1[1] = M3D_V4Permute(D0, D2); V0[2] = M3D_V4Swizzle(MT.rows[3]); V1[2] = M3D_V4Permute(D1, D2); V0[3] = M3D_V4Swizzle(MT.rows[2]); V1[3] = M3D_V4Permute(D1, D2); M3D_VECTOR C1 = M3D_V4NegativeMultiplySubtract(V0[0], V1[0], C0); C0 = M3D_V4MultiplyAdd(V0[0], V1[0], C0); M3D_VECTOR C3 = M3D_V4MultiplyAdd(V0[1], V1[1], C2); C2 = M3D_V4NegativeMultiplySubtract(V0[1], V1[1], C2); M3D_VECTOR C5 = M3D_V4NegativeMultiplySubtract(V0[2], V1[2], C4); C4 = M3D_V4MultiplyAdd(V0[2], V1[2], C4); M3D_VECTOR C7 = M3D_V4MultiplyAdd(V0[3], V1[3], 
C6); C6 = M3D_V4NegativeMultiplySubtract(V0[3], V1[3], C6); M3D_MATRIX R; R.rows[0] = M3D_V4Select(C0, C1, M3D_MSelect0101.v); R.rows[1] = M3D_V4Select(C2, C3, M3D_MSelect0101.v); R.rows[2] = M3D_V4Select(C4, C5, M3D_MSelect0101.v); R.rows[3] = M3D_V4Select(C6, C7, M3D_MSelect0101.v); M3D_VECTOR Determinant = M3D_V4Dot(R.rows[0], MT.rows[0]); //if (pDeterminant != nullptr) // *pDeterminant = Determinant; M3D_VECTOR Reciprocal = M3D_V4Reciprocal(Determinant); M3D_MATRIX Result; Result.rows[0] = M3D_V4Multiply(R.rows[0], Reciprocal); Result.rows[1] = M3D_V4Multiply(R.rows[1], Reciprocal); Result.rows[2] = M3D_V4Multiply(R.rows[2], Reciprocal); Result.rows[3] = M3D_V4Multiply(R.rows[3], Reciprocal); return Result; #else // Transpose matrix M3D_VECTOR vTemp1 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(1, 0, 1, 0)); M3D_VECTOR vTemp3 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(3, 2, 3, 2)); M3D_VECTOR vTemp2 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(1, 0, 1, 0)); M3D_VECTOR vTemp4 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(3, 2, 3, 2)); M3D_MATRIX MT; MT.rows[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); MT.rows[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); MT.rows[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); MT.rows[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); M3D_VECTOR V00 = M3D_PERMUTE_PS(MT.rows[2], _MM_SHUFFLE(1, 1, 0, 0)); M3D_VECTOR V10 = M3D_PERMUTE_PS(MT.rows[3], _MM_SHUFFLE(3, 2, 3, 2)); M3D_VECTOR V01 = M3D_PERMUTE_PS(MT.rows[0], _MM_SHUFFLE(1, 1, 0, 0)); M3D_VECTOR V11 = M3D_PERMUTE_PS(MT.rows[1], _MM_SHUFFLE(3, 2, 3, 2)); M3D_VECTOR V02 = _mm_shuffle_ps(MT.rows[2], MT.rows[0], _MM_SHUFFLE(2, 0, 2, 0)); M3D_VECTOR V12 = _mm_shuffle_ps(MT.rows[3], MT.rows[1], _MM_SHUFFLE(3, 1, 3, 1)); M3D_VECTOR D0 = _mm_mul_ps(V00, V10); M3D_VECTOR D1 = _mm_mul_ps(V01, V11); M3D_VECTOR D2 = _mm_mul_ps(V02, V12); V00 = M3D_PERMUTE_PS(MT.rows[2], _MM_SHUFFLE(3, 2, 3, 2)); V10 = 
M3D_PERMUTE_PS(MT.rows[3], _MM_SHUFFLE(1, 1, 0, 0)); V01 = M3D_PERMUTE_PS(MT.rows[0], _MM_SHUFFLE(3, 2, 3, 2)); V11 = M3D_PERMUTE_PS(MT.rows[1], _MM_SHUFFLE(1, 1, 0, 0)); V02 = _mm_shuffle_ps(MT.rows[2], MT.rows[0], _MM_SHUFFLE(3, 1, 3, 1)); V12 = _mm_shuffle_ps(MT.rows[3], MT.rows[1], _MM_SHUFFLE(2, 0, 2, 0)); D0 = M3D_FNMADD_PS(V00, V10, D0); D1 = M3D_FNMADD_PS(V01, V11, D1); D2 = M3D_FNMADD_PS(V02, V12, D2); // V11 = D0Y,D0W,D2Y,D2Y V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 1, 3, 1)); V00 = M3D_PERMUTE_PS(MT.rows[1], _MM_SHUFFLE(1, 0, 2, 1)); V10 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(0, 3, 0, 2)); V01 = M3D_PERMUTE_PS(MT.rows[0], _MM_SHUFFLE(0, 1, 0, 2)); V11 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(2, 1, 2, 1)); // V13 = D1Y,D1W,D2W,D2W M3D_VECTOR V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 3, 3, 1)); V02 = M3D_PERMUTE_PS(MT.rows[3], _MM_SHUFFLE(1, 0, 2, 1)); V12 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(0, 3, 0, 2)); M3D_VECTOR V03 = M3D_PERMUTE_PS(MT.rows[2], _MM_SHUFFLE(0, 1, 0, 2)); V13 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(2, 1, 2, 1)); M3D_VECTOR C0 = _mm_mul_ps(V00, V10); M3D_VECTOR C2 = _mm_mul_ps(V01, V11); M3D_VECTOR C4 = _mm_mul_ps(V02, V12); M3D_VECTOR C6 = _mm_mul_ps(V03, V13); // V11 = D0X,D0Y,D2X,D2X V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(0, 0, 1, 0)); V00 = M3D_PERMUTE_PS(MT.rows[1], _MM_SHUFFLE(2, 1, 3, 2)); V10 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(2, 1, 0, 3)); V01 = M3D_PERMUTE_PS(MT.rows[0], _MM_SHUFFLE(1, 3, 2, 3)); V11 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(0, 2, 1, 2)); // V13 = D1X,D1Y,D2Z,D2Z V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(2, 2, 1, 0)); V02 = M3D_PERMUTE_PS(MT.rows[3], _MM_SHUFFLE(2, 1, 3, 2)); V12 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(2, 1, 0, 3)); V03 = M3D_PERMUTE_PS(MT.rows[2], _MM_SHUFFLE(1, 3, 2, 3)); V13 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(0, 2, 1, 2)); C0 = M3D_FNMADD_PS(V00, V10, C0); C2 = M3D_FNMADD_PS(V01, V11, C2); C4 = M3D_FNMADD_PS(V02, V12, C4); C6 = M3D_FNMADD_PS(V03, V13, C6); V00 = M3D_PERMUTE_PS(MT.rows[1], 
_MM_SHUFFLE(0, 3, 0, 3)); // V10 = D0Z,D0Z,D2X,D2Y V10 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 2, 2)); V10 = M3D_PERMUTE_PS(V10, _MM_SHUFFLE(0, 2, 3, 0)); V01 = M3D_PERMUTE_PS(MT.rows[0], _MM_SHUFFLE(2, 0, 3, 1)); // V11 = D0X,D0W,D2X,D2Y V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 3, 0)); V11 = M3D_PERMUTE_PS(V11, _MM_SHUFFLE(2, 1, 0, 3)); V02 = M3D_PERMUTE_PS(MT.rows[3], _MM_SHUFFLE(0, 3, 0, 3)); // V12 = D1Z,D1Z,D2Z,D2W V12 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 2, 2)); V12 = M3D_PERMUTE_PS(V12, _MM_SHUFFLE(0, 2, 3, 0)); V03 = M3D_PERMUTE_PS(MT.rows[2], _MM_SHUFFLE(2, 0, 3, 1)); // V13 = D1X,D1W,D2Z,D2W V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 3, 0)); V13 = M3D_PERMUTE_PS(V13, _MM_SHUFFLE(2, 1, 0, 3)); V00 = _mm_mul_ps(V00, V10); V01 = _mm_mul_ps(V01, V11); V02 = _mm_mul_ps(V02, V12); V03 = _mm_mul_ps(V03, V13); M3D_VECTOR C1 = _mm_sub_ps(C0, V00); C0 = _mm_add_ps(C0, V00); M3D_VECTOR C3 = _mm_add_ps(C2, V01); C2 = _mm_sub_ps(C2, V01); M3D_VECTOR C5 = _mm_sub_ps(C4, V02); C4 = _mm_add_ps(C4, V02); M3D_VECTOR C7 = _mm_add_ps(C6, V03); C6 = _mm_sub_ps(C6, V03); C0 = _mm_shuffle_ps(C0, C1, _MM_SHUFFLE(3, 1, 2, 0)); C2 = _mm_shuffle_ps(C2, C3, _MM_SHUFFLE(3, 1, 2, 0)); C4 = _mm_shuffle_ps(C4, C5, _MM_SHUFFLE(3, 1, 2, 0)); C6 = _mm_shuffle_ps(C6, C7, _MM_SHUFFLE(3, 1, 2, 0)); C0 = M3D_PERMUTE_PS(C0, _MM_SHUFFLE(3, 1, 2, 0)); C2 = M3D_PERMUTE_PS(C2, _MM_SHUFFLE(3, 1, 2, 0)); C4 = M3D_PERMUTE_PS(C4, _MM_SHUFFLE(3, 1, 2, 0)); C6 = M3D_PERMUTE_PS(C6, _MM_SHUFFLE(3, 1, 2, 0)); // Get the determinant M3D_VECTOR vTemp = M3D_V4Dot(C0, MT.rows[0]); //if (pDeterminant != nullptr) // *pDeterminant = vTemp; vTemp = _mm_div_ps(M3D_MOne, vTemp); M3D_MATRIX mResult; mResult.rows[0] = _mm_mul_ps(C0, vTemp); mResult.rows[1] = _mm_mul_ps(C2, vTemp); mResult.rows[2] = _mm_mul_ps(C4, vTemp); mResult.rows[3] = _mm_mul_ps(C6, vTemp); return mResult; #endif } /* 
-------------------------------------------------------------------------------------------------------------------------- */ INLINE_AVX_FIX M3D_VECTOR M3D_QRotationMatrix(M3D_MATRIX M) noexcept { #ifdef DISABLE_INTRINSICS M3D_V4F32 q; float r22 = M.mat[2][2]; if (r22 <= 0.f) { // x^2 + y^2 >= z^2 + w^2 float dif10 = M.mat[1][1] - M.mat[0][0]; float omr22 = 1.f - r22; if (dif10 <= 0.f) { // x^2 >= y^2 float fourXSqr = omr22 - dif10; float inv4x = 0.5f / sqrtf(fourXSqr); q.f[0] = fourXSqr * inv4x; q.f[1] = (M.mat[0][1] + M.mat[1][0]) * inv4x; q.f[2] = (M.mat[0][2] + M.mat[2][0]) * inv4x; q.f[3] = (M.mat[1][2] - M.mat[2][1]) * inv4x; } else { // y^2 >= x^2 float fourYSqr = omr22 + dif10; float inv4y = 0.5f / sqrtf(fourYSqr); q.f[0] = (M.mat[0][1] + M.mat[1][0]) * inv4y; q.f[1] = fourYSqr * inv4y; q.f[2] = (M.mat[1][2] + M.mat[2][1]) * inv4y; q.f[3] = (M.mat[2][0] - M.mat[0][2]) * inv4y; } } else { // z^2 + w^2 >= x^2 + y^2 float sum10 = M.mat[1][1] + M.mat[0][0]; float opr22 = 1.f + r22; if (sum10 <= 0.f) { // z^2 >= w^2 float fourZSqr = opr22 - sum10; float inv4z = 0.5f / sqrtf(fourZSqr); q.f[0] = (M.mat[0][2] + M.mat[2][0]) * inv4z; q.f[1] = (M.mat[1][2] + M.mat[2][1]) * inv4z; q.f[2] = fourZSqr * inv4z; q.f[3] = (M.mat[0][1] - M.mat[1][0]) * inv4z; } else { // w^2 >= z^2 float fourWSqr = opr22 + sum10; float inv4w = 0.5f / sqrtf(fourWSqr); q.f[0] = (M.mat[1][2] - M.mat[2][1]) * inv4w; q.f[1] = (M.mat[2][0] - M.mat[0][2]) * inv4w; q.f[2] = (M.mat[0][1] - M.mat[1][0]) * inv4w; q.f[3] = fourWSqr * inv4w; } } return q.v; #else static const M3D_V4F32 XMPMMP = {{{+1.0f, -1.0f, -1.0f, +1.0f}}}; static const M3D_V4F32 XMMPMP = {{{-1.0f, +1.0f, -1.0f, +1.0f}}}; static const M3D_V4F32 XMMMPP = {{{-1.0f, -1.0f, +1.0f, +1.0f}}}; M3D_VECTOR r0 = M.rows[0]; // (r00, r01, r02, 0) M3D_VECTOR r1 = M.rows[1]; // (r10, r11, r12, 0) M3D_VECTOR r2 = M.rows[2]; // (r20, r21, r22, 0) // (r00, r00, r00, r00) M3D_VECTOR r00 = M3D_PERMUTE_PS(r0, _MM_SHUFFLE(0, 0, 0, 0)); // (r11, r11, r11, 
r11) M3D_VECTOR r11 = M3D_PERMUTE_PS(r1, _MM_SHUFFLE(1, 1, 1, 1)); // (r22, r22, r22, r22) M3D_VECTOR r22 = M3D_PERMUTE_PS(r2, _MM_SHUFFLE(2, 2, 2, 2)); // x^2 >= y^2 equivalent to r11 - r00 <= 0 // (r11 - r00, r11 - r00, r11 - r00, r11 - r00) M3D_VECTOR r11mr00 = _mm_sub_ps(r11, r00); M3D_VECTOR x2gey2 = _mm_cmple_ps(r11mr00, M3D_MZero); // z^2 >= w^2 equivalent to r11 + r00 <= 0 // (r11 + r00, r11 + r00, r11 + r00, r11 + r00) M3D_VECTOR r11pr00 = _mm_add_ps(r11, r00); M3D_VECTOR z2gew2 = _mm_cmple_ps(r11pr00, M3D_MZero); // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 M3D_VECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, M3D_MZero); // (4*x^2, 4*y^2, 4*z^2, 4*w^2) M3D_VECTOR t0 = M3D_FMADD_PS(XMPMMP, r00, M3D_MOne); M3D_VECTOR t1 = _mm_mul_ps(XMMPMP, r11); M3D_VECTOR t2 = M3D_FMADD_PS(XMMMPP, r22, t0); M3D_VECTOR x2y2z2w2 = _mm_add_ps(t1, t2); // (r01, r02, r12, r11) t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 2, 2, 1)); // (r10, r10, r20, r21) t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 0, 0)); // (r10, r20, r21, r10) t1 = M3D_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0)); // (4*x*y, 4*x*z, 4*y*z, unused) M3D_VECTOR xyxzyz = _mm_add_ps(t0, t1); // (r21, r20, r10, r10) t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 1)); // (r12, r12, r02, r01) t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1, 2, 2, 2)); // (r12, r02, r01, r12) t1 = M3D_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0)); // (4*x*w, 4*y*w, 4*z*w, unused) M3D_VECTOR xwywzw = _mm_sub_ps(t0, t1); xwywzw = _mm_mul_ps(XMMPMP, xwywzw); // (4*x^2, 4*y^2, 4*x*y, unused) t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0, 0, 1, 0)); // (4*z^2, 4*w^2, 4*z*w, unused) t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0, 2, 3, 2)); // (4*x*z, 4*y*z, 4*x*w, 4*y*w) t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1, 0, 2, 1)); // (4*x*x, 4*x*y, 4*x*z, 4*x*w) M3D_VECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2, 0, 2, 0)); // (4*y*x, 4*y*y, 4*y*z, 4*y*w) M3D_VECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 1, 1, 2)); // (4*z*x, 
4*z*y, 4*z*z, 4*z*w) M3D_VECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2, 0, 1, 0)); // (4*w*x, 4*w*y, 4*w*z, 4*w*w) M3D_VECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1, 2, 3, 2)); // Select the row of the tensor-product matrix that has the largest // magnitude. t0 = _mm_and_ps(x2gey2, tensor0); t1 = _mm_andnot_ps(x2gey2, tensor1); t0 = _mm_or_ps(t0, t1); t1 = _mm_and_ps(z2gew2, tensor2); t2 = _mm_andnot_ps(z2gew2, tensor3); t1 = _mm_or_ps(t1, t2); t0 = _mm_and_ps(x2py2gez2pw2, t0); t1 = _mm_andnot_ps(x2py2gez2pw2, t1); t2 = _mm_or_ps(t0, t1); // Normalize the row. No division by zero is possible because the // quaternion is unit-length (and the row is a nonzero multiple of // the quaternion). t0 = M3D_V4Length(t2); return _mm_div_ps(t2, t0); #endif } /* -------------------------------------------------------------------------------------------------------------------------- */ inline M3D_VECTOR M3D_V3Rotate(M3D_VECTOR V, M3D_VECTOR RotationQuaternion) noexcept { M3D_VECTOR A = M3D_V4Select(M3D_MSelect1110.v, V, M3D_MSelect1110.v); M3D_VECTOR Q = M3D_QConjugate(RotationQuaternion); M3D_VECTOR Result = M3D_QMultiply(Q, A); return M3D_QMultiply(Result, RotationQuaternion); } INLINE_AVX_FIX M3D_VECTOR M3D_V3Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept { #ifdef DISABLE_INTRINSICS M3D_VECTOR Z = M3D_V4SplatZ(V); M3D_VECTOR Y = M3D_V4SplatY(V); M3D_VECTOR X = M3D_V4SplatX(V); M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, M.rows[2], M.rows[3]); Result = M3D_V4MultiplyAdd(Y, M.rows[1], Result); Result = M3D_V4MultiplyAdd(X, M.rows[0], Result); return Result; #else M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z vResult = M3D_FMADD_PS(vResult, M.rows[2], M.rows[3]); M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y vResult = M3D_FMADD_PS(vTemp, M.rows[1], vResult); vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X vResult = M3D_FMADD_PS(vTemp, M.rows[0], vResult); return vResult; #endif } INLINE_AVX_FIX void 
M3D_V3Transform( M3D_F4* pOutputStream, size_t OutputStride, const M3D_F3* pInputStream, size_t InputStride, size_t VectorCount, M3D_MATRIX M ) noexcept { auto pInputVector = reinterpret_cast(pInputStream); auto pOutputVector = reinterpret_cast(pOutputStream); const M3D_VECTOR row0 = M.rows[0]; const M3D_VECTOR row1 = M.rows[1]; const M3D_VECTOR row2 = M.rows[2]; const M3D_VECTOR row3 = M.rows[3]; #ifdef DISABLE_INTRINSICS for (size_t i = 0; i < VectorCount; i++) { M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast(pInputVector)); M3D_VECTOR Z = M3D_V4SplatZ(V); M3D_VECTOR Y = M3D_V4SplatY(V); M3D_VECTOR X = M3D_V4SplatX(V); M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, row2, row3); Result = M3D_V4MultiplyAdd(Y, row1, Result); Result = M3D_V4MultiplyAdd(X, row0, Result); M3D_V4StoreF4(reinterpret_cast(pOutputVector), Result); pInputVector += InputStride; pOutputVector += OutputStride; } #else size_t i = 0; size_t four = VectorCount >> 2; if (four > 0) { if (InputStride == sizeof(M3D_F3)) { if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) { // Packed input, aligned output for (size_t j = 0; j < four; ++j) { __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); pInputVector += sizeof(M3D_F3) * 4; // Unpack the 4 vectors (.w components are junk) M3D_UNPACK3INTO4(V1, L2, L3); // Result 1 M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 2 Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V2, 
_MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 3 Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 4 Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; i += 4; } } else { // Packed input, unaligned output for (size_t j = 0; j < four; ++j) { __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); pInputVector += sizeof(M3D_F3) * 4; // Unpack the 4 vectors (.w components are junk) M3D_UNPACK3INTO4(V1, L2, L3); // Result 1 M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); 
pOutputVector += OutputStride; // Result 2 Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 3 Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 4 Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; i += 4; } } } } if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) { // Aligned output for (; i < VectorCount; ++i) { M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast(pInputVector)); pInputVector += InputStride; M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; } } else { // Unaligned output for (; i < 
VectorCount; ++i)
        {
            M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast(pInputVector));
            pInputVector += InputStride;

            M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
            M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

            M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
            M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
            M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);
            vTemp = _mm_add_ps(vTemp, vTemp3);

            _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp);
            pOutputVector += OutputStride;
        }
    }

    M3D_SFENCE();
#endif
}

// Transforms the x, y, z components of V by rows 0..2 of M.
// M.rows[3] is never read, so no translation is applied; V.w is ignored.
// Both branches accumulate z*rows[2], then y*rows[1], then x*rows[0]
// (assuming M3D_V4MultiplyAdd / M3D_FMADD_PS compute a*b + c - TODO confirm).
INLINE_AVX_FIX M3D_VECTOR M3D_V3TransformNormal(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR Z = M3D_V4SplatZ(V);
    M3D_VECTOR Y = M3D_V4SplatY(V);
    M3D_VECTOR X = M3D_V4SplatX(V);

    M3D_VECTOR Result = M3D_V4Multiply(Z, M.rows[2]);
    Result = M3D_V4MultiplyAdd(Y, M.rows[1], Result);
    Result = M3D_V4MultiplyAdd(X, M.rows[0], Result);

    return Result;
#else
    M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z
    vResult = _mm_mul_ps(vResult, M.rows[2]);
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
    vResult = M3D_FMADD_PS(vTemp, M.rows[1], vResult);
    vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
    vResult = M3D_FMADD_PS(vTemp, M.rows[0], vResult);
    return vResult;
#endif
}

// Transforms the x, y, z components of V by M with M.rows[3] added in
// (i.e. V is treated as having w = 1), then divides the whole result by its
// own w component (perspective divide).
// No guard against a zero w is present: the division is performed unconditionally.
INLINE_AVX_FIX M3D_VECTOR M3D_V3TransformPersDiv(M3D_VECTOR V, M3D_MATRIX M) noexcept {
    M3D_VECTOR Z = M3D_V4SplatZ(V);
    M3D_VECTOR Y = M3D_V4SplatY(V);
    M3D_VECTOR X = M3D_V4SplatX(V);

    M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, M.rows[2], M.rows[3]);
    Result = M3D_V4MultiplyAdd(Y, M.rows[1], Result);
    Result = M3D_V4MultiplyAdd(X, M.rows[0], Result);

    // Broadcast w to every lane so one divide normalises x, y, z and w together.
    M3D_VECTOR W = M3D_V4SplatW(Result);
    return M3D_V4Divide(Result, W);
}

// Stream version of M3D_V3TransformPersDiv: reads VectorCount M3D_F3 values from
// pInputStream, transforms each by M with perspective divide, and writes the
// x, y, z of each result to pOutputStream.
//
//   pOutputStream / OutputStride : destination and the step between outputs
//   pInputStream  / InputStride  : source and the step between inputs
//   VectorCount                  : number of vectors to process
//
// The strides are compared against sizeof(M3D_F3) and added directly to the
// cursors, so they appear to be byte strides - TODO confirm.
//
// NOTE(review): every reinterpret_cast in this function is written without a
// target type (`reinterpret_cast(x)`); the `<T>` arguments look like they were
// lost. Confirm the intended types against the original file before building.
INLINE_AVX_FIX void M3D_V3TransformPersDiv(
    M3D_F3* pOutputStream,
    size_t OutputStride,
    const M3D_F3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    M3D_MATRIX M
) noexcept {
    auto pInputVector = reinterpret_cast(pInputStream);
    auto pOutputVector = reinterpret_cast(pOutputStream);

    // Cache the matrix rows once; they are reused for every vector.
    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

#ifdef DISABLE_INTRINSICS
    // Portable path: one vector per iteration.
    for (size_t i = 0; i < VectorCount; i++) {
        M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast(pInputVector));

        M3D_VECTOR Z = M3D_V4SplatZ(V);
        M3D_VECTOR Y = M3D_V4SplatY(V);
        M3D_VECTOR X = M3D_V4SplatX(V);

        M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, row2, row3);
        Result = M3D_V4MultiplyAdd(Y, row1, Result);
        Result = M3D_V4MultiplyAdd(X, row0, Result);

        M3D_VECTOR W = M3D_V4SplatW(Result);
        Result = M3D_V4Divide(Result, W);

        M3D_V4StoreF3(reinterpret_cast(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
#else
    // i counts vectors already processed; whatever the 4-wide loops below do
    // not consume is handled by the one-at-a-time loop at the end.
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0) {
        // The 4-wide loops are only taken when the input is tightly packed
        // (InputStride == sizeof(M3D_F3)): four inputs are then fetched with
        // three 4-float loads.
        if (InputStride == sizeof(M3D_F3)) {
            if (OutputStride == sizeof(M3D_F3)) {
                if (!(reinterpret_cast(pOutputStream) & 0xF)) {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < four; ++j) {
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32));
                        pInputVector += sizeof(M3D_F3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        // V2, V3 and V4 used below are not declared in this function;
                        // presumably the macro introduces them - TODO confirm.
                        M3D_UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                        M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                        M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V2 = _mm_div_ps(vTemp, W);

                        // Result 3
                        Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V3 = _mm_div_ps(vTemp, W);

                        // Result 4
                        Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V4 = _mm_div_ps(vTemp, W);

                        // Pack and store the vectors
                        // The three stores below write V1, vTemp and V3, so the macro
                        // presumably repacks V1..V4 into those three registers - TODO confirm.
                        M3D_PACK4INTO3(vTemp);
                        M3D_STREAM_PS(reinterpret_cast(pOutputVector), V1);
                        M3D_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp);
                        M3D_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3);
                        pOutputVector += sizeof(M3D_F3) * 4;
                        i += 4;
                    }
                } else {
                    // Packed input, unaligned & packed output
                    // Same arithmetic as the aligned branch; only the stores differ
                    // (_mm_storeu_ps instead of M3D_STREAM_PS).
                    for (size_t j = 0; j < four; ++j) {
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32));
                        pInputVector += sizeof(M3D_F3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        M3D_UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                        M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                        M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V2 = _mm_div_ps(vTemp, W);

                        // Result 3
                        Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V3 = _mm_div_ps(vTemp, W);

                        // Result 4
                        Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = M3D_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V4 = _mm_div_ps(vTemp, W);

                        // Pack and store the vectors
                        M3D_PACK4INTO3(vTemp);
                        _mm_storeu_ps(reinterpret_cast(pOutputVector), V1);
                        _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp);
                        _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3);
                        pOutputVector += sizeof(M3D_F3) * 4;
                        i += 4;
                    }
                }
            } else {
                // Packed input, unpacked output
                // Each result is written individually with M3D_V4StoreF3 and the
                // output cursor advances by OutputStride after every store.
                for (size_t j = 0; j < four; ++j) {
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32));
                    pInputVector += sizeof(M3D_F3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    M3D_UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
                    M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = M3D_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    M3D_V4StoreF3(reinterpret_cast(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    // Remainder: everything not consumed above, including the whole stream
    // when the input is not tightly packed.
    for (; i < VectorCount; i++) {
        M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast(pInputVector));
        pInputVector += InputStride;

        M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
        M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
        M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

        M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3);
        M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1);
        M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0);
        vTemp = _mm_add_ps(vTemp, vTemp2);
        vTemp = _mm_add_ps(vTemp, vTemp3);

        M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

        vTemp = _mm_div_ps(vTemp, W);

        M3D_V4StoreF3(reinterpret_cast(pOutputVector), vTemp);
        pOutputVector += OutputStride;
    }

    // Presumably a store fence paired with the M3D_STREAM_PS stores above - TODO confirm.
    M3D_SFENCE();
#endif
}

// Transforms all four components of V by M:
// w*rows[3], then z*rows[2], y*rows[1] and x*rows[0] are accumulated in that order
// (assuming M3D_V4MultiplyAdd / M3D_FMADD_PS compute a*b + c - TODO confirm).
INLINE_AVX_FIX M3D_VECTOR M3D_V4Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_VECTOR W = M3D_V4SplatW(V);
    M3D_VECTOR Z = M3D_V4SplatZ(V);
    M3D_VECTOR Y = M3D_V4SplatY(V);
    M3D_VECTOR X = M3D_V4SplatX(V);

    M3D_VECTOR Result = M3D_V4Multiply(W, M.rows[3]);
    Result = M3D_V4MultiplyAdd(Z, M.rows[2], Result);
    Result = M3D_V4MultiplyAdd(Y, M.rows[1], Result);
    Result = M3D_V4MultiplyAdd(X, M.rows[0], Result);

    return Result;
#else
    M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); // W
    vResult = _mm_mul_ps(vResult, M.rows[3]);
    M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z
    vResult = M3D_FMADD_PS(vTemp, M.rows[2], vResult);
    vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
    vResult = M3D_FMADD_PS(vTemp, M.rows[1], vResult);
    vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
    vResult = M3D_FMADD_PS(vTemp, M.rows[0], vResult);
    return vResult;
#endif
}

// Stream version of M3D_V4Transform: transforms VectorCount M3D_F4 values read
// from pInputStream by M and writes the 4-component results to pOutputStream.
// NOTE(review): the reinterpret_cast calls here carry no target type; the `<T>`
// arguments look like they were lost - confirm against the original file.
INLINE_AVX_FIX void M3D_V4Transform(M3D_F4* pOutputStream, size_t OutputStride, const M3D_F4* pInputStream, size_t InputStride, size_t VectorCount, M3D_MATRIX M) noexcept {
    auto pInputVector = reinterpret_cast(pInputStream);
    auto pOutputVector =
reinterpret_cast(pOutputStream);

    // NOTE(review): the reinterpret_cast calls in this function are written
    // without a target type; the `<T>` arguments look like they were lost.
    // Confirm the intended types against the original file.

#ifdef DISABLE_INTRINSICS
    // Portable path: one vector per iteration.
    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

    for (size_t i = 0; i < VectorCount; i++) {
        M3D_VECTOR V = M3D_V4LoadF4(reinterpret_cast(pInputVector));
        M3D_VECTOR W = M3D_V4SplatW(V);
        M3D_VECTOR Z = M3D_V4SplatZ(V);
        M3D_VECTOR Y = M3D_V4SplatY(V);
        M3D_VECTOR X = M3D_V4SplatX(V);

        M3D_VECTOR Result = M3D_V4Multiply(W, row3);
        Result = M3D_V4MultiplyAdd(Z, row2, Result);
        Result = M3D_V4MultiplyAdd(Y, row1, Result);
        Result = M3D_V4MultiplyAdd(X, row0, Result);

        M3D_V4StoreF4(reinterpret_cast(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
#elif defined(AVX2_INTRINSICS)
    // AVX2 path: two vectors per iteration in one 256-bit register.
    // i counts vectors already processed; the 128-bit loop further down
    // handles whatever the 256-bit loops leave over.
    size_t i = 0;
    size_t two = VectorCount >> 1;
    if (two > 0) {
        // Each matrix row is broadcast into both 128-bit halves.
        __m256 row0 = _mm256_broadcast_ps(&M.rows[0]);
        __m256 row1 = _mm256_broadcast_ps(&M.rows[1]);
        __m256 row2 = _mm256_broadcast_ps(&M.rows[2]);
        __m256 row3 = _mm256_broadcast_ps(&M.rows[3]);

        // The 256-bit loops are only taken for tightly packed input.
        if (InputStride == sizeof(M3D_F4)) {
            if (OutputStride == sizeof(M3D_F4)) {
                if (!(reinterpret_cast(pOutputStream) & 0x1F)) {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < two; ++j) {
                        __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector));
                        pInputVector += sizeof(M3D_F4) * 2;

                        // _mm256_shuffle_ps works per 128-bit half, so each half
                        // gets its own vector's x / y / z / w replicated.
                        __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
                        __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                        __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                        __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));

                        // (x*row0 + z*row2) + (y*row1 + w*row3)
                        vTempX = _mm256_mul_ps(vTempX, row0);
                        vTempY = _mm256_mul_ps(vTempY, row1);
                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
                        vTempX = _mm256_add_ps(vTempZ, vTempW);

                        M3D_STREAM_256b_PS(reinterpret_cast(pOutputVector), vTempX);
                        pOutputVector += sizeof(M3D_F4) * 2;

                        i += 2;
                    }
                } else {
                    // Packed input, packed output
                    // Output address did not pass the 32-byte alignment test:
                    // same arithmetic, unaligned 256-bit store.
                    for (size_t j = 0; j < two; ++j) {
                        __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector));
                        pInputVector += sizeof(M3D_F4) * 2;

                        __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
                        __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                        __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                        __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));

                        vTempX = _mm256_mul_ps(vTempX, row0);
                        vTempY = _mm256_mul_ps(vTempY, row1);
                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
                        vTempX = _mm256_add_ps(vTempZ, vTempW);

                        _mm256_storeu_ps(reinterpret_cast(pOutputVector), vTempX);
                        pOutputVector += sizeof(M3D_F4) * 2;

                        i += 2;
                    }
                }
            } else {
                // Packed input, unpacked output
                // The two results are stored separately (low half, then high
                // half), stepping the output cursor by OutputStride each time.
                for (size_t j = 0; j < two; ++j) {
                    __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector));
                    pInputVector += sizeof(M3D_F4) * 2;

                    __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
                    __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                    __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                    __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));

                    vTempX = _mm256_mul_ps(vTempX, row0);
                    vTempY = _mm256_mul_ps(vTempY, row1);
                    vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
                    vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
                    vTempX = _mm256_add_ps(vTempZ, vTempW);

                    _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_castps256_ps128(vTempX));
                    pOutputVector += OutputStride;

                    _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_extractf128_ps(vTempX, 1));
                    pOutputVector += OutputStride;

                    i += 2;
                }
            }
        }
    }

    // Remainder (odd trailing vector, or the whole stream when the input is
    // not tightly packed): one vector at a time with 128-bit operations.
    if (i < VectorCount) {
        const M3D_VECTOR row0 = M.rows[0];
        const M3D_VECTOR row1 = M.rows[1];
        const M3D_VECTOR row2 = M.rows[2];
        const M3D_VECTOR row3 = M.rows[3];

        for (; i < VectorCount; i++) {
            __m128 V = _mm_loadu_ps(reinterpret_cast(pInputVector));
            pInputVector += InputStride;

            M3D_VECTOR vTempX = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
            M3D_VECTOR vTempY = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            M3D_VECTOR vTempZ = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
            M3D_VECTOR vTempW = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

            vTempX = _mm_mul_ps(vTempX, row0);
            vTempY = _mm_mul_ps(vTempY, row1);
            vTempZ = M3D_FMADD_PS(vTempZ, row2, vTempX);
            vTempW = M3D_FMADD_PS(vTempW, row3, vTempY);
            vTempX = _mm_add_ps(vTempZ, vTempW);

            _mm_storeu_ps(reinterpret_cast(pOutputVector), vTempX);
            pOutputVector += OutputStride;
        }
    }

    M3D_SFENCE();
#else
    // SSE path: one vector per iteration. The four loops below are identical
    // except for the load (_mm_load_ps vs _mm_loadu_ps) and the store
    // (M3D_STREAM_PS vs _mm_storeu_ps), chosen once from the 16-byte
    // alignment of the stream address and stride on each side.
    const M3D_VECTOR row0 = M.rows[0];
    const M3D_VECTOR row1 = M.rows[1];
    const M3D_VECTOR row2 = M.rows[2];
    const M3D_VECTOR row3 = M.rows[3];

    if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) {
        if (!(reinterpret_cast(pInputStream) & 0xF) && !(InputStride & 0xF)) {
            // Aligned input, aligned output
            for (size_t i = 0; i < VectorCount; i++) {
                __m128 V = _mm_load_ps(reinterpret_cast(pInputVector));
                pInputVector += InputStride;

                M3D_VECTOR vTempX = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                M3D_VECTOR vTempY = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                M3D_VECTOR vTempZ = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                M3D_VECTOR vTempW = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = M3D_FMADD_PS(vTempZ, row2, vTempX);
                vTempW = M3D_FMADD_PS(vTempW, row3, vTempY);
                vTempX = _mm_add_ps(vTempZ, vTempW);

                M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        } else {
            // Unaligned input, aligned output
            for (size_t i = 0; i < VectorCount; i++) {
                __m128 V = _mm_loadu_ps(reinterpret_cast(pInputVector));
                pInputVector += InputStride;

                M3D_VECTOR vTempX = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                M3D_VECTOR vTempY = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                M3D_VECTOR vTempZ = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                M3D_VECTOR vTempW = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = M3D_FMADD_PS(vTempZ, row2, vTempX);
                vTempW = M3D_FMADD_PS(vTempW, row3, vTempY);
                vTempX = _mm_add_ps(vTempZ, vTempW);

                M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        }
    } else {
        if (!(reinterpret_cast(pInputStream) & 0xF) && !(InputStride & 0xF)) {
            // Aligned input, unaligned output
            for (size_t i = 0; i < VectorCount; i++) {
                __m128 V = _mm_load_ps(reinterpret_cast(pInputVector));
                pInputVector += InputStride;

                M3D_VECTOR vTempX = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                M3D_VECTOR vTempY = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                M3D_VECTOR vTempZ = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                M3D_VECTOR vTempW = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = M3D_FMADD_PS(vTempZ, row2, vTempX);
                vTempW = M3D_FMADD_PS(vTempW, row3, vTempY);
                vTempX = _mm_add_ps(vTempZ, vTempW);

                _mm_storeu_ps(reinterpret_cast(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        } else {
            // Unaligned input, unaligned output
            for (size_t i = 0; i < VectorCount; i++) {
                __m128 V = _mm_loadu_ps(reinterpret_cast(pInputVector));
                pInputVector += InputStride;

                M3D_VECTOR vTempX = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                M3D_VECTOR vTempY = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                M3D_VECTOR vTempZ = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                M3D_VECTOR vTempW = M3D_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = M3D_FMADD_PS(vTempZ, row2, vTempX);
                vTempW = M3D_FMADD_PS(vTempW, row3, vTempY);
                vTempX = _mm_add_ps(vTempZ, vTempW);

                _mm_storeu_ps(reinterpret_cast(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        }
    }

    M3D_SFENCE();
#endif
}

// Maps V to viewport space as V * s + o, where
//   s = (vpW / 2, -vpH / 2, vpMaxZ - vpMinZ, 0)
//   o = (vpX + vpW / 2, vpY + vpH / 2, vpMinZ, 0)
// The negative y scale flips the vertical axis. The w lane of both s and o is 0.
// (Assumes M3D_V4MultiplyAdd(a, b, c) computes a*b + c - TODO confirm.)
inline M3D_VECTOR M3D_V3TransformNDCToViewport(M3D_VECTOR V, float vpX, float vpY, float vpW, float vpH, float vpMinZ, float vpMaxZ) noexcept {
    const float halfVPWidth = vpW * 0.5f;
    const float halfVPHeight = vpH * 0.5f;

    M3D_VECTOR s = M3D_V4Set(halfVPWidth, -halfVPHeight, vpMaxZ - vpMinZ, 0.0f);
    M3D_VECTOR o = M3D_V4Set(vpX +
halfVPWidth, vpY + halfVPHeight, vpMinZ, 0.0f);

    return M3D_V4MultiplyAdd(V, s, o);
}


/* -------------------------------------------------------------------------------------------------------------------------- */


// Builds a left-handed view matrix for a camera at viewPos looking at focusPos.
// The view direction is focusPos - viewPos; the work is done by
// M3D_TransformMatrixCamLookToLH.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookAtLH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
    M3D_VECTOR dir = M3D_V4Subtract(focusPos, viewPos);
    return M3D_TransformMatrixCamLookToLH(viewPos, dir, upDirection);
}

// Builds a right-handed view matrix for a camera at viewPos looking at focusPos.
// Implemented by handing the negated view direction (viewPos - focusPos) to the
// left-handed builder.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookAtRH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept {
    M3D_VECTOR dir_n = M3D_V4Subtract(viewPos, focusPos);
    return M3D_TransformMatrixCamLookToLH(viewPos, dir_n, upDirection);
}

// Builds a left-handed view matrix for a camera at viewPos looking along
// viewDirection, with upDirection as the approximate up vector.
//
// Steps:
//   look  = normalize(viewDirection)
//   right = normalize(cross(upDirection, look))
//   up    = cross(look, right)
// Each of rows 0..2 is first filled with one axis in x, y, z and
// dot(axis, -viewPos) in w (assuming the 1110 select mask takes x, y, z from
// its second operand and w from its first - TODO confirm), row 3 is set to
// M3D_MIdentityR3, and the matrix is transposed before being returned.
//
// No check is made for a zero-length viewDirection or an upDirection parallel
// to it; both feed straight into M3D_V3Normalize.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookToLH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
    // Keep viewer's axes orthogonal to each other and of unit length
    M3D_VECTOR lookAxis = M3D_V3Normalize(viewDirection);
    M3D_VECTOR rightAxis = M3D_V3Normalize(M3D_V3Cross(upDirection, lookAxis));

    // look and right are already orthonormal, so the cross product needs no normalization
    M3D_VECTOR upAxis = M3D_V3Cross(lookAxis, rightAxis);

    // Translation terms: each axis dotted with the negated camera position.
    M3D_VECTOR negViewPos = M3D_V4Negate(viewPos);
    M3D_VECTOR rightDot = M3D_V3Dot(rightAxis, negViewPos);
    M3D_VECTOR upDot = M3D_V3Dot(upAxis, negViewPos);
    M3D_VECTOR lookDot = M3D_V3Dot(lookAxis, negViewPos);

    M3D_MATRIX ret;
    ret.rows[0] = M3D_V4Select(rightDot, rightAxis, M3D_MSelect1110.v);
    ret.rows[1] = M3D_V4Select(upDot, upAxis, M3D_MSelect1110.v);
    ret.rows[2] = M3D_V4Select(lookDot, lookAxis, M3D_MSelect1110.v);
    ret.rows[3] = M3D_MIdentityR3.v;

    return M3D_MTranspose(ret);
}

// Builds a right-handed view matrix for a camera at viewPos looking along
// viewDirection, by passing the negated direction to the left-handed builder.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixCamLookToRH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept {
    M3D_VECTOR viewDirection_n = M3D_V4Negate(viewDirection);
    return M3D_TransformMatrixCamLookToLH(viewPos, viewDirection_n, upDirection);
}

INLINE_AVX_FIX M3D_MATRIX
M3D_TransformMatrixFrustrumFovLH(float fov, float ratio, float near, float far) noexcept {
    // Left-handed perspective projection matrix.
    //   fov   : field-of-view angle; sin/cos are taken of fov / 2
    //           (presumably the vertical FOV in radians - TODO confirm)
    //   ratio : divides Height to obtain Width (presumably width / height aspect)
    //   near, far : clip distances used for the depth range terms
    // Resulting rows:
    //   ( Width, 0,      0,              0 )
    //   ( 0,     Height, 0,              0 )
    //   ( 0,     0,      fRange,         1 )
    //   ( 0,     0,      -fRange * near, 0 )
    // No validation is performed: far == near, ratio == 0 or sin(fov / 2) == 0
    // lead to a division by zero.
    float SinFov;
    float CosFov;
    M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov);

    float fRange = far / (far - near);
    float Height = CosFov / SinFov;   // cot(fov / 2)
    float Width = Height / ratio;

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = Width;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = Height;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = fRange;
    ret.mat[2][3] = 1.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = -fRange * near;
    ret.mat[3][3] = 0.0f;
    return ret;
#else
    M3D_VECTOR rMem = {
        Width,
        Height,
        fRange,
        -fRange * near
    };

    // Copy from memory to SSE register
    M3D_VECTOR vValues = rMem;

    M3D_MATRIX ret;
    M3D_VECTOR vTemp = _mm_setzero_ps();
    // Take only lane 0 of vValues; the other lanes stay zero.
    vTemp = _mm_move_ss(vTemp, vValues);
    ret.rows[0] = vTemp; // Width, 0, 0, 0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, M3D_MMaskY);
    ret.rows[1] = vTemp; // 0, Height, 0, 0
    vTemp = _mm_setzero_ps();
    // vValues becomes (fRange, -fRange * near, R3.z, R3.w) where R3 is
    // M3D_MIdentityR3 (presumably (0, 0, 0, 1) - TODO confirm).
    vValues = _mm_shuffle_ps(vValues, M3D_MIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
    ret.rows[2] = vTemp; // 0, 0, fRange, 1.0f
    // Lanes 0/1 come from vTemp (both zero), lanes 2/3 from vValues lanes 1/2.
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
    ret.rows[3] = vTemp; // 0, 0, -fRange * near, 0.0f
    return ret;
#endif
}

// Right-handed perspective projection matrix. Same parameters as the LH
// variant; differs in fRange = far / (near - far), a -1 in row 2 lane w,
// and +fRange * near in row 3 lane z:
//   ( Width, 0,      0,             0 )
//   ( 0,     Height, 0,             0 )
//   ( 0,     0,      fRange,       -1 )
//   ( 0,     0,      fRange * near, 0 )
// No validation is performed on the inputs.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixFrustrumFovRH(float fov, float ratio, float near, float far) noexcept {
    float SinFov;
    float CosFov;
    M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov);

    float fRange = far / (near - far);
    float Height = CosFov / SinFov;   // cot(fov / 2)
    float Width = Height / ratio;

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = Width;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = Height;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = fRange;
    ret.mat[2][3] = -1.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = fRange * near;
    ret.mat[3][3] = 0.0f;
    return ret;
#else
    M3D_VECTOR rMem = {
        Width,
        Height,
        fRange,
        fRange * near
    };

    // Copy from memory to SSE register
    M3D_VECTOR vValues = rMem;

    M3D_MATRIX ret;
    M3D_VECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_move_ss(vTemp, vValues);
    ret.rows[0] = vTemp; // Width, 0, 0, 0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, M3D_MMaskY);
    ret.rows[1] = vTemp; // 0, Height, 0, 0
    vTemp = _mm_setzero_ps();
    // Same shuffle sequence as the LH variant, but the upper lanes come from
    // M3D_MIdentityR3_n (presumably (0, 0, 0, -1) - TODO confirm).
    vValues = _mm_shuffle_ps(vValues, M3D_MIdentityR3_n, _MM_SHUFFLE(3, 2, 3, 2));
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
    ret.rows[2] = vTemp; // 0, 0, fRange, -1.0f
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
    ret.rows[3] = vTemp; // 0, 0, fRange * near, 0.0f
    return ret;
#endif
}

// Translation matrix from a vector: identity in rows 0..2, and row 3 holds
// Offset.x, Offset.y, Offset.z with w = 1. Offset.w is not used.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixTranslate(M3D_VECTOR Offset) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = 1.0f;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = 1.0f;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = 1.0f;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = Offset.v4f[0];
    ret.mat[3][1] = Offset.v4f[1];
    ret.mat[3][2] = Offset.v4f[2];
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_MATRIX ret;
    ret.rows[0] = M3D_MIdentityR0.v;
    ret.rows[1] = M3D_MIdentityR1.v;
    ret.rows[2] = M3D_MIdentityR2.v;
    // Presumably x, y, z from Offset and w from the identity row (per the
    // 1110 mask) - matches the scalar branch above.
    ret.rows[3] = M3D_V4Select(M3D_MIdentityR3.v, Offset, M3D_MSelect1110.v);
    return ret;
#endif
}

// Scaling matrix: ScaleX, ScaleY, ScaleZ on the diagonal, 1 in the last
// diagonal slot, zeros elsewhere.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixScale(float ScaleX, float ScaleY, float ScaleZ) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = ScaleX;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = ScaleY;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = ScaleZ;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] =
0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_MATRIX ret;
    // _mm_set_ps takes its arguments from the highest lane down to lane 0,
    // so the last argument is x.
    ret.rows[0] = _mm_set_ps(0, 0, 0, ScaleX);
    ret.rows[1] = _mm_set_ps(0, 0, ScaleY, 0);
    ret.rows[2] = _mm_set_ps(0, ScaleZ, 0, 0);
    ret.rows[3] = M3D_MIdentityR3.v;
    return ret;
#endif
}

// Scaling matrix from a vector: Scale.x, Scale.y, Scale.z on the diagonal,
// 1 in the last diagonal slot. Scale.w is not used.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixScale(M3D_VECTOR Scale) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = Scale.v4f[0];
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = Scale.v4f[1];
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = Scale.v4f[2];
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_MATRIX ret;
    // Each mask presumably keeps a single lane of Scale and zeroes the rest,
    // matching the scalar branch - TODO confirm mask definitions.
    ret.rows[0] = _mm_and_ps(Scale, M3D_MMaskX);
    ret.rows[1] = _mm_and_ps(Scale, M3D_MMaskY);
    ret.rows[2] = _mm_and_ps(Scale, M3D_MMaskZ);
    ret.rows[3] = M3D_MIdentityR3.v;
    return ret;
#endif
}

// Translation matrix from three scalars: identity in rows 0..2,
// row 3 = (OffsetX, OffsetY, OffsetZ, 1).
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixTranslate(float OffsetX, float OffsetY, float OffsetZ) noexcept {
#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = 1.0f;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = 1.0f;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = 1.0f;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = OffsetX;
    ret.mat[3][1] = OffsetY;
    ret.mat[3][2] = OffsetZ;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_MATRIX ret;
    ret.rows[0] = M3D_MIdentityR0.v;
    ret.rows[1] = M3D_MIdentityR1.v;
    ret.rows[2] = M3D_MIdentityR2.v;
    ret.rows[3] = M3D_V4Set(OffsetX, OffsetY, OffsetZ, 1.f);
    return ret;
#endif
}

// Rotation about the x axis by Angle (passed straight to M3D_ScalarSinCos;
// presumably radians - TODO confirm). Resulting rows:
//   ( 1,  0,    0,   0 )
//   ( 0,  cos,  sin, 0 )
//   ( 0, -sin,  cos, 0 )
//   ( 0,  0,    0,   1 )
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationX(float Angle) noexcept {
    float SinAngle;
    float CosAngle;
    M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = 1.0f;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = CosAngle;
    ret.mat[1][2] = SinAngle;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = -SinAngle;
    ret.mat[2][2] = CosAngle;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    // _mm_set_ss puts the value in lane 0 and zeroes lanes 1..3, which is
    // where the zeros in the shuffles below come from.
    M3D_VECTOR vSin = _mm_set_ss(SinAngle);
    M3D_VECTOR vCos = _mm_set_ss(CosAngle);
    // x = 0,y = cos,z = sin, w = 0
    vCos = _mm_shuffle_ps(vCos, vSin, _MM_SHUFFLE(3, 0, 0, 3));
    M3D_MATRIX ret;
    ret.rows[0] = M3D_MIdentityR0;
    ret.rows[1] = vCos;
    // x = 0,y = sin,z = cos, w = 0
    vCos = M3D_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 1, 2, 0));
    // x = 0,y = -sin,z = cos, w = 0
    vCos = _mm_mul_ps(vCos, M3D_MNegateY);
    ret.rows[2] = vCos;
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}

// Rotation about the y axis by Angle (presumably radians - TODO confirm).
// Resulting rows:
//   ( cos, 0, -sin, 0 )
//   ( 0,   1,  0,   0 )
//   ( sin, 0,  cos, 0 )
//   ( 0,   0,  0,   1 )
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationY(float Angle) noexcept {
    float SinAngle;
    float CosAngle;
    M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle);

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = CosAngle;
    ret.mat[0][1] = 0.0f;
    ret.mat[0][2] = -SinAngle;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = 0.0f;
    ret.mat[1][1] = 1.0f;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = SinAngle;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = CosAngle;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    M3D_VECTOR vSin = _mm_set_ss(SinAngle);
    M3D_VECTOR vCos = _mm_set_ss(CosAngle);
    // x = sin,y = 0,z = cos, w = 0
    vSin = _mm_shuffle_ps(vSin, vCos, _MM_SHUFFLE(3, 0, 3, 0));
    M3D_MATRIX ret;
    // Row 2 is written first because it is available straight after the shuffle.
    ret.rows[2] = vSin;
    ret.rows[1] = M3D_MIdentityR1;
    // x = cos,y = 0,z = sin, w = 0
    vSin = M3D_PERMUTE_PS(vSin, _MM_SHUFFLE(3, 0, 1, 2));
    // x = cos,y = 0,z = -sin, w = 0
    vSin = _mm_mul_ps(vSin, M3D_MNegateZ);
    ret.rows[0] = vSin;
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}

// Rotation about the z axis by Angle (presumably radians - TODO confirm).
// Resulting rows:
//   (  cos, sin, 0, 0 )
//   ( -sin, cos, 0, 0 )
//   (  0,   0,   1, 0 )
//   (  0,   0,   0, 1 )
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationZ(float Angle) noexcept {
    float SinAngle;
    float CosAngle;
    M3D_ScalarSinCos(&SinAngle,
&CosAngle, Angle);

#ifdef DISABLE_INTRINSICS
    M3D_MATRIX ret;
    ret.mat[0][0] = CosAngle;
    ret.mat[0][1] = SinAngle;
    ret.mat[0][2] = 0.0f;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = -SinAngle;
    ret.mat[1][1] = CosAngle;
    ret.mat[1][2] = 0.0f;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = 0.0f;
    ret.mat[2][1] = 0.0f;
    ret.mat[2][2] = 1.0f;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    // _mm_set_ss leaves lanes 1..3 at zero.
    M3D_VECTOR vSin = _mm_set_ss(SinAngle);
    M3D_VECTOR vCos = _mm_set_ss(CosAngle);
    // x = cos,y = sin,z = 0, w = 0
    vCos = _mm_unpacklo_ps(vCos, vSin);
    M3D_MATRIX ret;
    ret.rows[0] = vCos;
    // x = sin,y = cos,z = 0, w = 0
    vCos = M3D_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 2, 0, 1));
    // x = -sin,y = cos,z = 0, w = 0 (matches mat[1][0] / mat[1][1] in the scalar branch)
    vCos = _mm_mul_ps(vCos, M3D_MNegateX);
    ret.rows[1] = vCos;
    ret.rows[2] = M3D_MIdentityR2;
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}

// Rotation matrix from three angles packed in Angles.
// Going by the local names in the scalar branch, lane 0 is pitch (cp/sp),
// lane 1 is yaw (cy/sy) and lane 2 is roll (cr/sr); lane 3 is not read there.
// The upper-left 3x3 is the product spelled out in the scalar branch; the last
// row and column are those of the identity.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotation(M3D_VECTOR Angles) noexcept {
#ifdef DISABLE_INTRINSICS
    float cp = cosf(Angles.v4f[0]);
    float sp = sinf(Angles.v4f[0]);

    float cy = cosf(Angles.v4f[1]);
    float sy = sinf(Angles.v4f[1]);

    float cr = cosf(Angles.v4f[2]);
    float sr = sinf(Angles.v4f[2]);

    M3D_MATRIX ret;
    ret.mat[0][0] = cr * cy + sr * sp * sy;
    ret.mat[0][1] = sr * cp;
    ret.mat[0][2] = sr * sp * cy - cr * sy;
    ret.mat[0][3] = 0.0f;

    ret.mat[1][0] = cr * sp * sy - sr * cy;
    ret.mat[1][1] = cr * cp;
    ret.mat[1][2] = sr * sy + cr * sp * cy;
    ret.mat[1][3] = 0.0f;

    ret.mat[2][0] = cp * sy;
    ret.mat[2][1] = -sp;
    ret.mat[2][2] = cp * cy;
    ret.mat[2][3] = 0.0f;

    ret.mat[3][0] = 0.0f;
    ret.mat[3][1] = 0.0f;
    ret.mat[3][2] = 0.0f;
    ret.mat[3][3] = 1.0f;
    return ret;
#else
    static const M3D_V4F32 Sign = {{{1.0f, -1.0f, -1.0f, 1.0f}}};

    M3D_VECTOR SinAngles, CosAngles;
    M3D_V4SinCos(&SinAngles, &CosAngles, Angles);

    // NOTE(review): every M3D_V4Permute call below is written with identical
    // arguments and no lane selectors, so as written P0, Y0, P1, Y1, P2 and P3
    // (and V0 / V1) cannot differ. The per-call selector (template) arguments
    // look like they were lost; restore them from the original file.
    M3D_VECTOR P0 = M3D_V4Permute(SinAngles, CosAngles);
    M3D_VECTOR Y0 = M3D_V4Permute(SinAngles, CosAngles);
    M3D_VECTOR P1 = M3D_V4Permute(SinAngles, CosAngles);
    M3D_VECTOR Y1 = M3D_V4Permute(SinAngles, CosAngles);
    M3D_VECTOR P2 = M3D_V4Permute(SinAngles, CosAngles);
    M3D_VECTOR P3 = M3D_V4Permute(SinAngles, CosAngles);
    M3D_VECTOR Y2 = M3D_V4SplatX(SinAngles);
    M3D_VECTOR NS = M3D_V4Negate(SinAngles);

    M3D_VECTOR Q0 = M3D_V4Multiply(P0, Y0);
    M3D_VECTOR Q1 = M3D_V4Multiply(P1, Sign.v);
    Q1 = M3D_V4Multiply(Q1, Y1);
    M3D_VECTOR Q2 = M3D_V4Multiply(P2, Y2);
    Q2 = M3D_V4MultiplyAdd(Q2, P3, Q1);

    M3D_VECTOR V0 = M3D_V4Permute(Q0, Q2);
    M3D_VECTOR V1 = M3D_V4Permute(Q0, Q2);
    M3D_VECTOR V2 = M3D_V4Permute(Q0, NS);

    M3D_MATRIX ret;
    // Presumably keeps x, y, z of each V and takes w from M3D_MZero (per the
    // 1110 mask), matching the zero last column of the scalar branch.
    ret.rows[0] = M3D_V4Select(M3D_MZero, V0, M3D_MSelect1110.v);
    ret.rows[1] = M3D_V4Select(M3D_MZero, V1, M3D_MSelect1110.v);
    ret.rows[2] = M3D_V4Select(M3D_MZero, V2, M3D_MSelect1110.v);
    ret.rows[3] = M3D_MIdentityR3;
    return ret;
#endif
}

// Rotation by Angle about NormalAxis. NormalAxis is used as-is (it is not
// normalized here), so it is presumably expected to be unit length - TODO confirm.
// Angle is passed straight to M3D_ScalarSinCos. Row 3 of the result is
// M3D_MIdentityR3.
INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationNormal(M3D_VECTOR NormalAxis, float Angle) noexcept {
    float fSinAngle;
    float fCosAngle;
    M3D_ScalarSinCos(&fSinAngle, &fCosAngle, Angle);

#ifdef DISABLE_INTRINSICS
    // A = (sin, cos, 1 - cos, 0); C0 / C1 / C2 are its x / y / z splats.
    M3D_VECTOR A = M3D_V4Set(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f);

    M3D_VECTOR C2 = M3D_V4SplatZ(A);
    M3D_VECTOR C1 = M3D_V4SplatY(A);
    M3D_VECTOR C0 = M3D_V4SplatX(A);

    // NOTE(review): the M3D_V4Swizzle / M3D_V4Permute calls in this branch are
    // written without lane selectors (N0 and N1 are identical as written); the
    // selector (template) arguments look like they were lost. Restore them from
    // the original file.
    M3D_VECTOR N0 = M3D_V4Swizzle(NormalAxis);
    M3D_VECTOR N1 = M3D_V4Swizzle(NormalAxis);

    M3D_VECTOR V0 = M3D_V4Multiply(C2, N0);
    V0 = M3D_V4Multiply(V0, N1);

    M3D_VECTOR R0 = M3D_V4Multiply(C2, NormalAxis);
    R0 = M3D_V4MultiplyAdd(R0, NormalAxis, C1);

    M3D_VECTOR R1 = M3D_V4MultiplyAdd(C0, NormalAxis, V0);
    M3D_VECTOR R2 = M3D_V4NegativeMultiplySubtract(C0, NormalAxis, V0);

    V0 = M3D_V4Select(A, R0, M3D_MSelect1110.v);
    M3D_VECTOR V1 = M3D_V4Permute(R1, R2);
    M3D_VECTOR V2 = M3D_V4Permute(R1, R2);

    M3D_MATRIX M;
    M.rows[0] = M3D_V4Permute(V0, V1);
    M.rows[1] = M3D_V4Permute(V0, V1);
    M.rows[2] = M3D_V4Permute(V0, V2);
    M.rows[3] = M3D_MIdentityR3.v;
    return M;
#else
    // C0 = sin, C1 = cos, C2 = 1 - cos, each replicated to all four lanes.
    M3D_VECTOR C2 = _mm_set_ps1(1.0f - fCosAngle);
    M3D_VECTOR C1 = _mm_set_ps1(fCosAngle);
    M3D_VECTOR C0 = _mm_set_ps1(fSinAngle);

    M3D_VECTOR N0 =
M3D_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 0, 2, 1)); M3D_VECTOR N1 = M3D_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 1, 0, 2)); M3D_VECTOR V0 = _mm_mul_ps(C2, N0); V0 = _mm_mul_ps(V0, N1); M3D_VECTOR R0 = _mm_mul_ps(C2, NormalAxis); R0 = _mm_mul_ps(R0, NormalAxis); R0 = _mm_add_ps(R0, C1); M3D_VECTOR R1 = _mm_mul_ps(C0, NormalAxis); R1 = _mm_add_ps(R1, V0); M3D_VECTOR R2 = _mm_mul_ps(C0, NormalAxis); R2 = _mm_sub_ps(V0, R2); V0 = _mm_and_ps(R0, M3D_MMask3); M3D_VECTOR V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 1, 2, 0)); V1 = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 3, 2, 1)); M3D_VECTOR V2 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(0, 0, 1, 1)); V2 = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 2, 0)); R2 = _mm_shuffle_ps(V0, V1, _MM_SHUFFLE(1, 0, 3, 0)); R2 = M3D_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 2, 0)); M3D_MATRIX M; M.rows[0] = R2; R2 = _mm_shuffle_ps(V0, V1, _MM_SHUFFLE(3, 2, 3, 1)); R2 = M3D_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 0, 2)); M.rows[1] = R2; V2 = _mm_shuffle_ps(V2, V0, _MM_SHUFFLE(3, 2, 1, 0)); M.rows[2] = V2; M.rows[3] = M3D_MIdentityR3.v; return M; #endif } INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixRotationAxis(M3D_VECTOR axis, float angle) noexcept { M3D_VECTOR nv = M3D_V3Normalize(axis); return M3D_TransformMatrixRotationNormal(nv, angle); } //TODO: transform matrix is incomplete //v_tri[v_cnt].position.z = ((far+near)/2)+((far-near)/2)*_2dCoord.z; INLINE_AVX_FIX M3D_MATRIX M3D_TransformMatrixViewport(float _w, float _h, float _wOffset, float _hOffset) noexcept { const float widthDiv2 = _w / 2; const float heightDiv2 = _h / 2; #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = widthDiv2; ret.mat[0][1] = 0.0f; ret.mat[0][2] = 0.0f; ret.mat[0][3] = 0.0f; ret.mat[1][0] = 0.0f; ret.mat[1][1] = -heightDiv2; ret.mat[1][2] = 0.0f; ret.mat[1][3] = 0.0f; ret.mat[2][0] = 0.0f; ret.mat[2][1] = 0.0f; ret.mat[2][2] = 1.0f; // maxZ-minZ ignored ret.mat[2][3] = 0.0f; // minZ ignored ret.mat[3][0] = _wOffset + widthDiv2; ret.mat[3][1] = _hOffset + heightDiv2; ret.mat[3][2] = 0.0f; 
ret.mat[3][3] = 1.0f; return ret; #else M3D_MATRIX ret; ret.rows[0] = M3D_V4Set(widthDiv2, 0, 0, 0); ret.rows[1] = M3D_V4Set(0, -heightDiv2, 0, 0); ret.rows[2] = M3D_MIdentityR2.v; // maxZ-minZ and minZ are ignored ret.rows[3] = M3D_V4Set(_wOffset + widthDiv2, _hOffset + heightDiv2, 0, 1); return ret; #endif }