#pragma once inline M3D_MATRIX::M3D_MATRIX( float f00, float f01, float f02, float f03, float f10, float f11, float f12, float f13, float f20, float f21, float f22, float f23, float f30, float f31, float f32, float f33 ) noexcept { rows[0] = M3D_V4Set(f00, f01, f02, f03); rows[1] = M3D_V4Set(f10, f11, f12, f13); rows[2] = M3D_V4Set(f20, f21, f22, f23); rows[3] = M3D_V4Set(f30, f31, f32, f33); } inline M3D_MATRIX M3D_MATRIX::operator- () const noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_V4Negate(rows[0]); ret.rows[1] = M3D_V4Negate(rows[1]); ret.rows[2] = M3D_V4Negate(rows[2]); ret.rows[3] = M3D_V4Negate(rows[3]); return ret; } inline M3D_MATRIX& M3D_MATRIX::operator+= (M3D_MATRIX M) noexcept { rows[0] = M3D_V4Add(rows[0], M.rows[0]); rows[1] = M3D_V4Add(rows[1], M.rows[1]); rows[2] = M3D_V4Add(rows[2], M.rows[2]); rows[3] = M3D_V4Add(rows[3], M.rows[3]); return *this; } inline M3D_MATRIX M3D_MATRIX::operator+ (M3D_MATRIX M) const noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_V4Add(rows[0], M.rows[0]); ret.rows[1] = M3D_V4Add(rows[1], M.rows[1]); ret.rows[2] = M3D_V4Add(rows[2], M.rows[2]); ret.rows[3] = M3D_V4Add(rows[3], M.rows[3]); return ret; } inline M3D_MATRIX& M3D_MATRIX::operator-= (M3D_MATRIX M) noexcept { rows[0] = M3D_V4Subtract(rows[0], M.rows[0]); rows[1] = M3D_V4Subtract(rows[1], M.rows[1]); rows[2] = M3D_V4Subtract(rows[2], M.rows[2]); rows[3] = M3D_V4Subtract(rows[3], M.rows[3]); return *this; } inline M3D_MATRIX M3D_MATRIX::operator- (M3D_MATRIX M) const noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_V4Subtract(rows[0], M.rows[0]); ret.rows[1] = M3D_V4Subtract(rows[1], M.rows[1]); ret.rows[2] = M3D_V4Subtract(rows[2], M.rows[2]); ret.rows[3] = M3D_V4Subtract(rows[3], M.rows[3]); return ret; } inline M3D_MATRIX& M3D_MATRIX::operator*=(M3D_MATRIX M) noexcept { *this = M3D_MMultiply(*this, M); return *this; } inline M3D_MATRIX M3D_MATRIX::operator*(M3D_MATRIX M) const noexcept { return M3D_MMultiply(*this, M); } inline M3D_MATRIX& M3D_MATRIX::operator*= (float S) noexcept { rows[0] = M3D_V4Scale(rows[0], S); rows[1] = M3D_V4Scale(rows[1], S); rows[2] = M3D_V4Scale(rows[2], S); rows[3] = M3D_V4Scale(rows[3], S); return *this; } inline M3D_MATRIX M3D_MATRIX::operator* (float S) const noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_V4Scale(rows[0], S); ret.rows[1] = M3D_V4Scale(rows[1], S); ret.rows[2] = M3D_V4Scale(rows[2], S); ret.rows[3] = M3D_V4Scale(rows[3], S); return ret; } inline M3D_MATRIX operator* (float S, M3D_MATRIX M) noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_V4Scale(M.rows[0], S); ret.rows[1] = M3D_V4Scale(M.rows[1], S); ret.rows[2] = M3D_V4Scale(M.rows[2], S); ret.rows[3] = M3D_V4Scale(M.rows[3], S); return ret; } inline M3D_MATRIX& M3D_MATRIX::operator/= (float S) noexcept { #ifdef DISABLE_INTRINSICS M3D_VECTOR vS = M3D_V4Replicate(S); rows[0] = M3D_V4Divide(rows[0], vS); rows[1] = M3D_V4Divide(rows[1], vS); rows[2] = M3D_V4Divide(rows[2], vS); rows[3] = M3D_V4Divide(rows[3], vS); return *this; #else __m128 vS = _mm_set_ps1(S); rows[0] = _mm_div_ps(rows[0], vS); rows[1] = _mm_div_ps(rows[1], vS); rows[2] = _mm_div_ps(rows[2], vS); rows[3] = _mm_div_ps(rows[3], vS); return *this; #endif } inline M3D_MATRIX M3D_MATRIX::operator/ (float S) const noexcept { #ifdef DISABLE_INTRINSICS M3D_VECTOR vS = M3D_V4Replicate(S); M3D_MATRIX ret; ret.rows[0] = M3D_V4Divide(rows[0], vS); ret.rows[1] = M3D_V4Divide(rows[1], vS); ret.rows[2] = M3D_V4Divide(rows[2], vS); ret.rows[3] = M3D_V4Divide(rows[3], vS); return ret; #else __m128 vS = _mm_set_ps1(S); M3D_MATRIX ret; ret.rows[0] = _mm_div_ps(rows[0], vS); ret.rows[1] = _mm_div_ps(rows[1], vS); ret.rows[2] = _mm_div_ps(rows[2], vS); ret.rows[3] = _mm_div_ps(rows[3], vS); return ret; #endif } /* -------------------------------------------------------------------------------------------------------------------------- */ inline M3D_MATRIX M3D_MIdentity() noexcept { M3D_MATRIX ret; ret.rows[0] = M3D_MIdentityR0.v; ret.rows[1] = M3D_MIdentityR1.v; ret.rows[2] = M3D_MIdentityR2.v; ret.rows[3] = M3D_MIdentityR3.v; return ret; } inline M3D_MATRIX M3D_MMultiply(M3D_MATRIX M1, M3D_MATRIX& M2) noexcept { #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; // Cache the invariants in registers float x = M1.mat[0][0]; float y = M1.mat[0][1]; float z = M1.mat[0][2]; float w = M1.mat[0][3]; // Perform the operation on the first row ret.mat[0][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w); ret.mat[0][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w); ret.mat[0][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w); ret.mat[0][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w); // Repeat for all the other rows x = M1.mat[1][0]; y = M1.mat[1][1]; z = M1.mat[1][2]; w = M1.mat[1][3]; ret.mat[1][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w); ret.mat[1][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w); ret.mat[1][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w); ret.mat[1][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w); x = M1.mat[2][0]; y = M1.mat[2][1]; z = M1.mat[2][2]; w = M1.mat[2][3]; ret.mat[2][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w); ret.mat[2][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w); ret.mat[2][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w); ret.mat[2][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w); x = M1.mat[3][0]; y = M1.mat[3][1]; z = M1.mat[3][2]; w = M1.mat[3][3]; ret.mat[3][0] = (M2.mat[0][0] * x) + (M2.mat[1][0] * y) + (M2.mat[2][0] * z) + (M2.mat[3][0] * w); ret.mat[3][1] = (M2.mat[0][1] * x) + (M2.mat[1][1] * y) + (M2.mat[2][1] * z) + (M2.mat[3][1] * w); ret.mat[3][2] = (M2.mat[0][2] * x) + (M2.mat[1][2] * y) + (M2.mat[2][2] * z) + (M2.mat[3][2] * w); ret.mat[3][3] = (M2.mat[0][3] * x) + (M2.mat[1][3] * y) + (M2.mat[2][3] * z) + (M2.mat[3][3] * w); return ret; #elif defined(AVX2_INTRINSICS) __m256 t0 = _mm256_castps128_ps256(M1.rows[0]); t0 = _mm256_insertf128_ps(t0, M1.rows[1], 1); __m256 t1 = _mm256_castps128_ps256(M1.rows[2]); t1 = _mm256_insertf128_ps(t1, M1.rows[3], 1); __m256 u0 = _mm256_castps128_ps256(M2.rows[0]); u0 = _mm256_insertf128_ps(u0, M2.rows[1], 1); __m256 u1 = _mm256_castps128_ps256(M2.rows[2]); u1 = _mm256_insertf128_ps(u1, M2.rows[3], 1); __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0)); __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0)); __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00); __m256 c0 = _mm256_mul_ps(a0, b0); __m256 c1 = _mm256_mul_ps(a1, b0); a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1)); a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1)); b0 = _mm256_permute2f128_ps(u0, u0, 0x11); __m256 c2 = _mm256_fmadd_ps(a0, b0, c0); __m256 c3 = _mm256_fmadd_ps(a1, b0, c1); a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2)); a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2)); __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00); __m256 c4 = _mm256_mul_ps(a0, b1); __m256 c5 = _mm256_mul_ps(a1, b1); a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3)); a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3)); b1 = _mm256_permute2f128_ps(u1, u1, 0x11); __m256 c6 = _mm256_fmadd_ps(a0, b1, c4); __m256 c7 = _mm256_fmadd_ps(a1, b1, c5); t0 = _mm256_add_ps(c2, c6); t1 = _mm256_add_ps(c3, c7); M3D_MATRIX ret; ret.rows[0] = _mm256_castps256_ps128(t0); ret.rows[1] = _mm256_extractf128_ps(t0, 1); ret.rows[2] = _mm256_castps256_ps128(t1); ret.rows[3] = _mm256_extractf128_ps(t1, 1); return ret; #else M3D_MATRIX ret; // Splat the component X,Y,Z then W #ifdef AVX_INTRINSICS XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast(&M1.rows[0]) + 0); XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast(&M1.rows[0]) + 1); XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast(&M1.rows[0]) + 2); XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast(&M1.rows[0]) + 3); #else // Use vW to hold the original row M3D_VECTOR vW = M1.rows[0]; M3D_VECTOR vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); #endif // Perform the operation on the first row vX = _mm_mul_ps(vX, M2.rows[0]); vY = _mm_mul_ps(vY, M2.rows[1]); vZ = _mm_mul_ps(vZ, M2.rows[2]); vW = _mm_mul_ps(vW, M2.rows[3]); // Perform a binary add to reduce cumulative errors vX = _mm_add_ps(vX, vZ); vY = _mm_add_ps(vY, vW); vX = _mm_add_ps(vX, vY); ret.rows[0] = vX; // Repeat for the other 3 rows #ifdef AVX_INTRINSICS vX = _mm_broadcast_ss(reinterpret_cast(&M1.rows[1]) + 0); vY = _mm_broadcast_ss(reinterpret_cast(&M1.rows[1]) + 1); vZ = _mm_broadcast_ss(reinterpret_cast(&M1.rows[1]) + 2); vW = _mm_broadcast_ss(reinterpret_cast(&M1.rows[1]) + 3); #else vW = M1.rows[1]; vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); #endif vX = _mm_mul_ps(vX, M2.rows[0]); vY = _mm_mul_ps(vY, M2.rows[1]); vZ = _mm_mul_ps(vZ, M2.rows[2]); vW = _mm_mul_ps(vW, M2.rows[3]); vX = _mm_add_ps(vX, vZ); vY = _mm_add_ps(vY, vW); vX = _mm_add_ps(vX, vY); ret.rows[1] = vX; #ifdef AVX_INTRINSICS vX = _mm_broadcast_ss(reinterpret_cast(&M1.rows[2]) + 0); vY = _mm_broadcast_ss(reinterpret_cast(&M1.rows[2]) + 1); vZ = _mm_broadcast_ss(reinterpret_cast(&M1.rows[2]) + 2); vW = _mm_broadcast_ss(reinterpret_cast(&M1.rows[2]) + 3); #else vW = M1.rows[2]; vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); #endif vX = _mm_mul_ps(vX, M2.rows[0]); vY = _mm_mul_ps(vY, M2.rows[1]); vZ = _mm_mul_ps(vZ, M2.rows[2]); vW = _mm_mul_ps(vW, M2.rows[3]); vX = _mm_add_ps(vX, vZ); vY = _mm_add_ps(vY, vW); vX = _mm_add_ps(vX, vY); ret.rows[2] = vX; #ifdef AVX_INTRINSICS vX = _mm_broadcast_ss(reinterpret_cast(&M1.rows[3]) + 0); vY = _mm_broadcast_ss(reinterpret_cast(&M1.rows[3]) + 1); vZ = _mm_broadcast_ss(reinterpret_cast(&M1.rows[3]) + 2); vW = _mm_broadcast_ss(reinterpret_cast(&M1.rows[3]) + 3); #else vW = M1.rows[3]; vX = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); vY = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); vZ = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); vW = M3D_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); #endif vX = _mm_mul_ps(vX, M2.rows[0]); vY = _mm_mul_ps(vY, M2.rows[1]); vZ = _mm_mul_ps(vZ, M2.rows[2]); vW = _mm_mul_ps(vW, M2.rows[3]); vX = _mm_add_ps(vX, vZ); vY = _mm_add_ps(vY, vW); vX = _mm_add_ps(vX, vY); ret.rows[3] = vX; return ret; #endif } inline M3D_MATRIX M3D_MTranspose(M3D_MATRIX M) noexcept { #ifdef DISABLE_INTRINSICS // Original matrix: // // m00m01m02m03 // m10m11m12m13 // m20m21m22m23 // m30m31m32m33 M3D_MATRIX P; P.rows[0] = M3D_V4MergeXY(M.rows[0], M.rows[2]); // m00m20m01m21 P.rows[1] = M3D_V4MergeXY(M.rows[1], M.rows[3]); // m10m30m11m31 P.rows[2] = M3D_V4MergeZW(M.rows[0], M.rows[2]); // m02m22m03m23 P.rows[3] = M3D_V4MergeZW(M.rows[1], M.rows[3]); // m12m32m13m33 M3D_MATRIX MT; MT.rows[0] = M3D_V4MergeXY(P.rows[0], P.rows[1]); // m00m10m20m30 MT.rows[1] = M3D_V4MergeZW(P.rows[0], P.rows[1]); // m01m11m21m31 MT.rows[2] = M3D_V4MergeXY(P.rows[2], P.rows[3]); // m02m12m22m32 MT.rows[3] = M3D_V4MergeZW(P.rows[2], P.rows[3]); // m03m13m23m33 return MT; #elif defined(AVX2_INTRINSICS) __m256 t0 = _mm256_castps128_ps256(M.rows[0]); t0 = _mm256_insertf128_ps(t0, M.rows[1], 1); __m256 t1 = _mm256_castps128_ps256(M.rows[2]); t1 = _mm256_insertf128_ps(t1, M.rows[3], 1); __m256 vTemp = _mm256_unpacklo_ps(t0, t1); __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1); __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4); vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4); t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); M3D_MATRIX ret; ret.rows[0] = _mm256_castps256_ps128(t0); ret.rows[1] = _mm256_extractf128_ps(t0, 1); ret.rows[2] = _mm256_castps256_ps128(t1); ret.rows[3] = _mm256_extractf128_ps(t1, 1); return ret; #else // x.x,x.y,y.x,y.y M3D_VECTOR vTemp1 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(1, 0, 1, 0)); // x.z,x.w,y.z,y.w M3D_VECTOR vTemp3 = _mm_shuffle_ps(M.rows[0], M.rows[1], _MM_SHUFFLE(3, 2, 3, 2)); // z.x,z.y,w.x,w.y M3D_VECTOR vTemp2 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(1, 0, 1, 0)); // z.z,z.w,w.z,w.w M3D_VECTOR vTemp4 = _mm_shuffle_ps(M.rows[2], M.rows[3], _MM_SHUFFLE(3, 2, 3, 2)); M3D_MATRIX ret; // x.x,y.x,z.x,w.x ret.rows[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); // x.y,y.y,z.y,w.y ret.rows[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); // x.z,y.z,z.z,w.z ret.rows[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); // x.w,y.w,z.w,w.w ret.rows[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); return ret; #endif } /* -------------------------------------------------------------------------------------------------------------------------- */ inline M3D_VECTOR M3D_V3Transform(M3D_VECTOR V, M3D_MATRIX M) noexcept { #ifdef DISABLE_INTRINSICS M3D_VECTOR Z = M3D_V4SplatZ(V); M3D_VECTOR Y = M3D_V4SplatY(V); M3D_VECTOR X = M3D_V4SplatX(V); M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, M.rows[2], M.rows[3]); Result = M3D_V4MultiplyAdd(Y, M.rows[1], Result); Result = M3D_V4MultiplyAdd(X, M.rows[0], Result); return Result; #else M3D_VECTOR vResult = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z vResult = M3D_FMADD_PS(vResult, M.rows[2], M.rows[3]); M3D_VECTOR vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y vResult = M3D_FMADD_PS(vTemp, M.rows[1], vResult); vTemp = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X vResult = M3D_FMADD_PS(vTemp, M.rows[0], vResult); return vResult; #endif } inline void M3D_V3Transform( M3D_F4* pOutputStream, size_t OutputStride, const M3D_F3* pInputStream, size_t InputStride, size_t VectorCount, M3D_MATRIX M ) noexcept { auto pInputVector = reinterpret_cast(pInputStream); auto pOutputVector = reinterpret_cast(pOutputStream); const M3D_VECTOR row0 = M.rows[0]; const M3D_VECTOR row1 = M.rows[1]; const M3D_VECTOR row2 = M.rows[2]; const M3D_VECTOR row3 = M.rows[3]; #ifdef DISABLE_INTRINSICS for (size_t i = 0; i < VectorCount; i++) { M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast(pInputVector)); M3D_VECTOR Z = M3D_V4SplatZ(V); M3D_VECTOR Y = M3D_V4SplatY(V); M3D_VECTOR X = M3D_V4SplatX(V); M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, row2, row3); Result = M3D_V4MultiplyAdd(Y, row1, Result); Result = M3D_V4MultiplyAdd(X, row0, Result); M3D_V4StoreF4(reinterpret_cast(pOutputVector), Result); pInputVector += InputStride; pOutputVector += OutputStride; } #else size_t i = 0; size_t four = VectorCount >> 2; if (four > 0) { if (InputStride == sizeof(M3D_F3)) { if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) { // Packed input, aligned output for (size_t j = 0; j < four; ++j) { __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); pInputVector += sizeof(M3D_F3) * 4; // Unpack the 4 vectors (.w components are junk) M3D_UNPACK3INTO4(V1, L2, L3); // Result 1 M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 2 Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 3 Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 4 Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; i += 4; } } else { // Packed input, unaligned output for (size_t j = 0; j < four; ++j) { __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); pInputVector += sizeof(M3D_F3) * 4; // Unpack the 4 vectors (.w components are junk) M3D_UNPACK3INTO4(V1, L2, L3); // Result 1 M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 2 Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 3 Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 4 Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; i += 4; } } } } if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) { // Aligned output for (; i < VectorCount; ++i) { M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast(pInputVector)); pInputVector += InputStride; M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; } } else { // Unaligned output for (; i < VectorCount; ++i) { M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast(pInputVector)); pInputVector += InputStride; M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; } } M3D_SFENCE(); #endif } inline M3D_VECTOR M3D_V3TransformPersDiv(M3D_VECTOR V, M3D_MATRIX M) noexcept { M3D_VECTOR Z = M3D_V4SplatZ(V); M3D_VECTOR Y = M3D_V4SplatY(V); M3D_VECTOR X = M3D_V4SplatX(V); M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, M.rows[2], M.rows[3]); Result = M3D_V4MultiplyAdd(Y, M.rows[1], Result); Result = M3D_V4MultiplyAdd(X, M.rows[0], Result); M3D_VECTOR W = M3D_V4SplatW(Result); return M3D_V4Divide(Result, W); } inline void M3D_V3TransformPersDiv( M3D_F3* pOutputStream, size_t OutputStride, const M3D_F3* pInputStream, size_t InputStride, size_t VectorCount, M3D_MATRIX M ) noexcept { auto pInputVector = reinterpret_cast(pInputStream); auto pOutputVector = reinterpret_cast(pOutputStream); const M3D_VECTOR row0 = M.rows[0]; const M3D_VECTOR row1 = M.rows[1]; const M3D_VECTOR row2 = M.rows[2]; const M3D_VECTOR row3 = M.rows[3]; #ifdef DISABLE_INTRINSICS for (size_t i = 0; i < VectorCount; i++) { M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast(pInputVector)); M3D_VECTOR Z = M3D_V4SplatZ(V); M3D_VECTOR Y = M3D_V4SplatY(V); M3D_VECTOR X = M3D_V4SplatX(V); M3D_VECTOR Result = M3D_V4MultiplyAdd(Z, row2, row3); Result = M3D_V4MultiplyAdd(Y, row1, Result); Result = M3D_V4MultiplyAdd(X, row0, Result); M3D_VECTOR W = M3D_V4SplatW(Result); Result = M3D_V4Divide(Result, W); M3D_V4StoreF3(reinterpret_cast(pOutputVector), Result); pInputVector += InputStride; pOutputVector += OutputStride; } #else size_t i = 0; size_t four = VectorCount >> 2; if (four > 0) { if (InputStride == sizeof(M3D_F3)) { if (OutputStride == sizeof(M3D_F3)) { if (!(reinterpret_cast(pOutputStream) & 0xF)) { // Packed input, aligned & packed output for (size_t j = 0; j < four; ++j) { __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); pInputVector += sizeof(M3D_F3) * 4; // Unpack the 4 vectors (.w components are junk) M3D_UNPACK3INTO4(V1, L2, L3); // Result 1 M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); V1 = _mm_div_ps(vTemp, W); // Result 2 Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); V2 = _mm_div_ps(vTemp, W); // Result 3 Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); V3 = _mm_div_ps(vTemp, W); // Result 4 Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); V4 = _mm_div_ps(vTemp, W); // Pack and store the vectors M3D_PACK4INTO3(vTemp); M3D_STREAM_PS(reinterpret_cast(pOutputVector), V1); M3D_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); M3D_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); pOutputVector += sizeof(M3D_F3) * 4; i += 4; } } else { // Packed input, unaligned & packed output for (size_t j = 0; j < four; ++j) { __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); pInputVector += sizeof(M3D_F3) * 4; // Unpack the 4 vectors (.w components are junk) M3D_UNPACK3INTO4(V1, L2, L3); // Result 1 M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); V1 = _mm_div_ps(vTemp, W); // Result 2 Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); V2 = _mm_div_ps(vTemp, W); // Result 3 Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); V3 = _mm_div_ps(vTemp, W); // Result 4 Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); V4 = _mm_div_ps(vTemp, W); // Pack and store the vectors M3D_PACK4INTO3(vTemp); _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); pOutputVector += sizeof(M3D_F3) * 4; i += 4; } } } else { // Packed input, unpacked output for (size_t j = 0; j < four; ++j) { __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); pInputVector += sizeof(M3D_F3) * 4; // Unpack the 4 vectors (.w components are junk) M3D_UNPACK3INTO4(V1, L2, L3); // Result 1 M3D_VECTOR Z = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); vTemp = _mm_div_ps(vTemp, W); M3D_V4StoreF3(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 2 Z = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); vTemp = _mm_div_ps(vTemp, W); M3D_V4StoreF3(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 3 Z = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); vTemp = _mm_div_ps(vTemp, W); M3D_V4StoreF3(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; // Result 4 Z = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); Y = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); X = M3D_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); vTemp = M3D_FMADD_PS(Z, row2, row3); vTemp2 = _mm_mul_ps(Y, row1); vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); vTemp = _mm_div_ps(vTemp, W); M3D_V4StoreF3(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; i += 4; } } } } for (; i < VectorCount; i++) { M3D_VECTOR V = M3D_V4LoadF3(reinterpret_cast(pInputVector)); pInputVector += InputStride; M3D_VECTOR Z = M3D_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); M3D_VECTOR Y = M3D_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); M3D_VECTOR X = M3D_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); M3D_VECTOR vTemp = M3D_FMADD_PS(Z, row2, row3); M3D_VECTOR vTemp2 = _mm_mul_ps(Y, row1); M3D_VECTOR vTemp3 = _mm_mul_ps(X, row0); vTemp = _mm_add_ps(vTemp, vTemp2); vTemp = _mm_add_ps(vTemp, vTemp3); M3D_VECTOR W = M3D_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); vTemp = _mm_div_ps(vTemp, W); M3D_V4StoreF3(reinterpret_cast(pOutputVector), vTemp); pOutputVector += OutputStride; } M3D_SFENCE(); #endif } inline M3D_VECTOR M3D_V3TransformNDCToViewport(M3D_VECTOR V, float vpX, float vpY, float vpW, float vpH, float vpMinZ, float vpMaxZ) noexcept { const float halfVPWidth = vpW * 0.5f; const float halfVPHeight = vpH * 0.5f; M3D_VECTOR s = M3D_V4Set(halfVPWidth, -halfVPHeight, vpMaxZ - vpMinZ, 0.0f); M3D_VECTOR o = M3D_V4Set(vpX + halfVPWidth, vpY + halfVPHeight, vpMinZ, 0.0f); return M3D_V4MultiplyAdd(V, s, o); } /* -------------------------------------------------------------------------------------------------------------------------- */ inline M3D_MATRIX M3D_TransformMatrixCamLookAtLH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept { M3D_VECTOR dir = M3D_V4Subtract(focusPos, viewPos); return M3D_TransformMatrixCamLookToLH(viewPos, dir, upDirection); } inline M3D_MATRIX M3D_TransformMatrixCamLookAtRH(M3D_VECTOR viewPos, M3D_VECTOR focusPos, M3D_VECTOR upDirection) noexcept { M3D_VECTOR dir_n = M3D_V4Subtract(viewPos, focusPos); return M3D_TransformMatrixCamLookToLH(viewPos, dir_n, upDirection); } inline M3D_MATRIX M3D_TransformMatrixCamLookToLH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept { // Keep viewer's axes orthogonal to each other and of unit length M3D_VECTOR look_normal = M3D_V3Normalize(viewDirection); M3D_VECTOR up_norm = M3D_V3Cross(upDirection, look_normal); up_norm = M3D_V3Normalize(up_norm); // U, L already ortho-normal, so no need to normalize cross product M3D_VECTOR right_norm = M3D_V3Cross(look_normal, up_norm); M3D_VECTOR viewPos_n = M3D_V4Negate(viewPos); M3D_VECTOR right_vec = M3D_V3Dot(up_norm, viewPos_n); M3D_VECTOR up_vec = M3D_V3Dot(right_norm, viewPos_n); M3D_VECTOR look_vec = M3D_V3Dot(look_normal, viewPos_n); M3D_MATRIX ret; ret.rows[0] = M3D_V4Select(right_vec, up_norm, M3D_MSelect1110.v); ret.rows[1] = M3D_V4Select(up_vec, right_norm, M3D_MSelect1110.v); ret.rows[2] = M3D_V4Select(look_vec, look_normal, M3D_MSelect1110.v); ret.rows[3] = M3D_MIdentityR3.v; ret = M3D_MTranspose(ret); return ret; } inline M3D_MATRIX M3D_TransformMatrixCamLookToRH(M3D_VECTOR viewPos, M3D_VECTOR viewDirection, M3D_VECTOR upDirection) noexcept { M3D_VECTOR viewDirection_n = M3D_V4Negate(viewDirection); return M3D_TransformMatrixCamLookToLH(viewPos, viewDirection_n, upDirection); } inline M3D_MATRIX M3D_TransformMatrixFrustrumFovLH(float fov, float ratio, float near, float far) noexcept { float SinFov; float CosFov; M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov); float fRange = far / (far - near); float Height = CosFov / SinFov; float Width = Height / ratio; #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = Width; ret.mat[0][1] = 0.0f; ret.mat[0][2] = 0.0f; ret.mat[0][3] = 0.0f; ret.mat[1][0] = 0.0f; ret.mat[1][1] = Height; ret.mat[1][2] = 0.0f; ret.mat[1][3] = 0.0f; ret.mat[2][0] = 0.0f; ret.mat[2][1] = 0.0f; ret.mat[2][2] = fRange; ret.mat[2][3] = 1.0f; ret.mat[3][0] = 0.0f; ret.mat[3][1] = 0.0f; ret.mat[3][2] = -fRange * near; ret.mat[3][3] = 0.0f; return ret; #else M3D_VECTOR rMem = { Width, Height, fRange, -fRange * near }; // Copy from memory to SSE register M3D_VECTOR vValues = rMem; M3D_MATRIX ret; M3D_VECTOR vTemp = _mm_setzero_ps(); vTemp = _mm_move_ss(vTemp, vValues); ret.rows[0] = vTemp; // Width, 0, 0, 0 vTemp = vValues; vTemp = _mm_and_ps(vTemp, M3D_MMaskY); ret.rows[1] = vTemp; // 0, Height, 0, 0 vTemp = _mm_setzero_ps(); vValues = _mm_shuffle_ps(vValues, M3D_MIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); ret.rows[2] = vTemp; // 0, 0, fRange, 1.0f vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); ret.rows[3] = vTemp; // 0, 0, -fRange * near, 0.0f return ret; #endif } inline M3D_MATRIX M3D_TransformMatrixFrustrumFovRH(float fov, float ratio, float near, float far) noexcept { float SinFov; float CosFov; M3D_ScalarSinCos(&SinFov, &CosFov, 0.5f * fov); float fRange = far / (near - far); float Height = CosFov / SinFov; float Width = Height / ratio; #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = Width; ret.mat[0][1] = 0.0f; ret.mat[0][2] = 0.0f; ret.mat[0][3] = 0.0f; ret.mat[1][0] = 0.0f; ret.mat[1][1] = Height; ret.mat[1][2] = 0.0f; ret.mat[1][3] = 0.0f; ret.mat[2][0] = 0.0f; ret.mat[2][1] = 0.0f; ret.mat[2][2] = fRange; ret.mat[2][3] = -1.0f; ret.mat[3][0] = 0.0f; ret.mat[3][1] = 0.0f; ret.mat[3][2] = fRange * near; ret.mat[3][3] = 0.0f; return ret; #else M3D_VECTOR rMem = { Width, Height, fRange, fRange * near }; // Copy from memory to SSE register M3D_VECTOR vValues = rMem; M3D_MATRIX ret; M3D_VECTOR vTemp = _mm_setzero_ps(); vTemp = _mm_move_ss(vTemp, vValues); ret.rows[0] = vTemp; // Height / a_ratio, 0, 0, 0 vTemp = vValues; vTemp = _mm_and_ps(vTemp, M3D_MMaskY); ret.rows[1] = vTemp; // 0, CosFov / SinFov, 0, 0 vTemp = _mm_setzero_ps(); vValues = _mm_shuffle_ps(vValues, M3D_MIdentityR3_n, _MM_SHUFFLE(3, 2, 3, 2)); vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); ret.rows[2] = vTemp; // 0, 0, fRange, -1.0f vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); ret.rows[3] = vTemp; // 0, 0, fRange * near, 0.0f return ret; #endif } inline M3D_MATRIX M3D_TransformMatrixTranslate(M3D_VECTOR Offset) noexcept { #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = 1.0f; ret.mat[0][1] = 0.0f; ret.mat[0][2] = 0.0f; ret.mat[0][3] = 0.0f; ret.mat[1][0] = 0.0f; ret.mat[1][1] = 1.0f; ret.mat[1][2] = 0.0f; ret.mat[1][3] = 0.0f; ret.mat[2][0] = 0.0f; ret.mat[2][1] = 0.0f; ret.mat[2][2] = 1.0f; ret.mat[2][3] = 0.0f; ret.mat[3][0] = Offset.v4f[0]; ret.mat[3][1] = Offset.v4f[1]; ret.mat[3][2] = Offset.v4f[2]; ret.mat[3][3] = 1.0f; return ret; #else M3D_MATRIX ret; ret.rows[0] = M3D_MIdentityR0.v; ret.rows[1] = M3D_MIdentityR1.v; ret.rows[2] = M3D_MIdentityR2.v; ret.rows[3] = M3D_V4Select(M3D_MIdentityR3.v, Offset, M3D_MSelect1110.v); return ret; #endif } inline M3D_MATRIX M3D_TransformMatrixScale(float ScaleX, float ScaleY, float ScaleZ) noexcept { #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = ScaleX; ret.mat[0][1] = 0.0f; ret.mat[0][2] = 0.0f; ret.mat[0][3] = 0.0f; ret.mat[1][0] = 0.0f; ret.mat[1][1] = ScaleY; ret.mat[1][2] = 0.0f; ret.mat[1][3] = 0.0f; ret.mat[2][0] = 0.0f; ret.mat[2][1] = 0.0f; ret.mat[2][2] = ScaleZ; ret.mat[2][3] = 0.0f; ret.mat[3][0] = 0.0f; ret.mat[3][1] = 0.0f; ret.mat[3][2] = 0.0f; ret.mat[3][3] = 1.0f; return ret; #else M3D_MATRIX ret; ret.rows[0] = _mm_set_ps(0, 0, 0, ScaleX); ret.rows[1] = _mm_set_ps(0, 0, ScaleY, 0); ret.rows[2] = _mm_set_ps(0, ScaleZ, 0, 0); ret.rows[3] = M3D_MIdentityR3.v; return ret; #endif } inline M3D_MATRIX M3D_TransformMatrixScale(M3D_VECTOR Scale) noexcept { #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = Scale.v4f[0]; ret.mat[0][1] = 0.0f; ret.mat[0][2] = 0.0f; ret.mat[0][3] = 0.0f; ret.mat[1][0] = 0.0f; ret.mat[1][1] = Scale.v4f[1]; ret.mat[1][2] = 0.0f; ret.mat[1][3] = 0.0f; ret.mat[2][0] = 0.0f; ret.mat[2][1] = 0.0f; ret.mat[2][2] = Scale.v4f[2]; ret.mat[2][3] = 0.0f; ret.mat[3][0] = 0.0f; ret.mat[3][1] = 0.0f; ret.mat[3][2] = 0.0f; ret.mat[3][3] = 1.0f; return ret; #else M3D_MATRIX ret; ret.rows[0] = _mm_and_ps(Scale, M3D_MMaskX); ret.rows[1] = _mm_and_ps(Scale, M3D_MMaskY); ret.rows[2] = _mm_and_ps(Scale, M3D_MMaskZ); ret.rows[3] = M3D_MIdentityR3.v; return ret; #endif } inline M3D_MATRIX M3D_TransformMatrixTranslate(float OffsetX, float OffsetY, float OffsetZ) noexcept { #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = 1.0f; ret.mat[0][1] = 0.0f; ret.mat[0][2] = 0.0f; ret.mat[0][3] = 0.0f; ret.mat[1][0] = 0.0f; ret.mat[1][1] = 1.0f; ret.mat[1][2] = 0.0f; ret.mat[1][3] = 0.0f; ret.mat[2][0] = 0.0f; ret.mat[2][1] = 0.0f; ret.mat[2][2] = 1.0f; ret.mat[2][3] = 0.0f; ret.mat[3][0] = OffsetX; ret.mat[3][1] = OffsetY; ret.mat[3][2] = OffsetZ; ret.mat[3][3] = 1.0f; return ret; #else M3D_MATRIX ret; ret.rows[0] = M3D_MIdentityR0.v; ret.rows[1] = M3D_MIdentityR1.v; ret.rows[2] = M3D_MIdentityR2.v; ret.rows[3] = M3D_V4Set(OffsetX, OffsetY, OffsetZ, 1.f); return ret; #endif } inline M3D_MATRIX M3D_TransformMatrixRotationX(float Angle) noexcept { float SinAngle; float CosAngle; M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle); #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = 1.0f; ret.mat[0][1] = 0.0f; ret.mat[0][2] = 0.0f; ret.mat[0][3] = 0.0f; ret.mat[1][0] = 0.0f; ret.mat[1][1] = CosAngle; ret.mat[1][2] = SinAngle; ret.mat[1][3] = 0.0f; ret.mat[2][0] = 0.0f; ret.mat[2][1] = -SinAngle; ret.mat[2][2] = CosAngle; ret.mat[2][3] = 0.0f; ret.mat[3][0] = 0.0f; ret.mat[3][1] = 0.0f; ret.mat[3][2] = 0.0f; ret.mat[3][3] = 1.0f; return ret; #else M3D_VECTOR vSin = _mm_set_ss(SinAngle); M3D_VECTOR vCos = _mm_set_ss(CosAngle); // x = 0,y = cos,z = sin, w = 0 vCos = _mm_shuffle_ps(vCos, vSin, _MM_SHUFFLE(3, 0, 0, 3)); M3D_MATRIX ret; ret.rows[0] = M3D_MIdentityR0; ret.rows[1] = vCos; // x = 0,y = sin,z = cos, w = 0 vCos = M3D_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 1, 2, 0)); // x = 0,y = -sin,z = cos, w = 0 vCos = _mm_mul_ps(vCos, M3D_MNegateY); ret.rows[2] = vCos; ret.rows[3] = M3D_MIdentityR3; return ret; #endif } inline M3D_MATRIX M3D_TransformMatrixRotationY(float Angle) noexcept { float SinAngle; float CosAngle; M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle); #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = CosAngle; ret.mat[0][1] = 0.0f; ret.mat[0][2] = -SinAngle; ret.mat[0][3] = 0.0f; ret.mat[1][0] = 0.0f; ret.mat[1][1] = 1.0f; ret.mat[1][2] = 0.0f; ret.mat[1][3] = 0.0f; ret.mat[2][0] = SinAngle; ret.mat[2][1] = 0.0f; ret.mat[2][2] = CosAngle; ret.mat[2][3] = 0.0f; ret.mat[3][0] = 0.0f; ret.mat[3][1] = 0.0f; ret.mat[3][2] = 0.0f; ret.mat[3][3] = 1.0f; return ret; #else M3D_VECTOR vSin = _mm_set_ss(SinAngle); M3D_VECTOR vCos = _mm_set_ss(CosAngle); // x = sin,y = 0,z = cos, w = 0 vSin = _mm_shuffle_ps(vSin, vCos, _MM_SHUFFLE(3, 0, 3, 0)); M3D_MATRIX ret; ret.rows[2] = vSin; ret.rows[1] = M3D_MIdentityR1; // x = cos,y = 0,z = sin, w = 0 vSin = M3D_PERMUTE_PS(vSin, _MM_SHUFFLE(3, 0, 1, 2)); // x = cos,y = 0,z = -sin, w = 0 vSin = _mm_mul_ps(vSin, M3D_MNegateZ); ret.rows[0] = vSin; ret.rows[3] = M3D_MIdentityR3; return ret; #endif } inline M3D_MATRIX M3D_TransformMatrixRotationZ(float Angle) noexcept { float SinAngle; float CosAngle; M3D_ScalarSinCos(&SinAngle, &CosAngle, Angle); #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = CosAngle; ret.mat[0][1] = SinAngle; ret.mat[0][2] = 0.0f; ret.mat[0][3] = 0.0f; ret.mat[1][0] = -SinAngle; ret.mat[1][1] = CosAngle; ret.mat[1][2] = 0.0f; ret.mat[1][3] = 0.0f; ret.mat[2][0] = 0.0f; ret.mat[2][1] = 0.0f; ret.mat[2][2] = 1.0f; ret.mat[2][3] = 0.0f; ret.mat[3][0] = 0.0f; ret.mat[3][1] = 0.0f; ret.mat[3][2] = 0.0f; ret.mat[3][3] = 1.0f; return ret; #else M3D_VECTOR vSin = _mm_set_ss(SinAngle); M3D_VECTOR vCos = _mm_set_ss(CosAngle); // x = cos,y = sin,z = 0, w = 0 vCos = _mm_unpacklo_ps(vCos, vSin); M3D_MATRIX ret; ret.rows[0] = vCos; // x = sin,y = cos,z = 0, w = 0 vCos = M3D_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 2, 0, 1)); // x = cos,y = -sin,z = 0, w = 0 vCos = _mm_mul_ps(vCos, M3D_MNegateX); ret.rows[1] = vCos; ret.rows[2] = M3D_MIdentityR2; ret.rows[3] = M3D_MIdentityR3; return ret; #endif } inline M3D_MATRIX M3D_TransformMatrixRotation(M3D_VECTOR Angles) noexcept { #ifdef DISABLE_INTRINSICS float cp = cosf(Angles.v4f[0]); float sp = sinf(Angles.v4f[0]); float cy = cosf(Angles.v4f[1]); float sy = sinf(Angles.v4f[1]); float cr = cosf(Angles.v4f[2]); float sr = sinf(Angles.v4f[2]); M3D_MATRIX ret; ret.mat[0][0] = cr * cy + sr * sp * sy; ret.mat[0][1] = sr * cp; ret.mat[0][2] = sr * sp * cy - cr * sy; ret.mat[0][3] = 0.0f; ret.mat[1][0] = cr * sp * sy - sr * cy; ret.mat[1][1] = cr * cp; ret.mat[1][2] = sr * sy + cr * sp * cy; ret.mat[1][3] = 0.0f; ret.mat[2][0] = cp * sy; ret.mat[2][1] = -sp; ret.mat[2][2] = cp * cy; ret.mat[2][3] = 0.0f; ret.mat[3][0] = 0.0f; ret.mat[3][1] = 0.0f; ret.mat[3][2] = 0.0f; ret.mat[3][3] = 1.0f; return ret; #else static const M3D_V4F32 Sign = {{{1.0f, -1.0f, -1.0f, 1.0f}}}; M3D_VECTOR SinAngles, CosAngles; M3D_V4SinCos(&SinAngles, &CosAngles, Angles); M3D_VECTOR P0 = M3D_V4Permute(SinAngles, CosAngles); M3D_VECTOR Y0 = M3D_V4Permute(SinAngles, CosAngles); M3D_VECTOR P1 = M3D_V4Permute(SinAngles, CosAngles); M3D_VECTOR Y1 = M3D_V4Permute(SinAngles, CosAngles); M3D_VECTOR P2 = M3D_V4Permute(SinAngles, CosAngles); M3D_VECTOR P3 = M3D_V4Permute(SinAngles, CosAngles); M3D_VECTOR Y2 = M3D_V4SplatX(SinAngles); M3D_VECTOR NS = M3D_V4Negate(SinAngles); M3D_VECTOR Q0 = M3D_V4Multiply(P0, Y0); M3D_VECTOR Q1 = M3D_V4Multiply(P1, Sign.v); Q1 = M3D_V4Multiply(Q1, Y1); M3D_VECTOR Q2 = M3D_V4Multiply(P2, Y2); Q2 = M3D_V4MultiplyAdd(Q2, P3, Q1); M3D_VECTOR V0 = M3D_V4Permute(Q0, Q2); M3D_VECTOR V1 = M3D_V4Permute(Q0, Q2); M3D_VECTOR V2 = M3D_V4Permute(Q0, NS); M3D_MATRIX ret; ret.rows[0] = M3D_V4Select(M3D_MZero, V0, M3D_MSelect1110.v); ret.rows[1] = M3D_V4Select(M3D_MZero, V1, M3D_MSelect1110.v); ret.rows[2] = M3D_V4Select(M3D_MZero, V2, M3D_MSelect1110.v); ret.rows[3] = M3D_MIdentityR3; return ret; #endif } //TODO: transform matrix is incomplete //v_tri[v_cnt].position.z = ((far+near)/2)+((far-near)/2)*_2dCoord.z; inline M3D_MATRIX M3D_TransformMatrixViewport(float _w, float _h, float _wOffset, float _hOffset) noexcept { const float widthDiv2 = _w / 2; const float heightDiv2 = _h / 2; #ifdef DISABLE_INTRINSICS M3D_MATRIX ret; ret.mat[0][0] = widthDiv2; ret.mat[0][1] = 0.0f; ret.mat[0][2] = 0.0f; ret.mat[0][3] = 0.0f; ret.mat[1][0] = 0.0f; ret.mat[1][1] = -heightDiv2; ret.mat[1][2] = 0.0f; ret.mat[1][3] = 0.0f; ret.mat[2][0] = 0.0f; ret.mat[2][1] = 0.0f; ret.mat[2][2] = 1.0f; // maxZ-minZ ignored ret.mat[2][3] = 0.0f; // minZ ignored ret.mat[3][0] = _wOffset + widthDiv2; ret.mat[3][1] = _hOffset + heightDiv2; ret.mat[3][2] = 0.0f; ret.mat[3][3] = 1.0f; return ret; #else M3D_MATRIX ret; ret.rows[0] = M3D_V4Set(widthDiv2, 0, 0, 0); ret.rows[1] = M3D_V4Set(0, -heightDiv2, 0, 0); ret.rows[2] = M3D_MIdentityR2.v; // maxZ-minZ and minZ are ignored ret.rows[3] = M3D_V4Set(_wOffset + widthDiv2, _hOffset + heightDiv2, 0, 1); return ret; #endif }