Graphic pipeline rework

- Prepare flow for multithreading support
- Optimize memory access of vertices
- More efficient back-face culling computation
- Added option to disable gprof in CMake
This commit is contained in:
JackCarterSmith 2025-01-10 13:59:48 +01:00
parent ea712fae42
commit 4f48b76aaf
Signed by: JackCarterSmith
GPG Key ID: 832E52F4E23F8F24
6 changed files with 314 additions and 214 deletions

View File

@ -25,6 +25,7 @@ project(ProtoTank VERSION 0.1.0 DESCRIPTION "Arcade 80s-style game with tanks" L
# Compilation option
option(DISABLE_CPU_OPTI "Disable CPU optimizations" OFF)
option(ENABLE_PROFILER "Enable gprof integration" ON)
if(NOT DISABLE_CPU_OPTI)
if(NOT MSVC)
@ -97,8 +98,10 @@ if(MSVC)
set_target_properties(${PROJECT_NAME} PROPERTIES IMPORT_PREFIX "lib")
else()
# GCC profiler options
if(ENABLE_PROFILER)
list(APPEND COMPOPTS -pg -ggdb3 -no-pie)
list(APPEND LINKOPTS -pg -ggdb3 -no-pie)
endif()
# static linking of stdlib
if(MINGW)

View File

@ -9,11 +9,17 @@
//#define DISABLE_AABB_CLIPPING
//#define DISABLE_TRIANGLE_CLIPPING
#define DISABLE_WIREFRAME_MODE
//#define DISABLE_WIREFRAME_MODE
// Rendering pipeline:
// model matrix (Object SRT) -> view matrix (camera matrix inverted) -> proj matrix -> clipping -> perspective divide -> viewport transformation -> Rasterizer (draw pixels inside projected triangles on 2D screen)
// model matrix (Object SRT) -> view matrix (camera matrix inverted) -> proj matrix -> clipping -> perspective divide
// -> viewport transformation -> Rasterizer (draw pixels inside projected triangles on 2D screen)
// Revised rendering pipeline:
// AABB clipping -> model transform matrix (Object SRT) -> view matrix (camera matrix inverted) -> proj matrix
// -> faces culling -> triangles clipping -> perspective divide -> viewport transformation -> Rasterizer (draw pixels inside projected triangles on 2D screen)
//
// Virtual space transformations order:
// object space -> world space -> camera space -> homogeneous clip space -> NDC space -> raster space
//
// Rasterizer inputs elements:
@ -31,7 +37,16 @@
// * https://en.wikipedia.org/wiki/Hidden-surface_determination#Occlusion_culling
// * https://en.wikipedia.org/wiki/Bounding_volume_hierarchy
static bool VertexClipTest(M3D_F4& V, sf::Vector2f& RTsize, float gb_factor);
struct RenderItem final {
const WorldObject* pObj = nullptr;
const M3D_ContainmentType frustrumClipType = CONTAINS;
RenderItem() = delete;
RenderItem(const WorldObject* pObj) : pObj(pObj) {}
RenderItem(const WorldObject* pObj, const M3D_ContainmentType cType) : pObj(pObj), frustrumClipType(cType) {}
};
static bool VertexClipTest(M3D_F4* V, sf::Vector2f& RTsize);
Graphic3DRenderer::Graphic3DRenderer() {
if (mMainCamera == nullptr) {
@ -42,24 +57,24 @@ Graphic3DRenderer::Graphic3DRenderer() {
mMainCamera->UpdateCamView();
// Fill world object list to render
mRenderList.clear();
mRenderList.push_back(std::make_shared<ObjectDbgCube>());
mRenderList.back()->SetPosition(0.f, 0.f, 15.f);
mRenderList.back()->SetScale(2.0f);
mRenderList.push_back(std::make_shared<ObjectDbgCube>());
mRenderList.back()->SetPosition(6.f, 2.f, 2.f);
mRenderList.back()->SetScale(2.0f);
mRenderList.push_back(std::make_shared<ObjectDbgCube>());
mRenderList.back()->SetPosition(-8.f, 5.f, 10.f);
mRenderList.back()->SetScale(2.0f);
mRenderList.push_back(std::make_shared<Tank>());
mRenderList.back()->SetPosition(0.f, 0.f, 0.f);
mRenderList.back()->SetScale(5.0f);
mWorldObjsList.clear();
mWorldObjsList.push_back(std::make_shared<ObjectDbgCube>());
mWorldObjsList.back()->SetPosition(0.f, 0.f, 15.f);
mWorldObjsList.back()->SetScale(2.0f);
mWorldObjsList.push_back(std::make_shared<ObjectDbgCube>());
mWorldObjsList.back()->SetPosition(6.f, 2.f, 2.f);
mWorldObjsList.back()->SetScale(2.0f);
mWorldObjsList.push_back(std::make_shared<ObjectDbgCube>());
mWorldObjsList.back()->SetPosition(-8.f, 5.f, 10.f);
mWorldObjsList.back()->SetScale(2.0f);
mWorldObjsList.push_back(std::make_shared<Tank>());
mWorldObjsList.back()->SetPosition(0.f, 0.f, 0.f);
mWorldObjsList.back()->SetScale(5.0f);
for (size_t i = 0; i < 40; i++) {
mRenderList.push_back(std::make_shared<Tank>());
mRenderList.back()->SetPosition(-100.f + (i * 5.f), 0.f, 8.f);
mRenderList.back()->SetScale(5.0f);
mWorldObjsList.push_back(std::make_shared<Tank>());
mWorldObjsList.back()->SetPosition(-100.f + (i * 5.f), 0.f, 8.f);
mWorldObjsList.back()->SetScale(5.0f);
}
}
@ -100,9 +115,6 @@ void Graphic3DRenderer::UpdateCamera(CAMERA_MOVE type, const float value) {
}
void Graphic3DRenderer::Draw(sf::RenderTexture& context) {
sf::BlendMode sBM = sf::BlendNone;
sf::RenderStates sRS(sBM);
#ifdef DEBUG
drawnTriCount = 0;
#endif
@ -110,149 +122,8 @@ void Graphic3DRenderer::Draw(sf::RenderTexture& context) {
// Hardcoded debug movement, TODO: remove it
UpdateInternalTestObjects();
// Load main matrices
M3D_MATRIX viewMat = mMainCamera->GetView();
M3D_MATRIX invViewMat = M3D_MInverse(viewMat); // aka. camMat
M3D_MATRIX projMat = mMainCamera->GetProj();
M3D_MATRIX viewProjMat = viewMat * projMat;
// Create the frustrum "box"
M3D_BoundingFrustum camFrustrum(projMat, false);
camFrustrum.Transform(camFrustrum, invViewMat);
const float sgRatio = ComputeSGRatio();
// -= Draw the sky =-
// To avoid unfilled pixels on screen, the "sky-plane" will be rendered
// all over the screen.
// It's will be useless to use and compute a specific rectangle from the
// size of the screen!
// The sky have an infinite z-depth (any objects will be rendered over).
#ifdef DISABLE_WIREFRAME_MODE
context.clear(SF_COLOR_4CHEX(0x00B5E2FF));
#endif
// -= Draw the ground =-
// A simple rectangle shape is used to draw the ground over the sky-plane.
// The ground is draw after the sky, and before any other object.
// Depending of the camera pitch, the ratio sky/ground on screen vary.
// Like the sky, the ground have an infinite z-depth (any objects will
// be rendered over).
#ifdef DISABLE_WIREFRAME_MODE
sf::RectangleShape gndRect;
if (mMainCamera->GetPos3f().y >= 0) {
gndRect.setSize(sf::Vector2f(mRTSize.x, mRTSize.y * sgRatio));
gndRect.setPosition(sf::Vector2f(0, mRTSize.y * (1.f - sgRatio) - 1));
} else {
gndRect.setSize(sf::Vector2f(mRTSize.x, mRTSize.y * (1.f - sgRatio)));
gndRect.setPosition(sf::Vector2f(0, 0));
}
gndRect.setFillColor(SF_COLOR_4CHEX(0x009A17FF));
//gndRect.setFillColor(SF_COLOR_4CHEX(0xD5C2A5FF));
context.draw(gndRect, sRS);
#else
sf::Vertex gndLine[2];
gndLine[0].position = sf::Vector2f(0, mRTSize.y * (1.f - sgRatio));
gndLine[0].color = sf::Color::White;
gndLine[1].position = sf::Vector2f(mRTSize.x - 1, mRTSize.y * (1.f - sgRatio));
gndLine[1].color = sf::Color::White;
context.draw(gndLine, 2, sf::Lines);
#endif
// Process scene's objects
size_t prevVCount = 0;
std::vector<M3D_F4> projVertices;
for (auto& obj : mRenderList) {
M3D_BoundingBox projAABB = obj->GetAABB();
auto oTMat = obj->GetTransform();
// Object outside frustrum clipping
projAABB.Transform(projAABB, oTMat);
M3D_ContainmentType objInFrustrum = camFrustrum.Contains(projAABB);
#ifndef DISABLE_AABB_CLIPPING
if (objInFrustrum != DISJOINT)
#endif
{
size_t vCount = obj->GetObjectVerticesCount();
auto& oMesh = obj->GetObjectMesh();
if (vCount > prevVCount)
projVertices.resize(vCount);
// Vertices homogeneous clip space transformation
M3D_V3Transform(
projVertices.data(), sizeof(M3D_F4),
reinterpret_cast<const M3D_F3*>(oMesh.vertices.data()), sizeof(Vertex),
vCount,
oTMat * viewProjMat
);
// Draw the object indice triangles if visible or partially clipped
sf::Vertex v_tri[4];
for (auto& objPt : oMesh.parts) {
auto indicePtr = static_cast<const uint32_t*>(objPt.indices.data());
for (uint32_t i = 0; i < objPt.GetIndicesCount(); i += 3) {
// Misscontructed indices tree failsafe
if (i+2 > objPt.GetIndicesCount())
break;
// Triangle clipping
#ifndef DISABLE_TRIANGLE_CLIPPING
//TODO: scissor/clipping depending of how many vertices are outside/inside the clipspace, implement complete Cohen-Sutherland algo or CyrusBeck one
if (VertexClipTest(projVertices.at(indicePtr[i]), mRTSize, 2.5f) &&
VertexClipTest(projVertices.at(indicePtr[i+1]), mRTSize, 2.5f) &&
VertexClipTest(projVertices.at(indicePtr[i+2]), mRTSize, 2.5f))
#endif
{
M3D_VECTOR V1 = M3D_V4LoadF4(&projVertices.at(indicePtr[i]));
M3D_VECTOR V2 = M3D_V4LoadF4(&projVertices.at(indicePtr[i+1]));
M3D_VECTOR V3 = M3D_V4LoadF4(&projVertices.at(indicePtr[i+2]));
// Do the perspective divide
V1 = M3D_V4Divide(V1, M3D_V4SplatW(V1));
V2 = M3D_V4Divide(V2, M3D_V4SplatW(V2));
V3 = M3D_V4Divide(V3, M3D_V4SplatW(V3));
V1 = M3D_V3TransformNDCToViewport(V1, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
V2 = M3D_V3TransformNDCToViewport(V2, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
V3 = M3D_V3TransformNDCToViewport(V3, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
// Face culling
if (M3D_V4GetX(M3D_TNormal(V1,V2,V3))*0.5f <= 0) {
if (objInFrustrum == DISJOINT) {
v_tri[0].color = sf::Color::Red;
v_tri[1].color = sf::Color::Red;
v_tri[2].color = sf::Color::Red;
} else if (objInFrustrum == INTERSECTS) {
v_tri[0].color = sf::Color::Yellow;
v_tri[1].color = sf::Color::Yellow;
v_tri[2].color = sf::Color::Yellow;
} else {
v_tri[0].color = oMesh.vertices[indicePtr[i]].color;
v_tri[1].color = oMesh.vertices[indicePtr[i+1]].color;
v_tri[2].color = oMesh.vertices[indicePtr[i+2]].color;
}
v_tri[0].position = sf::Vector2f(M3D_V4GetX(V1), M3D_V4GetY(V1));
v_tri[1].position = sf::Vector2f(M3D_V4GetX(V2), M3D_V4GetY(V2));
v_tri[2].position = sf::Vector2f(M3D_V4GetX(V3), M3D_V4GetY(V3));
v_tri[3] = v_tri[0];
#ifdef DISABLE_WIREFRAME_MODE
context.draw(v_tri, 4, sf::Triangles, sRS);
#else
context.draw(v_tri, 4, sf::LineStrip, sRS);
#endif
#ifdef DEBUG
drawnTriCount++;
#endif
}
}
}
}
}
prevVCount = prevVCount;
}
DrawBackground(context);
DrawSceneObjects(context);
}
void Graphic3DRenderer::UpdateInternalTestObjects() {
@ -262,10 +133,10 @@ void Graphic3DRenderer::UpdateInternalTestObjects() {
thetaAngle2 = thetaAngle2 >= 6.283185f ? -6.283185f : thetaAngle2 + 0.005f;
static float thetaAngle3 = -4.78f;
thetaAngle3 = thetaAngle3 >= 6.283185f ? -6.283185f : thetaAngle3 + 0.008f;
mRenderList[0]->SetRotation(thetaAngle, 0.f, thetaAngle * 0.5f);
mRenderList[1]->SetRotation(thetaAngle2, 0.f, thetaAngle2 * 0.5f);
mRenderList[2]->SetRotation(thetaAngle3, 0.f, thetaAngle3 * 0.5f);
mRenderList[3]->SetRotation(0.f, thetaAngle, 0.f);
mWorldObjsList[0]->SetRotation(thetaAngle, 0.f, thetaAngle * 0.5f);
mWorldObjsList[1]->SetRotation(thetaAngle2, 0.f, thetaAngle2 * 0.5f);
mWorldObjsList[2]->SetRotation(thetaAngle3, 0.f, thetaAngle3 * 0.5f);
mWorldObjsList[3]->SetRotation(0.f, thetaAngle, 0.f);
}
// Compute the screen ratio between the ground and the sky (aka. Line of Horizon)
@ -314,9 +185,180 @@ float Graphic3DRenderer::ComputeSGRatio() {
return sgRatio;
}
inline static bool VertexClipTest(M3D_F4& V, sf::Vector2f& RTsize, float gb_factor) {
// Draws the scene backdrop (sky and ground) into the given render texture.
//
// With DISABLE_WIREFRAME_MODE defined, the target is cleared with the sky
// color and a filled rectangle is drawn for the ground. Otherwise only a
// white horizon line is drawn.
// NOTE(review): in wireframe mode this function does not clear the target --
// presumably the caller does; confirm.
void Graphic3DRenderer::DrawBackground(sf::RenderTexture& context) {
    const sf::RenderStates renderStates(sf::BlendNone);

    // Fraction of the screen height assigned to the ground by ComputeSGRatio().
    const float sgRatio = ComputeSGRatio();
    // Remaining screen height (the sky part), in render-target units.
    const float horizonY = mRTSize.y * (1.f - sgRatio);

#ifdef DISABLE_WIREFRAME_MODE
    // -= Sky =-
    // The sky is painted by clearing the whole target: no rectangle has to be
    // computed, no pixel is left unfilled, and everything drawn afterwards
    // ends up on top of it.
    context.clear(SF_COLOR_4CHEX(0x00B5E2FF));

    // -= Ground =-
    // A plain rectangle drawn over the sky and before any object. Its size
    // depends on the sky/ground ratio, and its placement on the sign of the
    // camera height.
    sf::RectangleShape ground;
    if (mMainCamera->GetPos3f().y >= 0) {
        // Camera at or above y = 0: rectangle starts one unit above the horizon.
        ground.setSize(sf::Vector2f(mRTSize.x, mRTSize.y * sgRatio));
        ground.setPosition(sf::Vector2f(0, horizonY - 1));
    } else {
        // Camera below y = 0: rectangle starts at the top of the target.
        ground.setSize(sf::Vector2f(mRTSize.x, horizonY));
        ground.setPosition(sf::Vector2f(0, 0));
    }
    ground.setFillColor(SF_COLOR_4CHEX(0x009A17FF));
    //ground.setFillColor(SF_COLOR_4CHEX(0xD5C2A5FF));
    context.draw(ground, renderStates);
#else
    // Wireframe mode: only the horizon is shown, as a single white line.
    sf::Vertex horizon[2];
    horizon[0].position = sf::Vector2f(0, horizonY);
    horizon[0].color = sf::Color::White;
    horizon[1].position = sf::Vector2f(mRTSize.x - 1, horizonY);
    horizon[1].color = sf::Color::White;
    context.draw(horizon, 2, sf::Lines, renderStates);
#endif
}
// Draws the world objects into the given render texture.
//
// Steps:
//   1. Build the list of objects to render. Unless DISABLE_AABB_CLIPPING is
//      defined, an object is kept only when its transformed AABB is not
//      DISJOINT from the camera frustum.
//   2. Transform all vertices of each kept object with
//      (object transform * view * projection) in a single batch.
//   3. For every index triple: clip test, perspective divide, viewport
//      mapping, 2D back-face culling, then draw as a filled triangle or, in
//      wireframe mode, as a closed line strip.
void Graphic3DRenderer::DrawSceneObjects(sf::RenderTexture& context) {
    sf::BlendMode sBM = sf::BlendNone;
    sf::RenderStates sRS(sBM);

    // Get global (camera and projection) matrices
    M3D_MATRIX viewMat = mMainCamera->GetView();
    M3D_MATRIX invViewMat = M3D_MInverse(viewMat); // aka. camera matrix
    M3D_MATRIX projMat = mMainCamera->GetProj();
    M3D_MATRIX viewProjMat = viewMat * projMat;

    std::vector<RenderItem> renderingList;
    renderingList.reserve(mWorldObjsList.size());

    // Frustum built from the projection, then transformed by the inverse view
    // matrix so it can be tested against the objects' transformed AABBs.
    M3D_BoundingFrustum camFrustrum(projMat, false);
    camFrustrum.Transform(camFrustrum, invViewMat);

    for (const auto& obj : mWorldObjsList) {
#ifndef DISABLE_AABB_CLIPPING
        // Objects visibility AABB test
        M3D_BoundingBox projAABB = obj->GetAABB();
        projAABB.Transform(projAABB, obj->GetTransform());

        // Do the camera/AABB test and keep everything that is not fully outside
        const M3D_ContainmentType aabbTestResult = camFrustrum.Contains(projAABB);
        if (aabbTestResult != DISJOINT)
            renderingList.emplace_back(obj.get(), aabbTestResult);
#else
        renderingList.emplace_back(obj.get());
#endif
    }

    // Output buffer shared by all objects; it only ever grows.
    std::vector<M3D_F4> projVertices;

    // NOTE(review): VertexClipTest compares clip-space x/y against
    // guardband * w, and guardband is the render-target size times 3.5.
    // Confirm this scale is intended; it makes the x/y test very permissive.
    sf::Vector2f guardband = mRTSize * 3.5f;

    for (auto& ri : renderingList) {
        const size_t vCount = ri.pObj->GetObjectVerticesCount();

        // Resize the output buffer only if this object has more vertices than it can hold
        if (vCount > projVertices.size())
            projVertices.resize(vCount);

        auto& oMesh = ri.pObj->GetObjectMesh();

        // Vertices transformation to homogeneous clip space (before the perspective divide)
        M3D_V3Transform(
            projVertices.data(), sizeof(M3D_F4),
            reinterpret_cast<const M3D_F3*>(oMesh.vertices.data()), sizeof(Vertex),
            vCount,
            ri.pObj->GetTransform() * viewProjMat
        );

        // Look into triangles indices
        M3D_F4* triVertices[3];
        sf::Vertex drawPoints[4];
        for (auto& objPt : oMesh.parts) {
            auto indicePtr = static_cast<const uint32_t*>(objPt.indices.data());
            const auto indicesCount = objPt.GetIndicesCount();

            // The bound requires a complete triple (i, i+1, i+2), so a trailing
            // partial triangle in a malformed index list is never read.
            for (uint32_t i = 0; i + 2 < indicesCount; i += 3) {
                // Indices failsafe - an index outside the vertex buffer aborts
                // the rendering of the remaining triangles of this part.
                if (indicePtr[i] >= vCount || indicePtr[i+1] >= vCount || indicePtr[i+2] >= vCount) {
                    //log.PrintWarning()
                    break;
                }

                // Retrieve the vertices pointer from indices list
                triVertices[0] = &projVertices[indicePtr[i]];
                triVertices[1] = &projVertices[indicePtr[i+1]];
                triVertices[2] = &projVertices[indicePtr[i+2]];

                // Triangle frustrum clipping: the triangle is dropped as soon as
                // one of its vertices fails the test.
                // TODO: Use complete Cohen-Sutherland algo or CyrusBeck one
#ifndef DISABLE_TRIANGLE_CLIPPING
                if (VertexClipTest(triVertices[0], guardband) && VertexClipTest(triVertices[1], guardband) && VertexClipTest(triVertices[2], guardband))
#endif
                {
                    M3D_VECTOR V1 = M3D_V4LoadF4(triVertices[0]);
                    M3D_VECTOR V2 = M3D_V4LoadF4(triVertices[1]);
                    M3D_VECTOR V3 = M3D_V4LoadF4(triVertices[2]);

                    // Back-face culling in LH-z system
                    // (front when (triangle normal . projected vertice) < 0)
                    //if (M3D_V4GetX(M3D_V3Dot(M3D_Tri3DNormal(V1,V2,V3), V1)) < 0)
                    // NOT USED - Too heavy computation resources usage

                    // Do the perspective divide
                    V1 = M3D_V4Divide(V1, M3D_V4SplatW(V1));
                    V2 = M3D_V4Divide(V2, M3D_V4SplatW(V2));
                    V3 = M3D_V4Divide(V3, M3D_V4SplatW(V3));

                    // Finally project from NDC to the screen
                    V1 = M3D_V3TransformNDCToViewport(V1, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
                    V2 = M3D_V3TransformNDCToViewport(V2, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);
                    V3 = M3D_V3TransformNDCToViewport(V3, 0.f, 0.f, mRTSize.x, mRTSize.y, 1.f, 100.f);

                    // Simplified back-face culling on 2D viewport triangle
                    if (M3D_V4GetX(M3D_Tri2DNormal(V1,V2,V3)) > 0) {
                        // Set pixels color depending of frustrum clipping type - debug purpose
                        if (ri.frustrumClipType == DISJOINT) {
                            drawPoints[0].color = drawPoints[1].color = drawPoints[2].color = sf::Color::Red;
                        } else if (ri.frustrumClipType == INTERSECTS) {
                            drawPoints[0].color = drawPoints[1].color = drawPoints[2].color = sf::Color::Yellow;
                        } else {
                            drawPoints[0].color = oMesh.vertices[indicePtr[i]].color;
                            drawPoints[1].color = oMesh.vertices[indicePtr[i+1]].color;
                            drawPoints[2].color = oMesh.vertices[indicePtr[i+2]].color;
                        }

                        drawPoints[0].position = sf::Vector2f(M3D_V4GetX(V1), M3D_V4GetY(V1));
                        drawPoints[1].position = sf::Vector2f(M3D_V4GetX(V2), M3D_V4GetY(V2));
                        drawPoints[2].position = sf::Vector2f(M3D_V4GetX(V3), M3D_V4GetY(V3));

#ifdef DISABLE_WIREFRAME_MODE
                        // A filled triangle needs exactly its three vertices
                        context.draw(drawPoints, 3, sf::Triangles, sRS);
#else
                        // Repeat the first vertex to close the outline
                        drawPoints[3] = drawPoints[0];
                        context.draw(drawPoints, 4, sf::LineStrip, sRS);
#endif

#ifdef DEBUG
                        drawnTriCount++;
#endif
                    }
                }
            }
        }
    }
}
__attribute__((always_inline)) inline static bool VertexClipTest(M3D_F4* V, sf::Vector2f& RTsize) {
// Guard band are usually 2-3x the viewport size for the clipping test
return (V.x > -RTsize.x*gb_factor*V.w && V.x < RTsize.y*gb_factor*V.w &&
V.y > -RTsize.x*gb_factor*V.w && V.y < RTsize.y*gb_factor*V.w
return (V->z >= 0 && V->z <= V->w &&
V->x >= -RTsize.x*V->w && V->x <= RTsize.x*V->w &&
V->y >= -RTsize.y*V->w && V->y <= RTsize.y*V->w
);
}

View File

@ -33,22 +33,21 @@ public:
void Draw(sf::RenderTexture& context);
// Debug datas
#ifdef DEBUG
const unsigned int GetDrawTriCount() const noexcept { return drawnTriCount; }
#endif
private:
std::unique_ptr<Camera> mMainCamera; // Default player view
sf::Vector2f mRTSize;
std::vector<std::shared_ptr<WorldObject>> mRenderList; // List of elements to be rendered next frame
std::vector<std::shared_ptr<WorldObject>> mWorldObjsList; // List of elements to be rendered next frame
void UpdateInternalTestObjects();
float ComputeSGRatio();
void DrawBackground(sf::RenderTexture& context);
void DrawSceneObjects(sf::RenderTexture& context);
// Debug datas
#ifdef DEBUG
unsigned int drawnTriCount;
#endif
unsigned int drawnTriCount = 0;
};

View File

@ -350,6 +350,10 @@ M3D_ALIGNED_STRUCT(16) M3D_MATRIX {
//
// Load/Store functions
//
M3D_VECTOR M3D_V4LoadF2(const M3D_F2* src) noexcept;
M3D_VECTOR M3D_V4LoadF2A(const M3D_F2A* src) noexcept;
void M3D_V4StoreF2(M3D_F2* dst, M3D_VECTOR V) noexcept;
void M3D_V4StoreF2A(M3D_F2A* dst, M3D_VECTOR V) noexcept;
M3D_VECTOR M3D_V4LoadF3(const M3D_F3* src) noexcept;
M3D_VECTOR M3D_V4LoadF3A(const M3D_F3A* src) noexcept;
void M3D_V4StoreF3(M3D_F3* dst, M3D_VECTOR V) noexcept;
@ -542,7 +546,8 @@ inline M3D_VECTOR M3D_V4Swizzle(M3D_VECTOR V) noexcept {
M3D_VECTOR M3D_QMultiply(M3D_VECTOR Q1, M3D_VECTOR Q2) noexcept;
M3D_VECTOR M3D_QConjugate(M3D_VECTOR Q) noexcept;
M3D_VECTOR M3D_TNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexcept;
M3D_VECTOR M3D_Tri2DNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexcept;
M3D_VECTOR M3D_Tri3DNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexcept;
void M3D_V4SinCos(M3D_VECTOR* pSin, M3D_VECTOR* pCos, M3D_VECTOR V) noexcept;

View File

@ -25,6 +25,50 @@ namespace M3D_Internal {
/* -------------------------------------------------------------------------------------------------------------------------- */
// Loads the x and y members of *src into the first two lanes of a vector.
// The two remaining lanes are set to zero.
inline M3D_VECTOR M3D_V4LoadF2(const M3D_F2* src) noexcept {
#ifndef DISABLE_INTRINSICS
    // Both floats are fetched with one 64-bit scalar load; _mm_load_sd zeroes
    // the upper half of the register.
    const __m128d packedXY = _mm_load_sd(reinterpret_cast<const double*>(src));
    return _mm_castpd_ps(packedXY);
#else
    M3D_VECTOR out;
    out.v4f[0] = src->x;
    out.v4f[1] = src->y;
    out.v4f[2] = 0.f;
    out.v4f[3] = 0.f;
    return out;
#endif
}
// Loads the x and y members of *src into the first two lanes of a vector.
// The two remaining lanes are set to zero.
// Takes an M3D_F2A (presumably the aligned flavor of M3D_F2 -- confirm); the
// load itself does not rely on any particular alignment.
inline M3D_VECTOR M3D_V4LoadF2A(const M3D_F2A* src) noexcept {
#ifndef DISABLE_INTRINSICS
    // Both floats are fetched with one 64-bit scalar load; _mm_load_sd zeroes
    // the upper half of the register.
    const __m128d packedXY = _mm_load_sd(reinterpret_cast<const double*>(src));
    return _mm_castpd_ps(packedXY);
#else
    M3D_VECTOR out;
    out.v4f[0] = src->x;
    out.v4f[1] = src->y;
    out.v4f[2] = 0.f;
    out.v4f[3] = 0.f;
    return out;
#endif
}
// Writes the first two lanes of V into dst->x and dst->y.
inline void M3D_V4StoreF2(M3D_F2* dst, M3D_VECTOR V) noexcept {
#ifndef DISABLE_INTRINSICS
    // The low 64 bits of the register hold both floats: store them at once.
    const __m128d lowPair = _mm_castps_pd(V);
    _mm_store_sd(reinterpret_cast<double*>(dst), lowPair);
#else
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
#endif
}
// Writes the first two lanes of V into dst->x and dst->y.
// Takes an M3D_F2A (presumably the aligned flavor of M3D_F2 -- confirm); the
// store itself does not rely on any particular alignment.
inline void M3D_V4StoreF2A(M3D_F2A* dst, M3D_VECTOR V) noexcept {
#ifndef DISABLE_INTRINSICS
    // The low 64 bits of the register hold both floats: store them at once.
    const __m128d lowPair = _mm_castps_pd(V);
    _mm_store_sd(reinterpret_cast<double*>(dst), lowPair);
#else
    dst->x = V.v4f[0];
    dst->y = V.v4f[1];
#endif
}
inline M3D_VECTOR M3D_V4LoadF3(const M3D_F3* src) noexcept {
#ifdef DISABLE_INTRINSICS
M3D_VECTOR V;
@ -1161,18 +1205,18 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
#else // SSE4_INTRINSICS
M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
// Prepare for the division
M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
M3D_VECTOR vResult = _mm_rsqrt_ps(vLengthSq);
// Create zero with a single instruction
M3D_VECTOR vZeroMask = _mm_setzero_ps();
//M3D_VECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
//vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V, vResult);
vResult = _mm_mul_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
//vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
@ -1186,18 +1230,18 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
// Prepare for the division
M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
M3D_VECTOR vResult = _mm_rsqrt_ps(vLengthSq);
// Create zero with a single instruction
M3D_VECTOR vZeroMask = _mm_setzero_ps();
//M3D_VECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
//vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V, vResult);
vResult = _mm_mul_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
//vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
@ -1212,18 +1256,18 @@ inline M3D_VECTOR M3D_V3Normalize(M3D_VECTOR V) noexcept {
vLengthSq = _mm_add_ss(vLengthSq, vTemp);
vLengthSq = M3D_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
// Prepare for the division
M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
M3D_VECTOR vResult = _mm_rsqrt_ps(vLengthSq);
// Create zero with a single instruction
M3D_VECTOR vZeroMask = _mm_setzero_ps();
//M3D_VECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
//vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Divide to perform the normalization
vResult = _mm_div_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
//vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
@ -1339,18 +1383,18 @@ inline M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept {
#else // SSE4_INTRINSICS
M3D_VECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
// Prepare for the division
M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
M3D_VECTOR vResult = _mm_rsqrt_ps(vLengthSq);
// Create zero with a single instruction
M3D_VECTOR vZeroMask = _mm_setzero_ps();
//M3D_VECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
//vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Reciprocal mul to perform the normalization
vResult = _mm_div_ps(V, vResult);
vResult = _mm_mul_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
//vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
@ -1363,18 +1407,18 @@ inline M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept {
vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
vLengthSq = _mm_moveldup_ps(vLengthSq);
// Prepare for the division
M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
M3D_VECTOR vResult = _mm_rsqrt_ps(vLengthSq);
// Create zero with a single instruction
M3D_VECTOR vZeroMask = _mm_setzero_ps();
//M3D_VECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
//vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Reciprocal mul to perform the normalization
vResult = _mm_div_ps(V, vResult);
vResult = _mm_mul_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
//vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
@ -1387,18 +1431,18 @@ inline M3D_VECTOR M3D_V2Normalize(M3D_VECTOR V) noexcept {
vLengthSq = _mm_add_ss(vLengthSq, vTemp);
vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
// Prepare for the division
M3D_VECTOR vResult = _mm_sqrt_ps(vLengthSq);
M3D_VECTOR vResult = _mm_rsqrt_ps(vLengthSq);
// Create zero with a single instruction
M3D_VECTOR vZeroMask = _mm_setzero_ps();
//M3D_VECTOR vZeroMask = _mm_setzero_ps();
// Test for a divide by zero (Must be FP to detect -0.0)
vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
//vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
// Failsafe on zero (Or epsilon) length planes
// If the length is infinity, set the elements to zero
vLengthSq = _mm_cmpneq_ps(vLengthSq, M3D_MInfinity);
// Reciprocal mul to perform the normalization
vResult = _mm_div_ps(V, vResult);
vResult = _mm_mul_ps(V, vResult);
// Any that are infinity, set to zero
vResult = _mm_and_ps(vResult, vZeroMask);
//vResult = _mm_and_ps(vResult, vZeroMask);
// Select qnan or result based on infinite length
M3D_VECTOR vTemp1 = _mm_andnot_ps(vLengthSq, M3D_MQNaN);
M3D_VECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
@ -1473,11 +1517,18 @@ inline M3D_VECTOR M3D_QConjugate(M3D_VECTOR Q) noexcept {
#endif
}
inline M3D_VECTOR M3D_TNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexcept {
inline M3D_VECTOR M3D_Tri2DNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexcept {
M3D_VECTOR L1 = M3D_V4Subtract(P2, P1);
M3D_VECTOR L2 = M3D_V4Subtract(P3, P1);
return M3D_V2Normalize(M3D_V2Cross(L2, L1));
return M3D_V2Normalize(M3D_V2Cross(L1, L2));
}
// Returns the normalized 3D cross product (P2 - P1) x (P3 - P1), i.e. the
// normal of the triangle P1, P2, P3 for that operand order.
inline M3D_VECTOR M3D_Tri3DNormal(M3D_VECTOR P1, M3D_VECTOR P2, M3D_VECTOR P3) noexcept {
    // Two edges sharing P1 as origin
    const M3D_VECTOR edgeToP2 = M3D_V4Subtract(P2, P1);
    const M3D_VECTOR edgeToP3 = M3D_V4Subtract(P3, P1);

    const M3D_VECTOR rawNormal = M3D_V3Cross(edgeToP2, edgeToP3);
    return M3D_V3Normalize(rawNormal);
}

View File

@ -8,14 +8,14 @@ public:
virtual ~WorldObject() = 0;
virtual const Mesh& GetObjectMesh() const = 0;
virtual const size_t GetObjectVerticesCount() const = 0;
const M3D_MATRIX GetTransform() noexcept {
const M3D_MATRIX GetTransform() const noexcept {
M3D_MATRIX M = M3D_MIdentity();
M *= M3D_TransformMatrixScale(M3D_V4LoadF3(&scale));
M *= M3D_TransformMatrixRotation(M3D_V4LoadF3(&rot));
M *= M3D_TransformMatrixTranslate(M3D_V4LoadF3(&pos));
return M;
}
const M3D_F4X4 GetTransform4x4f() noexcept {
const M3D_F4X4 GetTransform4x4f() const noexcept {
M3D_F4X4 out;
M3D_V4StoreF4x4(&out, GetTransform());
return out;