softgpu: Track CLUTs as states for binning.

This way we can have multiple CLUTs in flight at once, which helps keep draws queued across CLUT changes (a flush is only needed once the CLUT queue is full).
This commit is contained in:
Unknown W. Brackets 2022-01-15 23:12:03 -08:00
parent ba63d9cf09
commit d6fa301ab1
9 changed files with 45 additions and 14 deletions

View file

@ -86,7 +86,7 @@ static inline void DrawBinItem(const BinItem &item, const RasterizerState &state
class DrawBinItemsTask : public Task { class DrawBinItemsTask : public Task {
public: public:
DrawBinItemsTask(BinWaitable *notify, BinQueue<BinItem, 1024> &items, std::atomic<bool> &status, const BinQueue<RasterizerState, 32> &states) DrawBinItemsTask(BinWaitable *notify, BinQueue<BinItem, 1024> &items, std::atomic<bool> &status, const BinQueue<RasterizerState, 64> &states)
: notify_(notify), items_(items), status_(status), states_(states) { : notify_(notify), items_(items), status_(status), states_(states) {
} }
@ -114,7 +114,7 @@ private:
BinWaitable *notify_; BinWaitable *notify_;
BinQueue<BinItem, 1024> &items_; BinQueue<BinItem, 1024> &items_;
std::atomic<bool> &status_; std::atomic<bool> &status_;
const BinQueue<RasterizerState, 32> &states_; const BinQueue<RasterizerState, 64> &states_;
}; };
BinManager::BinManager() { BinManager::BinManager() {
@ -137,6 +137,7 @@ void BinManager::UpdateState() {
Flush(); Flush();
stateIndex_ = (int)states_.Push(RasterizerState()); stateIndex_ = (int)states_.Push(RasterizerState());
ComputeRasterizerState(&states_[stateIndex_]); ComputeRasterizerState(&states_[stateIndex_]);
states_[stateIndex_].samplerID.cached.clut = cluts_[clutIndex_].readable;
DrawingCoords scissorTL(gstate.getScissorX1(), gstate.getScissorY1()); DrawingCoords scissorTL(gstate.getScissorX1(), gstate.getScissorY1());
DrawingCoords scissorBR(gstate.getScissorX2(), gstate.getScissorY2()); DrawingCoords scissorBR(gstate.getScissorX2(), gstate.getScissorY2());
@ -165,6 +166,13 @@ void BinManager::UpdateState() {
} }
} }
void BinManager::UpdateClut(void *src) {
if (cluts_.Full())
Flush();
clutIndex_ = (int)cluts_.Push(BinClut());
memcpy(cluts_[clutIndex_].readable, src, sizeof(BinClut));
}
void BinManager::AddTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2) { void BinManager::AddTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2) {
Vec2<int> d01((int)v0.screenpos.x - (int)v1.screenpos.x, (int)v0.screenpos.y - (int)v1.screenpos.y); Vec2<int> d01((int)v0.screenpos.x - (int)v1.screenpos.x, (int)v0.screenpos.y - (int)v1.screenpos.y);
Vec2<int> d02((int)v0.screenpos.x - (int)v2.screenpos.x, (int)v0.screenpos.y - (int)v2.screenpos.y); Vec2<int> d02((int)v0.screenpos.x - (int)v2.screenpos.x, (int)v0.screenpos.y - (int)v2.screenpos.y);
@ -310,6 +318,8 @@ void BinManager::Flush() {
queue_.Reset(); queue_.Reset();
while (states_.Size() > 1) while (states_.Size() > 1)
states_.SkipNext(); states_.SkipNext();
while (cluts_.Size() > 1)
cluts_.SkipNext();
queueRange_.x1 = 0x7FFFFFFF; queueRange_.x1 = 0x7FFFFFFF;
queueRange_.y1 = 0x7FFFFFFF; queueRange_.y1 = 0x7FFFFFFF;

View file

@ -143,12 +143,17 @@ struct BinQueue {
std::atomic<size_t> size_; std::atomic<size_t> size_;
}; };
// Storage for a single CLUT snapshot kept by BinManager's CLUT queue.
// BinManager::UpdateClut copies sizeof(BinClut) bytes into it per update.
union BinClut {
// Raw byte view of the CLUT data (1024 bytes).
uint8_t readable[1024];
};
class BinManager { class BinManager {
public: public:
BinManager(); BinManager();
~BinManager(); ~BinManager();
void UpdateState(); void UpdateState();
void UpdateClut(void *src);
const Rasterizer::RasterizerState &State() { const Rasterizer::RasterizerState &State() {
return states_[stateIndex_]; return states_[stateIndex_];
@ -166,8 +171,10 @@ public:
private: private:
static constexpr int MAX_POSSIBLE_TASKS = 64; static constexpr int MAX_POSSIBLE_TASKS = 64;
BinQueue<Rasterizer::RasterizerState, 32> states_; BinQueue<Rasterizer::RasterizerState, 64> states_;
int stateIndex_; int stateIndex_;
BinQueue<BinClut, 64> cluts_;
int clutIndex_;
BinCoords scissor_; BinCoords scissor_;
BinQueue<BinItem, 1024> queue_; BinQueue<BinItem, 1024> queue_;
BinCoords queueRange_; BinCoords queueRange_;

View file

@ -22,7 +22,7 @@
#include "GPU/GPUState.h" #include "GPU/GPUState.h"
#include "GPU/Software/FuncId.h" #include "GPU/Software/FuncId.h"
static_assert(sizeof(SamplerID) == sizeof(SamplerID::fullKey) + sizeof(SamplerID::cached), "Bad sampler ID size"); static_assert(sizeof(SamplerID) == sizeof(SamplerID::fullKey) + sizeof(SamplerID::cached) + sizeof(SamplerID::pad), "Bad sampler ID size");
static_assert(sizeof(PixelFuncID) == sizeof(PixelFuncID::fullKey) + sizeof(PixelFuncID::cached), "Bad pixel func ID size"); static_assert(sizeof(PixelFuncID) == sizeof(PixelFuncID::fullKey) + sizeof(PixelFuncID::cached), "Bad pixel func ID size");
static inline GEComparison OptimizeRefByteCompare(GEComparison func, u8 ref) { static inline GEComparison OptimizeRefByteCompare(GEComparison func, u8 ref) {

View file

@ -173,8 +173,15 @@ struct SamplerID {
} sizes[8]; } sizes[8];
uint32_t texBlendColor; uint32_t texBlendColor;
uint32_t clutFormat; uint32_t clutFormat;
union {
uint8_t *clut;
uint16_t *clut16;
uint32_t *clut32;
};
} cached; } cached;
uint32_t pad;
union { union {
uint32_t fullKey; uint32_t fullKey;
struct { struct {

View file

@ -35,8 +35,6 @@
using namespace Math3D; using namespace Math3D;
using namespace Rasterizer; using namespace Rasterizer;
extern u32 clut[4096];
namespace Sampler { namespace Sampler {
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac, const SamplerID &samplerID); static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac, const SamplerID &samplerID);
@ -267,16 +265,16 @@ static inline u32 LookupColor(unsigned int index, unsigned int level, const Samp
switch (samplerID.ClutFmt()) { switch (samplerID.ClutFmt()) {
case GE_CMODE_16BIT_BGR5650: case GE_CMODE_16BIT_BGR5650:
return RGB565ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]); return RGB565ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);
case GE_CMODE_16BIT_ABGR5551: case GE_CMODE_16BIT_ABGR5551:
return RGBA5551ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]); return RGBA5551ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);
case GE_CMODE_16BIT_ABGR4444: case GE_CMODE_16BIT_ABGR4444:
return RGBA4444ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]); return RGBA4444ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);
case GE_CMODE_32BIT_ABGR8888: case GE_CMODE_32BIT_ABGR8888:
return clut[index + clutSharingOffset]; return samplerID.cached.clut32[index + clutSharingOffset];
default: default:
ERROR_LOG_REPORT(G3D, "Software: Unsupported palette format: %x", samplerID.ClutFmt()); ERROR_LOG_REPORT(G3D, "Software: Unsupported palette format: %x", samplerID.ClutFmt());

View file

@ -29,8 +29,6 @@
using namespace Gen; using namespace Gen;
using namespace Rasterizer; using namespace Rasterizer;
extern u32 clut[4096];
namespace Sampler { namespace Sampler {
FetchFunc SamplerJitCache::CompileFetch(const SamplerID &id) { FetchFunc SamplerJitCache::CompileFetch(const SamplerID &id) {
@ -1253,8 +1251,10 @@ bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) {
regCache_.Release(vecLevelReg, RegCache::VEC_TEMP0); regCache_.Release(vecLevelReg, RegCache::VEC_TEMP0);
} }
X64Reg idReg = GetSamplerID();
X64Reg clutBaseReg = regCache_.Alloc(RegCache::GEN_TEMP1); X64Reg clutBaseReg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(PTRBITS, R(clutBaseReg), ImmPtr(clut)); MOV(PTRBITS, R(clutBaseReg), MDisp(idReg, offsetof(SamplerID, cached.clut)));
UnlockSamplerID(idReg);
X64Reg resultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT); X64Reg resultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0); X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
@ -3457,8 +3457,10 @@ bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) {
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2); regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
} }
X64Reg idReg = GetSamplerID();
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1); X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(PTRBITS, R(temp1Reg), ImmPtr(clut)); MOV(PTRBITS, R(temp1Reg), MDisp(idReg, offsetof(SamplerID, cached.clut)));
UnlockSamplerID(idReg);
switch (id.ClutFmt()) { switch (id.ClutFmt()) {
case GE_CMODE_16BIT_BGR5650: case GE_CMODE_16BIT_BGR5650:

View file

@ -617,6 +617,8 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
DEBUG_LOG(G3D, "Software: Invalid CLUT address, filling with garbage instead of crashing"); DEBUG_LOG(G3D, "Software: Invalid CLUT address, filling with garbage instead of crashing");
memset(clut, 0x00, clutTotalBytes); memset(clut, 0x00, clutTotalBytes);
} }
drawEngine_->transformUnit.NotifyClutUpdate(clut);
} }
break; break;

View file

@ -613,6 +613,10 @@ void TransformUnit::Flush() {
GPUDebug::NotifyDraw(); GPUDebug::NotifyDraw();
} }
// Passes a CLUT update on to the binner by calling its UpdateClut().
// src: pointer to the CLUT data; forwarded unchanged.
void TransformUnit::NotifyClutUpdate(void *src) {
binner_->UpdateClut(src);
}
// TODO: This probably is not the best interface. // TODO: This probably is not the best interface.
// Also, we should try to merge this into the similar function in DrawEngineCommon. // Also, we should try to merge this into the similar function in DrawEngineCommon.
bool TransformUnit::GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices) { bool TransformUnit::GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices) {

View file

@ -120,6 +120,7 @@ public:
bool GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices); bool GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices);
void Flush(); void Flush();
void NotifyClutUpdate(void *src);
private: private:
VertexData ReadVertex(VertexReader &vreader, bool &outside_range_flag); VertexData ReadVertex(VertexReader &vreader, bool &outside_range_flag);