softgpu: Track CLUTs as states for binning.

This way we can have multiple CLUTs in flight at once, so a CLUT update only forces a flush when the CLUT queue is full.
This commit is contained in:
Unknown W. Brackets 2022-01-15 23:12:03 -08:00
parent ba63d9cf09
commit d6fa301ab1
9 changed files with 45 additions and 14 deletions

View file

@ -86,7 +86,7 @@ static inline void DrawBinItem(const BinItem &item, const RasterizerState &state
class DrawBinItemsTask : public Task {
public:
DrawBinItemsTask(BinWaitable *notify, BinQueue<BinItem, 1024> &items, std::atomic<bool> &status, const BinQueue<RasterizerState, 32> &states)
DrawBinItemsTask(BinWaitable *notify, BinQueue<BinItem, 1024> &items, std::atomic<bool> &status, const BinQueue<RasterizerState, 64> &states)
: notify_(notify), items_(items), status_(status), states_(states) {
}
@ -114,7 +114,7 @@ private:
BinWaitable *notify_;
BinQueue<BinItem, 1024> &items_;
std::atomic<bool> &status_;
const BinQueue<RasterizerState, 32> &states_;
const BinQueue<RasterizerState, 64> &states_;
};
BinManager::BinManager() {
@ -137,6 +137,7 @@ void BinManager::UpdateState() {
Flush();
stateIndex_ = (int)states_.Push(RasterizerState());
ComputeRasterizerState(&states_[stateIndex_]);
states_[stateIndex_].samplerID.cached.clut = cluts_[clutIndex_].readable;
DrawingCoords scissorTL(gstate.getScissorX1(), gstate.getScissorY1());
DrawingCoords scissorBR(gstate.getScissorX2(), gstate.getScissorY2());
@ -165,6 +166,13 @@ void BinManager::UpdateState() {
}
}
void BinManager::UpdateClut(void *src) {
if (cluts_.Full())
Flush();
clutIndex_ = (int)cluts_.Push(BinClut());
memcpy(cluts_[clutIndex_].readable, src, sizeof(BinClut));
}
void BinManager::AddTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2) {
Vec2<int> d01((int)v0.screenpos.x - (int)v1.screenpos.x, (int)v0.screenpos.y - (int)v1.screenpos.y);
Vec2<int> d02((int)v0.screenpos.x - (int)v2.screenpos.x, (int)v0.screenpos.y - (int)v2.screenpos.y);
@ -310,6 +318,8 @@ void BinManager::Flush() {
queue_.Reset();
while (states_.Size() > 1)
states_.SkipNext();
while (cluts_.Size() > 1)
cluts_.SkipNext();
queueRange_.x1 = 0x7FFFFFFF;
queueRange_.y1 = 0x7FFFFFFF;

View file

@ -143,12 +143,17 @@ struct BinQueue {
std::atomic<size_t> size_;
};
// Storage for one CLUT copy, as held in BinManager's cluts_ queue.
// BinManager::UpdateClut() fills it by copying sizeof(BinClut) bytes from its source buffer.
union BinClut {
// Raw byte view of the stored CLUT data.
uint8_t readable[1024];
};
class BinManager {
public:
BinManager();
~BinManager();
void UpdateState();
void UpdateClut(void *src);
const Rasterizer::RasterizerState &State() {
return states_[stateIndex_];
@ -166,8 +171,10 @@ public:
private:
static constexpr int MAX_POSSIBLE_TASKS = 64;
BinQueue<Rasterizer::RasterizerState, 32> states_;
BinQueue<Rasterizer::RasterizerState, 64> states_;
int stateIndex_;
BinQueue<BinClut, 64> cluts_;
int clutIndex_;
BinCoords scissor_;
BinQueue<BinItem, 1024> queue_;
BinCoords queueRange_;

View file

@ -22,7 +22,7 @@
#include "GPU/GPUState.h"
#include "GPU/Software/FuncId.h"
static_assert(sizeof(SamplerID) == sizeof(SamplerID::fullKey) + sizeof(SamplerID::cached), "Bad sampler ID size");
static_assert(sizeof(SamplerID) == sizeof(SamplerID::fullKey) + sizeof(SamplerID::cached) + sizeof(SamplerID::pad), "Bad sampler ID size");
static_assert(sizeof(PixelFuncID) == sizeof(PixelFuncID::fullKey) + sizeof(PixelFuncID::cached), "Bad pixel func ID size");
static inline GEComparison OptimizeRefByteCompare(GEComparison func, u8 ref) {

View file

@ -173,8 +173,15 @@ struct SamplerID {
} sizes[8];
uint32_t texBlendColor;
uint32_t clutFormat;
union {
uint8_t *clut;
uint16_t *clut16;
uint32_t *clut32;
};
} cached;
uint32_t pad;
union {
uint32_t fullKey;
struct {

View file

@ -35,8 +35,6 @@
using namespace Math3D;
using namespace Rasterizer;
extern u32 clut[4096];
namespace Sampler {
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac, const SamplerID &samplerID);
@ -267,16 +265,16 @@ static inline u32 LookupColor(unsigned int index, unsigned int level, const Samp
switch (samplerID.ClutFmt()) {
case GE_CMODE_16BIT_BGR5650:
return RGB565ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]);
return RGB565ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);
case GE_CMODE_16BIT_ABGR5551:
return RGBA5551ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]);
return RGBA5551ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);
case GE_CMODE_16BIT_ABGR4444:
return RGBA4444ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]);
return RGBA4444ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);
case GE_CMODE_32BIT_ABGR8888:
return clut[index + clutSharingOffset];
return samplerID.cached.clut32[index + clutSharingOffset];
default:
ERROR_LOG_REPORT(G3D, "Software: Unsupported palette format: %x", samplerID.ClutFmt());

View file

@ -29,8 +29,6 @@
using namespace Gen;
using namespace Rasterizer;
extern u32 clut[4096];
namespace Sampler {
FetchFunc SamplerJitCache::CompileFetch(const SamplerID &id) {
@ -1253,8 +1251,10 @@ bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) {
regCache_.Release(vecLevelReg, RegCache::VEC_TEMP0);
}
X64Reg idReg = GetSamplerID();
X64Reg clutBaseReg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(PTRBITS, R(clutBaseReg), ImmPtr(clut));
MOV(PTRBITS, R(clutBaseReg), MDisp(idReg, offsetof(SamplerID, cached.clut)));
UnlockSamplerID(idReg);
X64Reg resultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
@ -3457,8 +3457,10 @@ bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) {
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
}
X64Reg idReg = GetSamplerID();
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(PTRBITS, R(temp1Reg), ImmPtr(clut));
MOV(PTRBITS, R(temp1Reg), MDisp(idReg, offsetof(SamplerID, cached.clut)));
UnlockSamplerID(idReg);
switch (id.ClutFmt()) {
case GE_CMODE_16BIT_BGR5650:

View file

@ -617,6 +617,8 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
DEBUG_LOG(G3D, "Software: Invalid CLUT address, filling with garbage instead of crashing");
memset(clut, 0x00, clutTotalBytes);
}
drawEngine_->transformUnit.NotifyClutUpdate(clut);
}
break;

View file

@ -613,6 +613,10 @@ void TransformUnit::Flush() {
GPUDebug::NotifyDraw();
}
// Forwards a CLUT update to the binner, passing src through unchanged.
void TransformUnit::NotifyClutUpdate(void *src) {
binner_->UpdateClut(src);
}
// TODO: This probably is not the best interface.
// Also, we should try to merge this into the similar function in DrawEngineCommon.
bool TransformUnit::GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices) {

View file

@ -120,6 +120,7 @@ public:
bool GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices);
void Flush();
void NotifyClutUpdate(void *src);
private:
VertexData ReadVertex(VertexReader &vreader, bool &outside_range_flag);