softgpu: Keep arguments in vectors for sampling.
This commit is contained in:
parent
d7c25b3e7c
commit
823c4adb15
7 changed files with 66 additions and 35 deletions
|
@ -290,8 +290,10 @@ static inline bool IsRightSideOrFlatBottomLine(const Vec2<int>& vertex, const Ve
|
|||
}
|
||||
}
|
||||
|
||||
Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>& texcolor)
|
||||
{
|
||||
Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, Vec4IntArg texcolor_in) {
|
||||
const Vec4<int> prim_color = prim_color_in;
|
||||
const Vec4<int> texcolor = texcolor_in;
|
||||
|
||||
Vec3<int> out_rgb;
|
||||
int out_a;
|
||||
|
||||
|
@ -314,7 +316,7 @@ Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>&
|
|||
}
|
||||
|
||||
if (rgba) {
|
||||
return Vec4<int>(out_rgb.ivec);
|
||||
return ToVec4IntResult(Vec4<int>(out_rgb.ivec));
|
||||
} else {
|
||||
out_a = prim_color.a();
|
||||
}
|
||||
|
@ -366,7 +368,7 @@ Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>&
|
|||
out_a = 0;
|
||||
}
|
||||
|
||||
return Vec4<int>(out_rgb.r(), out_rgb.g(), out_rgb.b(), out_a);
|
||||
return ToVec4IntResult(Vec4<int>(out_rgb, out_a));
|
||||
}
|
||||
|
||||
static inline Vec3<int> GetSourceFactor(GEBlendSrcFactor factor, const Vec4<int> &source, const Vec4<int> &dst) {
|
||||
|
@ -577,9 +579,9 @@ static inline void ApplyTexturing(Sampler::Funcs sampler, Vec4<int> &prim_color,
|
|||
GetTexelCoordinates(texlevel + 1, s, t, u[1], v[1]);
|
||||
}
|
||||
|
||||
texcolor0 = Vec4<int>::FromRGBA(sampler.nearest(u[0], v[0], tptr0, bufw0, texlevel));
|
||||
texcolor0 = Vec4<int>(sampler.nearest(u[0], v[0], tptr0, bufw0, texlevel));
|
||||
if (frac_texlevel) {
|
||||
texcolor1 = Vec4<int>::FromRGBA(sampler.nearest(u[1], v[1], tptr1, bufw1, texlevel + 1));
|
||||
texcolor1 = Vec4<int>(sampler.nearest(u[1], v[1], tptr1, bufw1, texlevel + 1));
|
||||
}
|
||||
} else {
|
||||
GetTexelCoordinatesQuad(texlevel, s, t, u, v, frac_u[0], frac_v[0]);
|
||||
|
@ -587,16 +589,16 @@ static inline void ApplyTexturing(Sampler::Funcs sampler, Vec4<int> &prim_color,
|
|||
GetTexelCoordinatesQuad(texlevel + 1, s, t, u + 4, v + 4, frac_u[1], frac_v[1]);
|
||||
}
|
||||
|
||||
texcolor0 = Vec4<int>::FromRGBA(sampler.linear(u, v, frac_u[0], frac_v[0], tptr0, bufw0, texlevel));
|
||||
texcolor0 = Vec4<int>(sampler.linear(u, v, frac_u[0], frac_v[0], tptr0, bufw0, texlevel));
|
||||
if (frac_texlevel) {
|
||||
texcolor1 = Vec4<int>::FromRGBA(sampler.linear(u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1, texlevel + 1));
|
||||
texcolor1 = Vec4<int>(sampler.linear(u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1, texlevel + 1));
|
||||
}
|
||||
}
|
||||
|
||||
if (frac_texlevel) {
|
||||
texcolor0 = (texcolor1 * frac_texlevel + texcolor0 * (256 - frac_texlevel)) / 256;
|
||||
}
|
||||
prim_color = GetTextureFunctionOutput(prim_color, texcolor0);
|
||||
prim_color = GetTextureFunctionOutput(ToVec4IntArg(prim_color), ToVec4IntArg(texcolor0));
|
||||
}
|
||||
|
||||
// Produces a signed 1.23.8 value.
|
||||
|
@ -1392,7 +1394,7 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level)
|
|||
u32 *row = (u32 *)buffer.GetData();
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int x = 0; x < w; ++x) {
|
||||
row[x] = sampler.nearest(x, y, texptr, texbufw, level);
|
||||
row[x] = Vec4<int>(sampler.nearest(x, y, texptr, texbufw, level)).ToRGBA();
|
||||
}
|
||||
row += w;
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "GPU/Software/FuncId.h"
|
||||
#include "GPU/Software/RasterizerRegCache.h"
|
||||
#include "GPU/Software/TransformUnit.h" // for DrawingCoords
|
||||
|
||||
struct GPUDebugBuffer;
|
||||
|
@ -35,6 +36,6 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level);
|
|||
|
||||
// Shared functions with RasterizerRectangle.cpp
|
||||
Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &source, const Vec4<int> &dst);
|
||||
Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>& texcolor);
|
||||
Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color, Vec4IntArg texcolor);
|
||||
|
||||
} // namespace Rasterizer
|
||||
|
|
|
@ -153,7 +153,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
|
|||
int s = s_start;
|
||||
u16 *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride());
|
||||
for (int x = pos0.x; x < pos1.x; x++) {
|
||||
u32 tex_color = nearestFunc(s, t, texptr, texbufw, 0);
|
||||
u32 tex_color = Vec4<int>(nearestFunc(s, t, texptr, texbufw, 0)).ToRGBA();
|
||||
if (tex_color & 0xFF000000) {
|
||||
DrawSinglePixel5551(pixel, tex_color, pixelID);
|
||||
}
|
||||
|
@ -171,7 +171,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
|
|||
u16 *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride());
|
||||
for (int x = pos0.x; x < pos1.x; x++) {
|
||||
Vec4<int> prim_color = v1.color0;
|
||||
Vec4<int> tex_color = Vec4<int>::FromRGBA(nearestFunc(s, t, texptr, texbufw, 0));
|
||||
Vec4<int> tex_color = nearestFunc(s, t, texptr, texbufw, 0);
|
||||
prim_color = ModulateRGBA(prim_color, tex_color);
|
||||
if (prim_color.a() > 0) {
|
||||
DrawSinglePixel5551(pixel, prim_color.ToRGBA(), pixelID);
|
||||
|
@ -191,8 +191,8 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
|
|||
// Not really that fast but faster than triangle.
|
||||
for (int x = pos0.x; x < pos1.x; x++) {
|
||||
Vec4<int> prim_color = v1.color0;
|
||||
Vec4<int> tex_color = Vec4<int>::FromRGBA(nearestFunc(s, t, texptr, texbufw, 0));
|
||||
prim_color = GetTextureFunctionOutput(prim_color, tex_color);
|
||||
Vec4<int> tex_color = nearestFunc(s, t, texptr, texbufw, 0);
|
||||
prim_color = GetTextureFunctionOutput(ToVec4IntArg(prim_color), ToVec4IntArg(tex_color));
|
||||
drawPixel(x, y, z, 255, ToVec4IntArg(prim_color), pixelID);
|
||||
s += ds;
|
||||
}
|
||||
|
|
|
@ -64,13 +64,25 @@ typedef FakeGen::FakeXCodeBlock CodeBlock;
|
|||
// We also have the types of things that end up in regs.
|
||||
#if PPSSPP_ARCH(ARM64)
|
||||
typedef int32x4_t Vec4IntArg;
|
||||
typedef int32x4_t Vec4IntResult;
|
||||
typedef float32x4_t Vec4FloatArg;
|
||||
static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4<int> &a) { return vld1q_s32(a.AsArray()); }
|
||||
static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4<int> &a) { return vld1q_s32(a.AsArray()); }
|
||||
static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4<float> &a) { return vld1q_f32(a.AsArray()); }
|
||||
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
|
||||
typedef __m128i Vec4IntArg;
|
||||
typedef __m128i Vec4IntResult;
|
||||
typedef __m128 Vec4FloatArg;
|
||||
static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4<int> &a) { return a.ivec; }
|
||||
static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4<int> &a) { return a.ivec; }
|
||||
static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4<float> &a) { return a.vec; }
|
||||
#else
|
||||
typedef const Math3D::Vec4<int> &Vec4IntArg;
|
||||
typedef Math3D::Vec4<int> Vec4IntResult;
|
||||
typedef const Math3D::Vec4<float> &Vec4FloatArg;
|
||||
static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4<int> &a) { return a; }
|
||||
static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4<int> &a) { return a; }
|
||||
static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4<float> &a) { return a; }
|
||||
#endif
|
||||
|
||||
#if PPSSPP_ARCH(AMD64) && PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
|
||||
|
@ -85,6 +97,7 @@ struct RegCache {
|
|||
FLAG_TEMP = 0x1000,
|
||||
|
||||
VEC_ZERO = 0x0000,
|
||||
VEC_RESULT = 0x0001,
|
||||
|
||||
GEN_SRC_ALPHA = 0x0100,
|
||||
GEN_GSTATE = 0x0101,
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "Core/Reporting.h"
|
||||
#include "GPU/Common/TextureDecoder.h"
|
||||
#include "GPU/GPUState.h"
|
||||
#include "GPU/Software/RasterizerRegCache.h"
|
||||
#include "GPU/Software/Sampler.h"
|
||||
|
||||
#if defined(_M_SSE)
|
||||
|
@ -30,13 +31,14 @@
|
|||
#endif
|
||||
|
||||
using namespace Math3D;
|
||||
using namespace Rasterizer;
|
||||
|
||||
extern u32 clut[4096];
|
||||
|
||||
namespace Sampler {
|
||||
|
||||
static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level);
|
||||
static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
|
||||
static Vec4IntResult SOFTRAST_CALL SampleNearest(int u, int v, const u8 *tptr, int bufw, int level);
|
||||
static Vec4IntResult SOFTRAST_CALL SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
|
||||
|
||||
std::mutex jitCacheLock;
|
||||
SamplerJitCache *jitCache = nullptr;
|
||||
|
@ -305,8 +307,7 @@ struct Nearest4 {
|
|||
};
|
||||
|
||||
template <int N>
|
||||
inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level)
|
||||
{
|
||||
inline static Nearest4 SOFTRAST_CALL SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level) {
|
||||
Nearest4 res;
|
||||
if (!srcptr) {
|
||||
memset(res.v, 0, sizeof(res.v));
|
||||
|
@ -407,11 +408,12 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t
|
|||
}
|
||||
}
|
||||
|
||||
static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) {
|
||||
return SampleNearest<1>(&u, &v, tptr, bufw, level);
|
||||
static Vec4IntResult SOFTRAST_CALL SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) {
|
||||
Nearest4 c = SampleNearest<1>(&u, &v, tptr, bufw, level);
|
||||
return ToVec4IntResult(Vec4<int>::FromRGBA(c.v[0]));
|
||||
}
|
||||
|
||||
static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) {
|
||||
static Vec4IntResult SOFTRAST_CALL SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) {
|
||||
Nearest4 c = SampleNearest<4>(u, v, tptr, bufw, texlevel);
|
||||
|
||||
Vec4<int> texcolor_tl = Vec4<int>::FromRGBA(c.v[0]);
|
||||
|
@ -420,7 +422,7 @@ static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tp
|
|||
Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
|
||||
Vec4<int> t = texcolor_tl * (0x100 - frac_u) + texcolor_tr * frac_u;
|
||||
Vec4<int> b = texcolor_bl * (0x100 - frac_u) + texcolor_br * frac_u;
|
||||
return ((t * (0x100 - frac_v) + b * frac_v) / (256 * 256)).ToRGBA();
|
||||
return ToVec4IntResult((t * (0x100 - frac_v) + b * frac_v) / (256 * 256));
|
||||
}
|
||||
|
||||
};
|
||||
|
|
|
@ -26,10 +26,10 @@
|
|||
|
||||
namespace Sampler {
|
||||
|
||||
typedef u32 (*NearestFunc)(int u, int v, const u8 *tptr, int bufw, int level);
|
||||
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *NearestFunc)(int u, int v, const u8 *tptr, int bufw, int level);
|
||||
NearestFunc GetNearestFunc();
|
||||
|
||||
typedef u32 (*LinearFunc)(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
|
||||
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *LinearFunc)(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
|
||||
LinearFunc GetLinearFunc();
|
||||
|
||||
struct Funcs {
|
||||
|
|
|
@ -62,6 +62,7 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) {
|
|||
RegCache::GEN_ARG_LEVEL,
|
||||
});
|
||||
regCache_.ChangeReg(RAX, RegCache::GEN_RESULT);
|
||||
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
|
||||
|
||||
BeginWrite();
|
||||
const u8 *start = AlignCode16();
|
||||
|
@ -74,9 +75,9 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) {
|
|||
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
|
||||
|
||||
FixupBranch nonZeroSrc = J_CC(CC_NZ);
|
||||
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
|
||||
XOR(32, R(resultReg), R(resultReg));
|
||||
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
|
||||
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
|
||||
PXOR(vecResultReg, R(vecResultReg));
|
||||
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
|
||||
zeroSrc = J(true);
|
||||
SetJumpTarget(nonZeroSrc);
|
||||
}
|
||||
|
@ -89,6 +90,23 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
|
||||
|
||||
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
|
||||
MOVD_xmm(vecResultReg, R(resultReg));
|
||||
regCache_.Release(resultReg, RegCache::GEN_RESULT);
|
||||
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVZXBD(vecResultReg, R(vecResultReg));
|
||||
} else {
|
||||
X64Reg vecTempReg = regCache_.Find(RegCache::VEC_TEMP0);
|
||||
PXOR(vecTempReg, R(vecTempReg));
|
||||
PUNPCKLBW(vecResultReg, R(vecTempReg));
|
||||
PUNPCKLWD(vecResultReg, R(vecTempReg));
|
||||
regCache_.Unlock(vecTempReg, RegCache::VEC_TEMP0);
|
||||
}
|
||||
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
|
||||
|
||||
if (id.hasInvalidPtr) {
|
||||
SetJumpTarget(zeroSrc);
|
||||
}
|
||||
|
@ -171,7 +189,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
|
|||
if (id.hasInvalidPtr) {
|
||||
CMP(PTRBITS, R(R14), Imm8(0));
|
||||
FixupBranch nonZeroSrc = J_CC(CC_NZ);
|
||||
XOR(32, R(RAX), R(RAX));
|
||||
PXOR(XMM0, R(XMM0));
|
||||
zeroSrc = J(true);
|
||||
SetJumpTarget(nonZeroSrc);
|
||||
}
|
||||
|
@ -289,12 +307,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
|
|||
ADDPS(fpScratchReg1, R(fpScratchReg3));
|
||||
|
||||
// Time to convert back to a single 32 bit value.
|
||||
CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
|
||||
PACKSSDW(fpScratchReg1, R(fpScratchReg1));
|
||||
PACKUSWB(fpScratchReg1, R(fpScratchReg1));
|
||||
|
||||
const X64Reg resultReg = RAX;
|
||||
MOVD_xmm(R(resultReg), fpScratchReg1);
|
||||
CVTPS2DQ(XMM0, R(fpScratchReg1));
|
||||
|
||||
if (id.hasInvalidPtr) {
|
||||
SetJumpTarget(zeroSrc);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue