softgpu: Keep arguments in vectors for sampling.

This commit is contained in:
Unknown W. Brackets 2021-12-04 13:57:58 -08:00
parent d7c25b3e7c
commit 823c4adb15
7 changed files with 66 additions and 35 deletions

View file

@ -290,8 +290,10 @@ static inline bool IsRightSideOrFlatBottomLine(const Vec2<int>& vertex, const Ve
}
}
Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>& texcolor)
{
Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, Vec4IntArg texcolor_in) {
const Vec4<int> prim_color = prim_color_in;
const Vec4<int> texcolor = texcolor_in;
Vec3<int> out_rgb;
int out_a;
@ -314,7 +316,7 @@ Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>&
}
if (rgba) {
return Vec4<int>(out_rgb.ivec);
return ToVec4IntResult(Vec4<int>(out_rgb.ivec));
} else {
out_a = prim_color.a();
}
@ -366,7 +368,7 @@ Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>&
out_a = 0;
}
return Vec4<int>(out_rgb.r(), out_rgb.g(), out_rgb.b(), out_a);
return ToVec4IntResult(Vec4<int>(out_rgb, out_a));
}
static inline Vec3<int> GetSourceFactor(GEBlendSrcFactor factor, const Vec4<int> &source, const Vec4<int> &dst) {
@ -577,9 +579,9 @@ static inline void ApplyTexturing(Sampler::Funcs sampler, Vec4<int> &prim_color,
GetTexelCoordinates(texlevel + 1, s, t, u[1], v[1]);
}
texcolor0 = Vec4<int>::FromRGBA(sampler.nearest(u[0], v[0], tptr0, bufw0, texlevel));
texcolor0 = Vec4<int>(sampler.nearest(u[0], v[0], tptr0, bufw0, texlevel));
if (frac_texlevel) {
texcolor1 = Vec4<int>::FromRGBA(sampler.nearest(u[1], v[1], tptr1, bufw1, texlevel + 1));
texcolor1 = Vec4<int>(sampler.nearest(u[1], v[1], tptr1, bufw1, texlevel + 1));
}
} else {
GetTexelCoordinatesQuad(texlevel, s, t, u, v, frac_u[0], frac_v[0]);
@ -587,16 +589,16 @@ static inline void ApplyTexturing(Sampler::Funcs sampler, Vec4<int> &prim_color,
GetTexelCoordinatesQuad(texlevel + 1, s, t, u + 4, v + 4, frac_u[1], frac_v[1]);
}
texcolor0 = Vec4<int>::FromRGBA(sampler.linear(u, v, frac_u[0], frac_v[0], tptr0, bufw0, texlevel));
texcolor0 = Vec4<int>(sampler.linear(u, v, frac_u[0], frac_v[0], tptr0, bufw0, texlevel));
if (frac_texlevel) {
texcolor1 = Vec4<int>::FromRGBA(sampler.linear(u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1, texlevel + 1));
texcolor1 = Vec4<int>(sampler.linear(u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1, texlevel + 1));
}
}
if (frac_texlevel) {
texcolor0 = (texcolor1 * frac_texlevel + texcolor0 * (256 - frac_texlevel)) / 256;
}
prim_color = GetTextureFunctionOutput(prim_color, texcolor0);
prim_color = GetTextureFunctionOutput(ToVec4IntArg(prim_color), ToVec4IntArg(texcolor0));
}
// Produces a signed 1.23.8 value.
@ -1392,7 +1394,7 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level)
u32 *row = (u32 *)buffer.GetData();
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
row[x] = sampler.nearest(x, y, texptr, texbufw, level);
row[x] = Vec4<int>(sampler.nearest(x, y, texptr, texbufw, level)).ToRGBA();
}
row += w;
}

View file

@ -18,6 +18,7 @@
#pragma once
#include "GPU/Software/FuncId.h"
#include "GPU/Software/RasterizerRegCache.h"
#include "GPU/Software/TransformUnit.h" // for DrawingCoords
struct GPUDebugBuffer;
@ -35,6 +36,6 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level);
// Shared functions with RasterizerRectangle.cpp
Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &source, const Vec4<int> &dst);
Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>& texcolor);
Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color, Vec4IntArg texcolor);
} // namespace Rasterizer

View file

@ -153,7 +153,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
int s = s_start;
u16 *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride());
for (int x = pos0.x; x < pos1.x; x++) {
u32 tex_color = nearestFunc(s, t, texptr, texbufw, 0);
u32 tex_color = Vec4<int>(nearestFunc(s, t, texptr, texbufw, 0)).ToRGBA();
if (tex_color & 0xFF000000) {
DrawSinglePixel5551(pixel, tex_color, pixelID);
}
@ -171,7 +171,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
u16 *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride());
for (int x = pos0.x; x < pos1.x; x++) {
Vec4<int> prim_color = v1.color0;
Vec4<int> tex_color = Vec4<int>::FromRGBA(nearestFunc(s, t, texptr, texbufw, 0));
Vec4<int> tex_color = nearestFunc(s, t, texptr, texbufw, 0);
prim_color = ModulateRGBA(prim_color, tex_color);
if (prim_color.a() > 0) {
DrawSinglePixel5551(pixel, prim_color.ToRGBA(), pixelID);
@ -191,8 +191,8 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
// Not really that fast but faster than triangle.
for (int x = pos0.x; x < pos1.x; x++) {
Vec4<int> prim_color = v1.color0;
Vec4<int> tex_color = Vec4<int>::FromRGBA(nearestFunc(s, t, texptr, texbufw, 0));
prim_color = GetTextureFunctionOutput(prim_color, tex_color);
Vec4<int> tex_color = nearestFunc(s, t, texptr, texbufw, 0);
prim_color = GetTextureFunctionOutput(ToVec4IntArg(prim_color), ToVec4IntArg(tex_color));
drawPixel(x, y, z, 255, ToVec4IntArg(prim_color), pixelID);
s += ds;
}

View file

@ -64,13 +64,25 @@ typedef FakeGen::FakeXCodeBlock CodeBlock;
// We also have the types of things that end up in regs.
#if PPSSPP_ARCH(ARM64)
typedef int32x4_t Vec4IntArg;
typedef int32x4_t Vec4IntResult;
typedef float32x4_t Vec4FloatArg;
static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4<int> &a) { return vld1q_s32(a.AsArray()); }
static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4<int> &a) { return vld1q_s32(a.AsArray()); }
static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4<float> &a) { return vld1q_f32(a.AsArray()); }
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
typedef __m128i Vec4IntArg;
typedef __m128i Vec4IntResult;
typedef __m128 Vec4FloatArg;
static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4<int> &a) { return a.ivec; }
static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4<int> &a) { return a.ivec; }
static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4<float> &a) { return a.vec; }
#else
typedef const Math3D::Vec4<int> &Vec4IntArg;
typedef Math3D::Vec4<int> Vec4IntResult;
typedef const Math3D::Vec4<float> &Vec4FloatArg;
static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4<int> &a) { return a; }
static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4<int> &a) { return a; }
static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4<float> &a) { return a; }
#endif
#if PPSSPP_ARCH(AMD64) && PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
@ -85,6 +97,7 @@ struct RegCache {
FLAG_TEMP = 0x1000,
VEC_ZERO = 0x0000,
VEC_RESULT = 0x0001,
GEN_SRC_ALPHA = 0x0100,
GEN_GSTATE = 0x0101,

View file

@ -23,6 +23,7 @@
#include "Core/Reporting.h"
#include "GPU/Common/TextureDecoder.h"
#include "GPU/GPUState.h"
#include "GPU/Software/RasterizerRegCache.h"
#include "GPU/Software/Sampler.h"
#if defined(_M_SSE)
@ -30,13 +31,14 @@
#endif
using namespace Math3D;
using namespace Rasterizer;
extern u32 clut[4096];
namespace Sampler {
static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level);
static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
static Vec4IntResult SOFTRAST_CALL SampleNearest(int u, int v, const u8 *tptr, int bufw, int level);
static Vec4IntResult SOFTRAST_CALL SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
std::mutex jitCacheLock;
SamplerJitCache *jitCache = nullptr;
@ -305,8 +307,7 @@ struct Nearest4 {
};
template <int N>
inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level)
{
inline static Nearest4 SOFTRAST_CALL SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level) {
Nearest4 res;
if (!srcptr) {
memset(res.v, 0, sizeof(res.v));
@ -407,11 +408,12 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t
}
}
static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) {
return SampleNearest<1>(&u, &v, tptr, bufw, level);
static Vec4IntResult SOFTRAST_CALL SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) {
Nearest4 c = SampleNearest<1>(&u, &v, tptr, bufw, level);
return ToVec4IntResult(Vec4<int>::FromRGBA(c.v[0]));
}
static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) {
static Vec4IntResult SOFTRAST_CALL SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) {
Nearest4 c = SampleNearest<4>(u, v, tptr, bufw, texlevel);
Vec4<int> texcolor_tl = Vec4<int>::FromRGBA(c.v[0]);
@ -420,7 +422,7 @@ static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tp
Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
Vec4<int> t = texcolor_tl * (0x100 - frac_u) + texcolor_tr * frac_u;
Vec4<int> b = texcolor_bl * (0x100 - frac_u) + texcolor_br * frac_u;
return ((t * (0x100 - frac_v) + b * frac_v) / (256 * 256)).ToRGBA();
return ToVec4IntResult((t * (0x100 - frac_v) + b * frac_v) / (256 * 256));
}
};

View file

@ -26,10 +26,10 @@
namespace Sampler {
typedef u32 (*NearestFunc)(int u, int v, const u8 *tptr, int bufw, int level);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *NearestFunc)(int u, int v, const u8 *tptr, int bufw, int level);
NearestFunc GetNearestFunc();
typedef u32 (*LinearFunc)(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *LinearFunc)(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
LinearFunc GetLinearFunc();
struct Funcs {

View file

@ -62,6 +62,7 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) {
RegCache::GEN_ARG_LEVEL,
});
regCache_.ChangeReg(RAX, RegCache::GEN_RESULT);
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
BeginWrite();
const u8 *start = AlignCode16();
@ -74,9 +75,9 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) {
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
FixupBranch nonZeroSrc = J_CC(CC_NZ);
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
XOR(32, R(resultReg), R(resultReg));
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
PXOR(vecResultReg, R(vecResultReg));
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
zeroSrc = J(true);
SetJumpTarget(nonZeroSrc);
}
@ -89,6 +90,23 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) {
return nullptr;
}
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
MOVD_xmm(vecResultReg, R(resultReg));
regCache_.Release(resultReg, RegCache::GEN_RESULT);
if (cpu_info.bSSE4_1) {
PMOVZXBD(vecResultReg, R(vecResultReg));
} else {
X64Reg vecTempReg = regCache_.Find(RegCache::VEC_TEMP0);
PXOR(vecTempReg, R(vecTempReg));
PUNPCKLBW(vecResultReg, R(vecTempReg));
PUNPCKLWD(vecResultReg, R(vecTempReg));
regCache_.Unlock(vecTempReg, RegCache::VEC_TEMP0);
}
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
if (id.hasInvalidPtr) {
SetJumpTarget(zeroSrc);
}
@ -171,7 +189,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
if (id.hasInvalidPtr) {
CMP(PTRBITS, R(R14), Imm8(0));
FixupBranch nonZeroSrc = J_CC(CC_NZ);
XOR(32, R(RAX), R(RAX));
PXOR(XMM0, R(XMM0));
zeroSrc = J(true);
SetJumpTarget(nonZeroSrc);
}
@ -289,12 +307,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
ADDPS(fpScratchReg1, R(fpScratchReg3));
// Time to convert back to a single 32 bit value.
CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
PACKSSDW(fpScratchReg1, R(fpScratchReg1));
PACKUSWB(fpScratchReg1, R(fpScratchReg1));
const X64Reg resultReg = RAX;
MOVD_xmm(R(resultReg), fpScratchReg1);
CVTPS2DQ(XMM0, R(fpScratchReg1));
if (id.hasInvalidPtr) {
SetJumpTarget(zeroSrc);