diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index ebb9709fd..256a5e3a5 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -290,8 +290,10 @@ static inline bool IsRightSideOrFlatBottomLine(const Vec2& vertex, const Ve } } -Vec4 GetTextureFunctionOutput(const Vec4& prim_color, const Vec4& texcolor) -{ +Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, Vec4IntArg texcolor_in) { + const Vec4 prim_color = prim_color_in; + const Vec4 texcolor = texcolor_in; + Vec3 out_rgb; int out_a; @@ -314,7 +316,7 @@ Vec4 GetTextureFunctionOutput(const Vec4& prim_color, const Vec4& } if (rgba) { - return Vec4(out_rgb.ivec); + return ToVec4IntResult(Vec4(out_rgb.ivec)); } else { out_a = prim_color.a(); } @@ -366,7 +368,7 @@ Vec4 GetTextureFunctionOutput(const Vec4& prim_color, const Vec4& out_a = 0; } - return Vec4(out_rgb.r(), out_rgb.g(), out_rgb.b(), out_a); + return ToVec4IntResult(Vec4(out_rgb, out_a)); } static inline Vec3 GetSourceFactor(GEBlendSrcFactor factor, const Vec4 &source, const Vec4 &dst) { @@ -577,9 +579,9 @@ static inline void ApplyTexturing(Sampler::Funcs sampler, Vec4 &prim_color, GetTexelCoordinates(texlevel + 1, s, t, u[1], v[1]); } - texcolor0 = Vec4::FromRGBA(sampler.nearest(u[0], v[0], tptr0, bufw0, texlevel)); + texcolor0 = Vec4(sampler.nearest(u[0], v[0], tptr0, bufw0, texlevel)); if (frac_texlevel) { - texcolor1 = Vec4::FromRGBA(sampler.nearest(u[1], v[1], tptr1, bufw1, texlevel + 1)); + texcolor1 = Vec4(sampler.nearest(u[1], v[1], tptr1, bufw1, texlevel + 1)); } } else { GetTexelCoordinatesQuad(texlevel, s, t, u, v, frac_u[0], frac_v[0]); @@ -587,16 +589,16 @@ static inline void ApplyTexturing(Sampler::Funcs sampler, Vec4 &prim_color, GetTexelCoordinatesQuad(texlevel + 1, s, t, u + 4, v + 4, frac_u[1], frac_v[1]); } - texcolor0 = Vec4::FromRGBA(sampler.linear(u, v, frac_u[0], frac_v[0], tptr0, bufw0, texlevel)); + texcolor0 = Vec4(sampler.linear(u, v, frac_u[0], frac_v[0], tptr0, bufw0, texlevel)); if (frac_texlevel) { - texcolor1 = Vec4::FromRGBA(sampler.linear(u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1, texlevel + 1)); + texcolor1 = Vec4(sampler.linear(u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1, texlevel + 1)); } } if (frac_texlevel) { texcolor0 = (texcolor1 * frac_texlevel + texcolor0 * (256 - frac_texlevel)) / 256; } - prim_color = GetTextureFunctionOutput(prim_color, texcolor0); + prim_color = GetTextureFunctionOutput(ToVec4IntArg(prim_color), ToVec4IntArg(texcolor0)); } // Produces a signed 1.23.8 value. @@ -1392,7 +1394,7 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level) u32 *row = (u32 *)buffer.GetData(); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { - row[x] = sampler.nearest(x, y, texptr, texbufw, level); + row[x] = Vec4(sampler.nearest(x, y, texptr, texbufw, level)).ToRGBA(); } row += w; } diff --git a/GPU/Software/Rasterizer.h b/GPU/Software/Rasterizer.h index 0d7e2e91f..e72cfaba9 100644 --- a/GPU/Software/Rasterizer.h +++ b/GPU/Software/Rasterizer.h @@ -18,6 +18,7 @@ #pragma once #include "GPU/Software/FuncId.h" +#include "GPU/Software/RasterizerRegCache.h" #include "GPU/Software/TransformUnit.h" // for DrawingCoords struct GPUDebugBuffer; @@ -35,6 +36,6 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level); // Shared functions with RasterizerRectangle.cpp Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 &source, const Vec4 &dst); -Vec4 GetTextureFunctionOutput(const Vec4& prim_color, const Vec4& texcolor); +Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color, Vec4IntArg texcolor); } // namespace Rasterizer diff --git a/GPU/Software/RasterizerRectangle.cpp b/GPU/Software/RasterizerRectangle.cpp index 657ff93aa..aab7d634b 100644 --- a/GPU/Software/RasterizerRectangle.cpp +++ b/GPU/Software/RasterizerRectangle.cpp @@ -153,7 +153,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) { int s = s_start; u16 *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride()); for (int x = pos0.x; x < pos1.x; x++) { - u32 tex_color = nearestFunc(s, t, texptr, texbufw, 0); + u32 tex_color = Vec4(nearestFunc(s, t, texptr, texbufw, 0)).ToRGBA(); if (tex_color & 0xFF000000) { DrawSinglePixel5551(pixel, tex_color, pixelID); } @@ -171,7 +171,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) { u16 *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride()); for (int x = pos0.x; x < pos1.x; x++) { Vec4 prim_color = v1.color0; - Vec4 tex_color = Vec4::FromRGBA(nearestFunc(s, t, texptr, texbufw, 0)); + Vec4 tex_color = nearestFunc(s, t, texptr, texbufw, 0); prim_color = ModulateRGBA(prim_color, tex_color); if (prim_color.a() > 0) { DrawSinglePixel5551(pixel, prim_color.ToRGBA(), pixelID); @@ -191,8 +191,8 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) { // Not really that fast but faster than triangle. for (int x = pos0.x; x < pos1.x; x++) { Vec4 prim_color = v1.color0; - Vec4 tex_color = Vec4::FromRGBA(nearestFunc(s, t, texptr, texbufw, 0)); - prim_color = GetTextureFunctionOutput(prim_color, tex_color); + Vec4 tex_color = nearestFunc(s, t, texptr, texbufw, 0); + prim_color = GetTextureFunctionOutput(ToVec4IntArg(prim_color), ToVec4IntArg(tex_color)); drawPixel(x, y, z, 255, ToVec4IntArg(prim_color), pixelID); s += ds; } diff --git a/GPU/Software/RasterizerRegCache.h b/GPU/Software/RasterizerRegCache.h index 7c819c77c..f16c11b71 100644 --- a/GPU/Software/RasterizerRegCache.h +++ b/GPU/Software/RasterizerRegCache.h @@ -64,13 +64,25 @@ typedef FakeGen::FakeXCodeBlock CodeBlock; // We also have the types of things that end up in regs. #if PPSSPP_ARCH(ARM64) typedef int32x4_t Vec4IntArg; +typedef int32x4_t Vec4IntResult; +typedef float32x4_t Vec4FloatArg; static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4 &a) { return vld1q_s32(a.AsArray()); } +static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4 &a) { return vld1q_s32(a.AsArray()); } +static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4 &a) { return vld1q_f32(a.AsArray()); } #elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) typedef __m128i Vec4IntArg; +typedef __m128i Vec4IntResult; +typedef __m128 Vec4FloatArg; static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4 &a) { return a.ivec; } +static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4 &a) { return a.ivec; } +static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4 &a) { return a.vec; } #else typedef const Math3D::Vec4 &Vec4IntArg; +typedef Math3D::Vec4 Vec4IntResult; +typedef const Math3D::Vec4 &Vec4FloatArg; static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4 &a) { return a; } +static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4 &a) { return a; } +static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4 &a) { return a; } #endif #if PPSSPP_ARCH(AMD64) && PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)) @@ -85,6 +97,7 @@ struct RegCache { FLAG_TEMP = 0x1000, VEC_ZERO = 0x0000, + VEC_RESULT = 0x0001, GEN_SRC_ALPHA = 0x0100, GEN_GSTATE = 0x0101, diff --git a/GPU/Software/Sampler.cpp b/GPU/Software/Sampler.cpp index 155f1359b..bc6982ef1 100644 --- a/GPU/Software/Sampler.cpp +++ b/GPU/Software/Sampler.cpp @@ -23,6 +23,7 @@ #include "Core/Reporting.h" #include "GPU/Common/TextureDecoder.h" #include "GPU/GPUState.h" +#include "GPU/Software/RasterizerRegCache.h" #include "GPU/Software/Sampler.h" #if defined(_M_SSE) @@ -30,13 +31,14 @@ #endif using namespace Math3D; +using namespace Rasterizer; extern u32 clut[4096]; namespace Sampler { -static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level); -static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level); +static Vec4IntResult SOFTRAST_CALL SampleNearest(int u, int v, const u8 *tptr, int bufw, int level); +static Vec4IntResult SOFTRAST_CALL SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level); std::mutex jitCacheLock; SamplerJitCache *jitCache = nullptr; @@ -305,8 +307,7 @@ struct Nearest4 { }; template -inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level) -{ +inline static Nearest4 SOFTRAST_CALL SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level) { Nearest4 res; if (!srcptr) { memset(res.v, 0, sizeof(res.v)); @@ -407,11 +408,12 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t } } -static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) { - return SampleNearest<1>(&u, &v, tptr, bufw, level); +static Vec4IntResult SOFTRAST_CALL SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) { + Nearest4 c = SampleNearest<1>(&u, &v, tptr, bufw, level); + return ToVec4IntResult(Vec4::FromRGBA(c.v[0])); } -static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) { +static Vec4IntResult SOFTRAST_CALL SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) { Nearest4 c = SampleNearest<4>(u, v, tptr, bufw, texlevel); Vec4 texcolor_tl = Vec4::FromRGBA(c.v[0]); @@ -420,7 +422,7 @@ static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tp Vec4 texcolor_br = Vec4::FromRGBA(c.v[3]); Vec4 t = texcolor_tl * (0x100 - frac_u) + texcolor_tr * frac_u; Vec4 b = texcolor_bl * (0x100 - frac_u) + texcolor_br * frac_u; - return ((t * (0x100 - frac_v) + b * frac_v) / (256 * 256)).ToRGBA(); + return ToVec4IntResult((t * (0x100 - frac_v) + b * frac_v) / (256 * 256)); } }; diff --git a/GPU/Software/Sampler.h b/GPU/Software/Sampler.h index 4c4eef1e9..72f6ffc56 100644 --- a/GPU/Software/Sampler.h +++ b/GPU/Software/Sampler.h @@ -26,10 +26,10 @@ namespace Sampler { -typedef u32 (*NearestFunc)(int u, int v, const u8 *tptr, int bufw, int level); +typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *NearestFunc)(int u, int v, const u8 *tptr, int bufw, int level); NearestFunc GetNearestFunc(); -typedef u32 (*LinearFunc)(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level); +typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *LinearFunc)(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level); LinearFunc GetLinearFunc(); struct Funcs { diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp index af3bdd6c8..e455b8274 100644 --- a/GPU/Software/SamplerX86.cpp +++ b/GPU/Software/SamplerX86.cpp @@ -62,6 +62,7 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) { RegCache::GEN_ARG_LEVEL, }); regCache_.ChangeReg(RAX, RegCache::GEN_RESULT); + regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT); BeginWrite(); const u8 *start = AlignCode16(); @@ -74,9 +75,9 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) { regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR); FixupBranch nonZeroSrc = J_CC(CC_NZ); - X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT); - XOR(32, R(resultReg), R(resultReg)); - regCache_.Unlock(resultReg, RegCache::GEN_RESULT); + X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT); + PXOR(vecResultReg, R(vecResultReg)); + regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT); zeroSrc = J(true); SetJumpTarget(nonZeroSrc); } @@ -89,6 +90,23 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) { return nullptr; } + X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT); + + X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT); + MOVD_xmm(vecResultReg, R(resultReg)); + regCache_.Release(resultReg, RegCache::GEN_RESULT); + + if (cpu_info.bSSE4_1) { + PMOVZXBD(vecResultReg, R(vecResultReg)); + } else { + X64Reg vecTempReg = regCache_.Find(RegCache::VEC_TEMP0); + PXOR(vecTempReg, R(vecTempReg)); + PUNPCKLBW(vecResultReg, R(vecTempReg)); + PUNPCKLWD(vecResultReg, R(vecTempReg)); + regCache_.Unlock(vecTempReg, RegCache::VEC_TEMP0); + } + regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT); + if (id.hasInvalidPtr) { SetJumpTarget(zeroSrc); } @@ -171,7 +189,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { if (id.hasInvalidPtr) { CMP(PTRBITS, R(R14), Imm8(0)); FixupBranch nonZeroSrc = J_CC(CC_NZ); - XOR(32, R(RAX), R(RAX)); + PXOR(XMM0, R(XMM0)); zeroSrc = J(true); SetJumpTarget(nonZeroSrc); } @@ -289,12 +307,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { ADDPS(fpScratchReg1, R(fpScratchReg3)); // Time to convert back to a single 32 bit value. - CVTPS2DQ(fpScratchReg1, R(fpScratchReg1)); - PACKSSDW(fpScratchReg1, R(fpScratchReg1)); - PACKUSWB(fpScratchReg1, R(fpScratchReg1)); - - const X64Reg resultReg = RAX; - MOVD_xmm(R(resultReg), fpScratchReg1); + CVTPS2DQ(XMM0, R(fpScratchReg1)); if (id.hasInvalidPtr) { SetJumpTarget(zeroSrc);