diff --git a/GPU/Software/Clipper.cpp b/GPU/Software/Clipper.cpp index 9035c8507..ee1fcb8f6 100644 --- a/GPU/Software/Clipper.cpp +++ b/GPU/Software/Clipper.cpp @@ -117,7 +117,7 @@ inline float clip_dotprod(const VertexData &vert, float A, float B, float C, flo } \ } -static void RotateUVThrough(const VertexData &tl, const VertexData &br, VertexData &tr, VertexData &bl) { +static void RotateUV(const VertexData &tl, const VertexData &br, VertexData &tr, VertexData &bl) { const int x1 = tl.screenpos.x; const int x2 = br.screenpos.x; const int y1 = tl.screenpos.y; @@ -194,6 +194,8 @@ void ProcessRect(const VertexData &v0, const VertexData &v1, BinManager &binner) bottomright = &buf[i]; } + RotateUV(*topleft, *bottomright, *topright, *bottomleft); + // Four triangles to do backfaces as well. Two of them will get backface culled. ProcessTriangleInternal(*topleft, *topright, *bottomright, buf[3], binner, true); ProcessTriangleInternal(*bottomright, *topright, *topleft, buf[3], binner, true); @@ -241,7 +243,7 @@ void ProcessRect(const VertexData &v0, const VertexData &v1, BinManager &binner) bottomright = &buf[i]; } - RotateUVThrough(v0, v1, *topright, *bottomleft); + RotateUV(v0, v1, *topright, *bottomleft); if (gstate.isModeClear() && !gstate.isDitherEnabled()) { binner.AddClearRect(v0, v1); diff --git a/GPU/Software/RasterizerRectangle.cpp b/GPU/Software/RasterizerRectangle.cpp index 363ca6a3f..844520b7f 100644 --- a/GPU/Software/RasterizerRectangle.cpp +++ b/GPU/Software/RasterizerRectangle.cpp @@ -270,8 +270,7 @@ bool RectangleFastPath(const VertexData &v0, const VertexData &v1, BinManager &b // Currently only works for TL/BR, which is the most common but not required. bool orient_check = xdiff >= 0 && ydiff >= 0; // We already have a fast path for clear in ClearRectangle. - bool state_check = !state.pixelID.clearMode && NoClampOrWrap(v0.texturecoords) && NoClampOrWrap(v1.texturecoords); - // TODO: No mipmap levels? Might be a font at level 1... + bool state_check = !state.pixelID.clearMode && !state.samplerID.hasAnyMips && NoClampOrWrap(v0.texturecoords) && NoClampOrWrap(v1.texturecoords); if ((coord_check || !state.enableTextures) && orient_check && state_check) { binner.AddSprite(v0, v1); return true; diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp index e7227471d..288c23de8 100644 --- a/GPU/Software/SamplerX86.cpp +++ b/GPU/Software/SamplerX86.cpp @@ -1539,27 +1539,7 @@ bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) { Describe("EnvBlend"); PACKSSDW(primColorReg, R(primColorReg)); - // Start out with the prim color side. Materialize a 255 to inverse resultReg and round. - PCMPEQD(tempReg, R(tempReg)); - PSRLW(tempReg, 8); - - // We're going to lose tempReg, so save the 255s. - X64Reg roundValueReg = regCache_.Alloc(RegCache::VEC_TEMP1); - MOVDQA(roundValueReg, R(tempReg)); - - PSUBW(tempReg, R(resultReg)); - PMULLW(tempReg, R(primColorReg)); - // Okay, now add the rounding value. - PADDW(tempReg, R(roundValueReg)); - regCache_.Release(roundValueReg, RegCache::VEC_TEMP1); - - if (id.useTextureAlpha) { - // Before we modify the texture color, let's calculate alpha. - PADDW(primColorReg, M(constOnes16_)); - PMULLW(primColorReg, R(resultReg)); - // We divide later. - } - + // First off, let's grab the color value. X64Reg idReg = GetSamplerID(); X64Reg texEnvReg = regCache_.Alloc(RegCache::VEC_TEMP1); if (cpu_info.bSSE4_1) { @@ -1570,22 +1550,66 @@ bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) { PUNPCKLBW(texEnvReg, R(zeroReg)); regCache_.Unlock(zeroReg, RegCache::VEC_ZERO); } - PMULLW(resultReg, R(texEnvReg)); - regCache_.Release(texEnvReg, RegCache::VEC_TEMP1); UnlockSamplerID(idReg); - // Add in the prim color side and divide. - PADDW(resultReg, R(tempReg)); - if (id.useColorDoubling) - PSRLW(resultReg, 7); - else - PSRLW(resultReg, 8); + // Now merge in the prim color so we have them interleaved, texenv low. + PUNPCKLWD(texEnvReg, R(primColorReg)); + + // Okay, now materialize 255 for inversing resultReg and rounding. + PCMPEQD(tempReg, R(tempReg)); + PSRLW(tempReg, 8); + + // If alpha is used, we want the roundup and factor to be zero. + if (id.useTextureAlpha) + PSRLDQ(tempReg, 10); + + // We're going to lose tempReg, so save the 255s. + X64Reg roundValueReg = regCache_.Alloc(RegCache::VEC_TEMP2); + MOVDQA(roundValueReg, R(tempReg)); + + // Okay, now inverse, then merge with resultReg low to match texenv low. + PSUBUSW(tempReg, R(resultReg)); + PUNPCKLWD(resultReg, R(tempReg)); if (id.useTextureAlpha) { - // We put the alpha in here, just need to divide it after that multiply. - PSRLW(primColorReg, 8); + // Before we multiply, let's include alpha in that multiply. + PADDW(primColorReg, M(constOnes16_)); + // Mask off everything but alpha, and move to the second highest short. + PSRLDQ(primColorReg, 6); + PSLLDQ(primColorReg, 12); + // Now simply merge in with texenv. + POR(texEnvReg, R(primColorReg)); } - useAlphaFrom(primColorReg); + + // Alright, now to multiply and add all in one go. Note this gives us DWORDs. + PMADDWD(resultReg, R(texEnvReg)); + regCache_.Release(texEnvReg, RegCache::VEC_TEMP1); + + // Now convert back to 16 bit and add the 255s for rounding. + if (cpu_info.bSSE4_1) { + PACKUSDW(resultReg, R(resultReg)); + } else { + PSLLD(resultReg, 16); + PSRAD(resultReg, 16); + PACKSSDW(resultReg, R(resultReg)); + } + PADDW(resultReg, R(roundValueReg)); + regCache_.Release(roundValueReg, RegCache::VEC_TEMP2); + + // Okay, divide by 256 or 128 depending on doubling (we want to preserve the precision.) + if (id.useColorDoubling && id.useTextureAlpha) { + // If doubling, we want to still divide alpha by 256. + PSRLW(resultReg, 7); + PSRLW(primColorReg, resultReg, 1); + useAlphaFrom(primColorReg); + } else if (id.useColorDoubling) { + PSRLW(resultReg, 7); + } else { + PSRLW(resultReg, 8); + } + + if (!id.useTextureAlpha) + useAlphaFrom(primColorReg); break; } diff --git a/test.py b/test.py index 77e7644df..52c863b71 100755 --- a/test.py +++ b/test.py @@ -161,6 +161,8 @@ tests_good = [ "gpu/texfunc/decal", "gpu/texfunc/modulate", "gpu/texfunc/replace", + "gpu/textures/mipmap", + "gpu/textures/rotate", "hash/hash", "hle/check_not_used_uids", "intr/intr", @@ -400,8 +402,6 @@ tests_next = [ "gpu/signals/jumps", "gpu/signals/simple", "gpu/simple/simple", - "gpu/textures/mipmap", - "gpu/textures/rotate", "gpu/triangle/triangle", "gpu/vertices/colors", "gpu/vertices/texcoords",