softgpu: Simplify 5551 blending fast path.

Since it only supports multiply and add, let's just stick with that.
This commit is contained in:
Unknown W. Brackets 2022-09-24 18:19:22 -07:00
parent 1eeb4f0bcf
commit c47d7eab38
4 changed files with 244 additions and 211 deletions

View file

@ -270,206 +270,6 @@ static inline bool IsRightSideOrFlatBottomLine(const Vec2<int>& vertex, const Ve
}
}
// Resolves the per-channel factor that the source color is multiplied by.
// `source` is the incoming color, `dst` the existing color, and `fix` the
// packed fixed color used for FIX (and for any unrecognized factor value).
// Doubled factors are not clamped here, so they may exceed 255.
static inline Vec3<int> GetSourceFactor(PixelBlendFactor factor, const Vec4<int> &source, const Vec4<int> &dst, uint32_t fix) {
	switch (factor) {
	// Constant factors.
	case PixelBlendFactor::ZERO:
		return Vec3<int>::AssignToAll(0);

	case PixelBlendFactor::ONE:
		return Vec3<int>::AssignToAll(255);

	// For the source side, the "other" color is the destination.
	case PixelBlendFactor::OTHERCOLOR:
		return dst.rgb();

	case PixelBlendFactor::INVOTHERCOLOR:
		return Vec3<int>::AssignToAll(255) - dst.rgb();

	// Source alpha based factors.
	case PixelBlendFactor::SRCALPHA:
#if defined(_M_SSE)
		// Broadcast lane 3 (alpha) to every lane.
		return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));
#else
		return Vec3<int>::AssignToAll(source.a());
#endif

	case PixelBlendFactor::INVSRCALPHA:
#if defined(_M_SSE)
		// 255 - alpha, with alpha broadcast from lane 3.
		return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));
#else
		return Vec3<int>::AssignToAll(255 - source.a());
#endif

	case PixelBlendFactor::DOUBLESRCALPHA:
		return Vec3<int>::AssignToAll(2 * source.a());

	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		// The doubled alpha is capped at 255 before inverting, so this never goes negative.
		return Vec3<int>::AssignToAll(255 - std::min(2 * source.a(), 255));

	// Destination alpha based factors.
	case PixelBlendFactor::DSTALPHA:
		return Vec3<int>::AssignToAll(dst.a());

	case PixelBlendFactor::INVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - dst.a());

	case PixelBlendFactor::DOUBLEDSTALPHA:
		return Vec3<int>::AssignToAll(2 * dst.a());

	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - std::min(2 * dst.a(), 255));

	// Any other source factor value (> 10) is treated as FIXA too.
	case PixelBlendFactor::FIX:
	default:
		return Vec3<int>::FromRGB(fix);
	}
}
// Resolves the per-channel factor that the destination color is multiplied by.
// `source` is the incoming color, `dst` the existing color, and `fix` the
// packed fixed color used for FIX (and for any unrecognized factor value).
// Doubled factors are not clamped here, so they may exceed 255.
static inline Vec3<int> GetDestFactor(PixelBlendFactor factor, const Vec4<int> &source, const Vec4<int> &dst, uint32_t fix) {
	switch (factor) {
	// Constant factors.
	case PixelBlendFactor::ZERO:
		return Vec3<int>::AssignToAll(0);

	case PixelBlendFactor::ONE:
		return Vec3<int>::AssignToAll(255);

	// For the destination side, the "other" color is the source.
	case PixelBlendFactor::OTHERCOLOR:
		return source.rgb();

	case PixelBlendFactor::INVOTHERCOLOR:
		return Vec3<int>::AssignToAll(255) - source.rgb();

	// Source alpha based factors.
	case PixelBlendFactor::SRCALPHA:
#if defined(_M_SSE)
		// Broadcast lane 3 (alpha) to every lane.
		return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));
#else
		return Vec3<int>::AssignToAll(source.a());
#endif

	case PixelBlendFactor::INVSRCALPHA:
#if defined(_M_SSE)
		// 255 - alpha, with alpha broadcast from lane 3.
		return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));
#else
		return Vec3<int>::AssignToAll(255 - source.a());
#endif

	case PixelBlendFactor::DOUBLESRCALPHA:
		return Vec3<int>::AssignToAll(2 * source.a());

	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		// The doubled alpha is capped at 255 before inverting, so this never goes negative.
		return Vec3<int>::AssignToAll(255 - std::min(2 * source.a(), 255));

	// Destination alpha based factors.
	case PixelBlendFactor::DSTALPHA:
		return Vec3<int>::AssignToAll(dst.a());

	case PixelBlendFactor::INVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - dst.a());

	case PixelBlendFactor::DOUBLEDSTALPHA:
		return Vec3<int>::AssignToAll(2 * dst.a());

	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - std::min(2 * dst.a(), 255));

	// Any other dest factor value (> 10) is treated as FIXB too.
	case PixelBlendFactor::FIX:
	default:
		return Vec3<int>::FromRGB(fix);
	}
}
// Deliberately not inline - the compiler never chose to inline it anyway, it's too complex.
//
// Blends `source` against `dst` using the equation and factors selected by
// `pixelID`, and returns the resulting RGB.  Unknown equations pass the source
// color through unchanged.
Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &source, const Vec4<int> &dst) {
	// Note: These factors are never negative, but doubling can push them above 255.
	Vec3<int> srcfactor = GetSourceFactor(pixelID.AlphaBlendSrc(), source, dst, pixelID.cached.alphaBlendSrc);
	Vec3<int> dstfactor = GetDestFactor(pixelID.AlphaBlendDst(), source, dst, pixelID.cached.alphaBlendDst);

	const auto blendEq = pixelID.AlphaBlendEq();
	switch (blendEq) {
	// The three multiply modes share the same products, and only differ in how they're combined.
	case GE_BLENDMODE_MUL_AND_ADD:
	case GE_BLENDMODE_MUL_AND_SUBTRACT:
	case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
	{
#if defined(_M_SSE)
		// Work in 16 bit lanes so mulhi can be used; 4 bits of decimal make its 16 bit shift free.
		const __m128i zero = _mm_setzero_si128();
		const __m128i half = _mm_set1_epi16(1 << 3);

		const __m128i srcColor = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
		const __m128i srcScale = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
		const __m128i srcTerm = _mm_mulhi_epi16(srcColor, srcScale);

		const __m128i dstColor = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
		const __m128i dstScale = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
		const __m128i dstTerm = _mm_mulhi_epi16(dstColor, dstScale);

		__m128i combined;
		if (blendEq == GE_BLENDMODE_MUL_AND_ADD) {
			combined = _mm_adds_epi16(srcTerm, dstTerm);
		} else if (blendEq == GE_BLENDMODE_MUL_AND_SUBTRACT) {
			// Subtraction is floored at zero in this path.
			combined = _mm_max_epi16(_mm_subs_epi16(srcTerm, dstTerm), zero);
		} else {
			combined = _mm_max_epi16(_mm_subs_epi16(dstTerm, srcTerm), zero);
		}

		// Widen the low four 16 bit lanes back out to 32 bit.
		return Vec3<int>(_mm_unpacklo_epi16(combined, zero));
#else
		// Scalar form: color and factor are each offset by half a unit, i.e. (2c + 1) * (2f + 1) / 1024.
		static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
		Vec3<int> srcTerm = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
		Vec3<int> dstTerm = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;

		if (blendEq == GE_BLENDMODE_MUL_AND_ADD)
			return srcTerm + dstTerm;
		if (blendEq == GE_BLENDMODE_MUL_AND_SUBTRACT)
			return srcTerm - dstTerm;
		return dstTerm - srcTerm;
#endif
	}

	// The remaining modes ignore the factors and compare the colors per channel.
	case GE_BLENDMODE_MIN:
		return Vec3<int>(
			std::min(source.r(), dst.r()),
			std::min(source.g(), dst.g()),
			std::min(source.b(), dst.b()));

	case GE_BLENDMODE_MAX:
		return Vec3<int>(
			std::max(source.r(), dst.r()),
			std::max(source.g(), dst.g()),
			std::max(source.b(), dst.b()));

	case GE_BLENDMODE_ABSDIFF:
		return Vec3<int>(
			::abs(source.r() - dst.r()),
			::abs(source.g() - dst.g()),
			::abs(source.b() - dst.b()));

	default:
		return source.rgb();
	}
}
static inline Vec4IntResult SOFTRAST_CALL ApplyTexturing(float s, float t, int x, int y, Vec4IntArg prim_color, int texlevel, int frac_texlevel, bool bilinear, const RasterizerState &state) {
const u8 **tptr0 = const_cast<const u8 **>(&state.texptr[texlevel]);
const uint16_t *bufw0 = &state.texbufw[texlevel];