Delete a lot of specialized alpha checking code.

By now, this was only used to check alpha in CLUTs, and the generic functions will not actually be any slower.
Henrik Rydgård 2022-04-15 12:34:50 +02:00
parent 062423597d
commit c4dfbf4f1a
12 changed files with 153 additions and 576 deletions
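
For context, the surviving generic path handles a CLUT alpha check roughly like the sketch below (clutBuf and numColors are illustrative names, not taken from this commit):

u32 alphaSum = 0xFFFFFFFF;
if (clutformat == GE_CMODE_32BIT_ABGR8888) {
	CheckMask32((const u32 *)clutBuf, numColors, &alphaSum);
} else {
	CheckMask16((const u16 *)clutBuf, numColors, &alphaSum);
}
// A format-dependent full-alpha mask then decides whether every entry was opaque.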

@@ -1430,118 +1430,6 @@ inline u32 TfmtRawToFullAlpha(GETextureFormat fmt) {
	return 0;
}

#ifdef _M_SSE

inline u32 SSEReduce32And(__m128i value) {
	// TODO: Should use a shuffle instead of srli, probably.
	// Note that _mm_srli_si128 shifts by *bytes*: 8 folds the upper two
	// lanes onto the lower two, then 4 folds lane 1 onto lane 0.
	value = _mm_and_si128(value, _mm_srli_si128(value, 8));
	value = _mm_and_si128(value, _mm_srli_si128(value, 4));
	return _mm_cvtsi128_si32(value);
}

inline u32 SSEReduce16And(__m128i value) {
	// TODO: Should use a shuffle instead of srli, probably.
	// Byte shifts again: fold 8, then 4, then 2 bytes so the low 16 bits
	// end up holding the AND of all eight lanes.
	value = _mm_and_si128(value, _mm_srli_si128(value, 8));
	value = _mm_and_si128(value, _mm_srli_si128(value, 4));
	value = _mm_and_si128(value, _mm_srli_si128(value, 2));
	// Only the low 16 bits are meaningful after the reduction.
	return _mm_cvtsi128_si32(value) & 0xFFFF;
}
#endif

#if PPSSPP_ARCH(ARM_NEON)

inline u32 NEONReduce32And(uint32x4_t value) {
	// TODO: Maybe a shuffle and a vector AND, or something?
	return vgetq_lane_u32(value, 0) & vgetq_lane_u32(value, 1) & vgetq_lane_u32(value, 2) & vgetq_lane_u32(value, 3);
}

#endif
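
One possible answer to that TODO, not part of this commit: fold the upper half of the vector onto the lower half with a single vector AND before going scalar. A sketch (the _Folded name is hypothetical):

inline u32 NEONReduce32And_Folded(uint32x4_t value) {
	// vand_u32 ANDs the two 64-bit halves in one instruction; only the
	// final pairwise AND is done in scalar code.
	uint32x2_t folded = vand_u32(vget_low_u32(value), vget_high_u32(value));
	return vget_lane_u32(folded, 0) & vget_lane_u32(folded, 1);
}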

// TODO: SSE/SIMD
// At least on x86, the compiler actually SIMDs these pretty well.
void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) {
	u16 mask = 0xFFFF;
	for (int i = 0; i < width; i++) {
		u16 color = src[i];
		mask &= color;
		dst[i] = color;
	}
	*outMask &= (u32)mask;
}
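
If the autovectorized loop above ever shows up in profiles, an SSE2 path could mirror the 32-bit variant below. An untested sketch (CopyAndSumMask16_SSE is a hypothetical name), reusing SSEReduce16And from above:

void CopyAndSumMask16_SSE(u16 *dst, const u16 *src, int width, u32 *outMask) {
	u16 mask = 0xFFFF;
	if (width >= 8) {
		__m128i wideMask = _mm_set1_epi16((short)0xFFFF);
		while (width >= 8) {
			// Load eight 16-bit pixels, AND them into the running mask, store.
			__m128i color = _mm_loadu_si128((const __m128i *)src);
			wideMask = _mm_and_si128(wideMask, color);
			_mm_storeu_si128((__m128i *)dst, color);
			src += 8;
			dst += 8;
			width -= 8;
		}
		mask = (u16)SSEReduce16And(wideMask);
	}
	// Scalar tail for the last few pixels.
	for (int i = 0; i < width; i++) {
		u16 color = src[i];
		mask &= color;
		dst[i] = color;
	}
	*outMask &= (u32)mask;
}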

// Used in video playback, so it's nice to have it be fast.
void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) {
	u32 mask = 0xFFFFFFFF;

#ifdef _M_SSE
	if (width >= 4) {
		__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
		while (width >= 4) {
			__m128i color = _mm_loadu_si128((const __m128i *)src);
			wideMask = _mm_and_si128(wideMask, color);
			_mm_storeu_si128((__m128i *)dst, color);
			src += 4;
			dst += 4;
			width -= 4;
		}
		mask = SSEReduce32And(wideMask);
	}
#elif PPSSPP_ARCH(ARM_NEON)
	if (width >= 4) {
		uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);
		while (width >= 4) {
			uint32x4_t colors = vld1q_u32(src);
			wideMask = vandq_u32(wideMask, colors);
			vst1q_u32(dst, colors);
			src += 4;
			dst += 4;
			width -= 4;
		}
		mask = NEONReduce32And(wideMask);
	}
#endif

	for (int i = 0; i < width; i++) {
		u32 color = src[i];
		mask &= color;
		dst[i] = color;
	}
	*outMask &= (u32)mask;
}
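
A hypothetical call pattern from a video frame copy, accumulating one mask across all lines (names are illustrative, not from this commit):

u32 alphaSum = 0xFFFFFFFF;
for (int y = 0; y < height; y++) {
	CopyAndSumMask32(dst + y * dstStride, src + y * srcStride, width, &alphaSum);
}
// For 32-bit color, full alpha means the top byte survived in every pixel.
bool videoFullAlpha = (alphaSum & 0xFF000000) == 0xFF000000;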

void CheckMask16(const u16 *src, int width, u32 *outMask) {
	u16 mask = 0xFFFF;
	for (int i = 0; i < width; i++) {
		mask &= src[i];
	}
	*outMask &= (u32)mask;
}

void CheckMask32(const u32 *src, int width, u32 *outMask) {
	u32 mask = 0xFFFFFFFF;

#ifdef _M_SSE
	if (width >= 4) {
		__m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
		while (width >= 4) {
			wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((const __m128i *)src));
			src += 4;
			width -= 4;
		}
		mask = SSEReduce32And(wideMask);
	}
#elif PPSSPP_ARCH(ARM_NEON)
	if (width >= 4) {
		uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);
		while (width >= 4) {
			wideMask = vandq_u32(wideMask, vld1q_u32(src));
			src += 4;
			width -= 4;
		}
		mask = NEONReduce32And(wideMask);
	}
#endif

	for (int i = 0; i < width; i++) {
		mask &= src[i];
	}
	*outMask &= (u32)mask;
}

CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32bit) {
	u32 alphaSum = 0xFFFFFFFF;
	u32 fullAlphaMask = 0x0;
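
(The rest of the function is outside this hunk. For orientation, these two accumulators are typically resolved at the very end, roughly like this hedged paraphrase, assuming the usual CHECKALPHA_FULL / CHECKALPHA_ANY results:)

	// alphaSum accumulates the AND of every decoded pixel; fullAlphaMask holds
	// the format's alpha bits (see TfmtRawToFullAlpha above).
	if (fullAlphaMask != 0 && (alphaSum & fullAlphaMask) == fullAlphaMask)
		return CHECKALPHA_FULL;
	return CHECKALPHA_ANY;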