diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp
index be528411c..8a5f98e30 100644
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@@ -659,12 +659,16 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w
 }
 
 CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	// Use SIMD (SSE2 or NEON) if aligned to 16 bytes / 8 pixels (usually the case.)
 	if ((w & 7) == 0 && (stride & 7) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h);
-	}
+#elif PPSSPP_ARCH(ARMV7) || PPSSPP_ARCH(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaRGBA4444NEON(pixelData, stride, w, h);
+		}
 #endif
+	}
 
 	const u32 *p = pixelData;
 	const int w2 = (w + 1) / 2;
@@ -688,12 +692,16 @@ CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w
 }
 
 CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	// Use SIMD (SSE2 or NEON) if aligned to 16 bytes / 8 pixels (usually the case.)
 	if ((w & 7) == 0 && (stride & 7) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h);
-	}
+#elif PPSSPP_ARCH(ARMV7) || PPSSPP_ARCH(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaRGBA5551NEON(pixelData, stride, w, h);
+		}
 #endif
+	}
 
 	const u32 *p = pixelData;
 	const int w2 = (w + 1) / 2;
diff --git a/GPU/Common/TextureDecoderNEON.cpp b/GPU/Common/TextureDecoderNEON.cpp
index 2b98fdfdf..2a4d06572 100644
--- a/GPU/Common/TextureDecoderNEON.cpp
+++ b/GPU/Common/TextureDecoderNEON.cpp
@@ -342,4 +342,43 @@ CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w,
 	return CHECKALPHA_FULL;
 }
 
+// Shared scan for 16-bit pixel formats. Returns CHECKALPHA_FULL only when every
+// pixel has all bits of alphaMask set, and CHECKALPHA_ANY otherwise.
+// Pixels are loaded 8 at a time, so w is expected to be a multiple of 8; the
+// *Basic dispatchers in TextureDecoder.cpp only take this path when (w & 7) == 0.
+// stride is added to a u16 pointer once per row, i.e. it is counted in 16-bit pixels.
+static inline CheckAlphaResult CheckAlphaMask16NEON(const u32 *pixelData, int stride, int w, int h, u16 alphaMask) {
+	const u16 *p = (const u16 *)pixelData;
+
+	const uint16x8_t mask = vdupq_n_u16(alphaMask);
+	// Running AND of every pixel seen so far; equals mask while no mask bit has been cleared.
+	uint16x8_t bits = mask;
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w; i += 8) {
+			const uint16x8_t a = vld1q_u16(&p[i]);
+			bits = vandq_u16(bits, a);
+		}
+
+		// A cleared mask bit becomes a set bit after the XOR; test once per row to exit early.
+		uint16x8_t result = veorq_u16(bits, mask);
+		if (VectorIsNonZeroNEON(result)) {
+			return CHECKALPHA_ANY;
+		}
+
+		p += stride;
+	}
+
+	return CHECKALPHA_FULL;
+}
+
+// RGBA4444: checks the top four bits (0xF000) of each 16-bit pixel.
+CheckAlphaResult CheckAlphaRGBA4444NEON(const u32 *pixelData, int stride, int w, int h) {
+	return CheckAlphaMask16NEON(pixelData, stride, w, h, 0xF000);
+}
+
+// RGBA5551: checks the top bit (0x8000) of each 16-bit pixel.
+CheckAlphaResult CheckAlphaRGBA5551NEON(const u32 *pixelData, int stride, int w, int h) {
+	return CheckAlphaMask16NEON(pixelData, stride, w, h, 0x8000);
+}
+
 #endif
diff --git a/GPU/Common/TextureDecoderNEON.h b/GPU/Common/TextureDecoderNEON.h
index 1471b41d3..02c3a8ec7 100644
--- a/GPU/Common/TextureDecoderNEON.h
+++ b/GPU/Common/TextureDecoderNEON.h
@@ -24,3 +24,5 @@ u32 ReliableHash32NEON(const void *input, size_t len, u32 seed);
 CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h);
 CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h);
 CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaRGBA4444NEON(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaRGBA5551NEON(const u32 *pixelData, int stride, int w, int h);