diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index 710421f19..e8c5a417e 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -968,6 +968,29 @@ inline void ApplyTexturing(Vec3 &prim_color_rgb, int &prim_color_a, float s // Nearest filtering only. Round texcoords or just chop bits? texcolor = Vec4::FromRGBA(SampleNearest(texlevel, u[0], v[0], tptr, bufwbits)); } else { +#if defined(_M_SSE) + MEMORY_ALIGNED16(u32 tc[2]); + MEMORY_ALIGNED16(u32 bc[2]); + tc[0] = SampleNearest(texlevel, u[0], v[0], tptr, bufwbits); + tc[1] = SampleNearest(texlevel, u[1], v[1], tptr, bufwbits); + bc[0] = SampleNearest(texlevel, u[2], v[2], tptr, bufwbits); + bc[1] = SampleNearest(texlevel, u[3], v[3], tptr, bufwbits); + + const __m128i z = _mm_setzero_si128(); + + __m128i tvec = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)tc), z); + tvec = _mm_mullo_epi16(tvec, _mm_set1_epi16(0x100 - frac_v)); + __m128i bvec = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bc), z); + bvec = _mm_mullo_epi16(bvec, _mm_set1_epi16(frac_v)); + + // This multiplies the left and right sides. We shift right after, although this may round down... + __m128i rowmult = _mm_set_epi16(frac_u, frac_u, frac_u, frac_u, 0x100 - frac_u, 0x100 - frac_u, 0x100 - frac_u, 0x100 - frac_u); + __m128i tmp = _mm_mulhi_epu16(_mm_add_epi16(tvec, bvec), rowmult); + + // Now we need to add the left and right sides together. + __m128i res = _mm_add_epi16(tmp, _mm_shuffle_epi32(tmp, _MM_SHUFFLE(3, 2, 3, 2))); + texcolor = Vec4(_mm_unpacklo_epi16(res, z)); +#else Vec4 texcolor_tl = Vec4::FromRGBA(SampleNearest(texlevel, u[0], v[0], tptr, bufwbits)); Vec4 texcolor_tr = Vec4::FromRGBA(SampleNearest(texlevel, u[1], v[1], tptr, bufwbits)); Vec4 texcolor_bl = Vec4::FromRGBA(SampleNearest(texlevel, u[2], v[2], tptr, bufwbits)); @@ -976,6 +999,7 @@ inline void ApplyTexturing(Vec3 &prim_color_rgb, int &prim_color_a, float s Vec4 t = texcolor_tl * (0x100 - frac_u) + texcolor_tr * frac_u; Vec4 b = texcolor_bl * (0x100 - frac_u) + texcolor_br * frac_u; texcolor = (t * (0x100 - frac_v) + b * frac_v) / (256 * 256); +#endif } Vec4 out = GetTextureFunctionOutput(prim_color_rgb, prim_color_a, texcolor); prim_color_rgb = out.rgb();