diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
index e114b2f94..2fedc0d41 100644
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@@ -1240,7 +1240,35 @@ inline Vec4 TriangleEdge::StepX(const Vec4 &w) {
 }
 inline Vec4 TriangleEdge::StepY(const Vec4 &w) {
+#if defined(_M_SSE) && !defined(_M_IX86)
+	return _mm_add_epi32(stepY.ivec, w.ivec);  // Packed 32-bit add; SSE form of the fallback below.
+#else  // Portable path.
 	return w + stepY;
+#endif  // _M_SSE && !_M_IX86
+}
+
+inline Vec4 MakeMask(const Vec4 &w0, const Vec4 &w1, const Vec4 &w2, const Vec4 &bias0, const Vec4 &bias1, const Vec4 &bias2) {
+#if defined(_M_SSE) && !defined(_M_IX86)
+	// Same result as the fallback: add each bias to its w, then OR the three sums together.
+	const __m128i sum0 = _mm_add_epi32(w0.ivec, bias0.ivec);
+	const __m128i sum1 = _mm_add_epi32(w1.ivec, bias1.ivec);
+	const __m128i sum2 = _mm_add_epi32(w2.ivec, bias2.ivec);
+	return _mm_or_si128(_mm_or_si128(sum0, sum1), sum2);
+#else
+	return (w0 + bias0) | (w1 + bias1) | (w2 + bias2);
+#endif
+}
+
+inline bool AnyMask(const Vec4 &mask) {
+#if defined(_M_SSE) && !defined(_M_IX86)
+	// True if any lane is >= 0.  ANDing lanes keeps the sign bit only when every lane is negative.
+	const __m128i halfAnd = _mm_and_si128(_mm_shuffle_epi32(mask.ivec, _MM_SHUFFLE(3, 2, 3, 2)), mask.ivec);
+	const __m128i fullAnd = _mm_and_si128(_mm_shuffle_epi32(halfAnd, _MM_SHUFFLE(1, 1, 1, 1)), halfAnd);
+	// Lane 0 is now the AND of all four lanes, so one sign test is enough.
+	return _mm_cvtsi128_si32(fullAnd) >= 0;
+#else
+	return !(mask.x < 0 && mask.y < 0 && mask.z < 0 && mask.w < 0);
+#endif
 }
 template
@@ -1320,8 +1348,8 @@ void DrawTriangleSlice(
 				 p.x = (p.x + 2) & 0x3FF) {
 				// If p is on or inside all edges, render pixel
-				Vec4 mask = (w0 + bias0) | (w1 + bias1) | (w2 + bias2);
-				if (mask.x >= 0 || mask.y >= 0 || mask.z >= 0 || mask.w >= 0) {
+				Vec4 mask = MakeMask(w0, w1, w2, bias0, bias1, bias2);
+				if (AnyMask(mask)) {
 					Vec4 wsum_recip = (w0 + w1 + w2).Cast().Reciprocal();
 					Vec4 prim_color[4];