GPU: Allow gcc/clang/icc runtime SSE4 usage.
All our builds before were only using SSE4 in jit...
parent eee62849fe
commit 8a00c2d233
6 changed files with 104 additions and 58 deletions
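The pattern repeated throughout the diff below: each SSE4.1-only path is pulled into a small helper tagged with a per-function target attribute on gcc/clang/icc, so the compiler may emit SSE4.1 instructions for that helper even when the file is built without -msse4.1, and the caller selects the helper at runtime from CPU detection (cpu_info.bSSE4_1). MSVC needs no attribute because its intrinsics are not gated on /arch, which is why the attribute is wrapped in a compiler check. A minimal, self-contained sketch of the pattern follows; the names PackRow/PackRowSSE4 and the hasSSE41 flag are illustrative assumptions, not PPSSPP code:

// Sketch of the runtime-dispatch pattern only; not part of this commit.
#include <emmintrin.h>   // SSE2 intrinsics
#include <smmintrin.h>   // SSE4.1 intrinsics (_mm_packus_epi32)
#include <cstdint>

#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
// Allow SSE4.1 codegen for this one function without building the whole file with -msse4.1.
[[gnu::target("sse4.1")]]
#endif
static inline void PackRowSSE4(uint16_t *dst, const uint32_t *src) {
	// _mm_packus_epi32 (pack 32-bit to unsigned 16-bit with saturation) is SSE4.1-only.
	__m128i c1 = _mm_loadu_si128((const __m128i *)&src[0]);
	__m128i c2 = _mm_loadu_si128((const __m128i *)&src[4]);
	_mm_storeu_si128((__m128i *)dst, _mm_packus_epi32(c1, c2));
}

// hasSSE41 stands in for cpu_info.bSSE4_1 (assumed flag); the SSE4.1 helper is
// only called after the runtime check, so older CPUs never execute it.
static void PackRow(uint16_t *dst, const uint32_t *src, bool hasSSE41) {
	if (hasSSE41) {
		PackRowSSE4(dst, src);
		return;
	}
	// Scalar fallback: the same clamp to 16 bits for in-range values.
	for (int i = 0; i < 8; ++i)
		dst[i] = src[i] > 0xFFFF ? (uint16_t)0xFFFF : (uint16_t)src[i];
}

The same dispatch shape (attribute-tagged helper plus a cpu_info.bSSE4_1 check at the call site) is what the hunks below apply to the color conversion, texture scaling, vector math, and software rasterizer code.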
@@ -25,9 +25,6 @@
 
 #ifdef _M_SSE
 #include <emmintrin.h>
-#endif
-
-#if _M_SSE >= 0x401
 #include <smmintrin.h>
 #endif
 
@@ -181,19 +178,15 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
 	}
 }
 
-void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline void ConvertRGBA8888ToRGBA5551_SSE4(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
 	const __m128i maskAG = _mm_set1_epi32(0x8000F800);
 	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
 	const __m128i mask = _mm_set1_epi32(0x0000FFFF);
 
-	const __m128i *srcp = (const __m128i *)src;
-	__m128i *dstp = (__m128i *)dst;
-	u32 sseChunks = (numPixels / 4) & ~1;
-	// SSE 4.1 required for _mm_packus_epi32.
-	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
-		sseChunks = 0;
-	}
 	for (u32 i = 0; i < sseChunks; i += 2) {
 		__m128i c1 = _mm_load_si128(&srcp[i + 0]);
 		__m128i c2 = _mm_load_si128(&srcp[i + 1]);
@@ -213,6 +206,21 @@ void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
 
 		_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));
 	}
+}
+#endif
+
+void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
+#if defined(_M_SSE)
+	const __m128i *srcp = (const __m128i *)src;
+	__m128i *dstp = (__m128i *)dst;
+	u32 sseChunks = (numPixels / 4) & ~1;
+	// SSE 4.1 required for _mm_packus_epi32.
+	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
+		sseChunks = 0;
+	} else {
+		ConvertRGBA8888ToRGBA5551_SSE4(dstp, srcp, sseChunks);
+	}
+
 	// The remainder starts right after those done via SSE.
 	u32 i = sseChunks * 4;
 #else
@@ -223,19 +231,15 @@ void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
 	}
 }
 
-void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline void ConvertBGRA8888ToRGBA5551_SSE4(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
 	const __m128i maskAG = _mm_set1_epi32(0x8000F800);
 	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
 	const __m128i mask = _mm_set1_epi32(0x0000FFFF);
 
-	const __m128i *srcp = (const __m128i *)src;
-	__m128i *dstp = (__m128i *)dst;
-	u32 sseChunks = (numPixels / 4) & ~1;
-	// SSE 4.1 required for _mm_packus_epi32.
-	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
-		sseChunks = 0;
-	}
 	for (u32 i = 0; i < sseChunks; i += 2) {
 		__m128i c1 = _mm_load_si128(&srcp[i + 0]);
 		__m128i c2 = _mm_load_si128(&srcp[i + 1]);
@@ -255,6 +259,21 @@ void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
 
 		_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));
 	}
+}
+#endif
+
+void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
+#if defined(_M_SSE)
+	const __m128i *srcp = (const __m128i *)src;
+	__m128i *dstp = (__m128i *)dst;
+	u32 sseChunks = (numPixels / 4) & ~1;
+	// SSE 4.1 required for _mm_packus_epi32.
+	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
+		sseChunks = 0;
+	} else {
+		ConvertBGRA8888ToRGBA5551_SSE4(dstp, srcp, sseChunks);
+	}
+
 	// The remainder starts right after those done via SSE.
 	u32 i = sseChunks * 4;
 #else
@@ -29,9 +29,7 @@
 
 #ifdef _M_SSE
 #include <emmintrin.h>
-#if _M_SSE >= 0x401
 #include <smmintrin.h>
-#endif
 
 u32 QuickTexHashSSE2(const void *checkp, u32 size) {
 	u32 check = 0;
@@ -31,7 +31,8 @@
 #include "Common/CPUDetect.h"
 #include "ext/xbrz/xbrz.h"
 
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
+#include <emmintrin.h>
 #include <smmintrin.h>
 #endif
 
@@ -281,9 +282,12 @@ void scaleBicubicT(u32* data, u32* out, int w, int h, int l, int u) {
 		}
 	}
 }
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
 template<int f, int T>
-void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
 	int outw = w*f;
 	for (int yb = 0; yb < (u - l)*f / BLOCK_SIZE + 1; ++yb) {
 		for (int xb = 0; xb < w*f / BLOCK_SIZE + 1; ++xb) {
@@ -321,7 +325,7 @@ void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
 #endif
 
 void scaleBicubicBSpline(int factor, u32* data, u32* out, int w, int h, int l, int u) {
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
 	if (cpu_info.bSSE4_1) {
 		switch (factor) {
 		case 2: scaleBicubicTSSE41<2, 0>(data, out, w, h, l, u); break; // when I first tested this,
@@ -339,13 +343,13 @@ void scaleBicubicBSpline(int factor, u32* data, u32* out, int w, int h, int l, i
 		case 5: scaleBicubicT<5, 0>(data, out, w, h, l, u); break; // any of these break statements
 		default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
 		}
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
 	}
 #endif
 }
 
 void scaleBicubicMitchell(int factor, u32* data, u32* out, int w, int h, int l, int u) {
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
 	if (cpu_info.bSSE4_1) {
 		switch (factor) {
 		case 2: scaleBicubicTSSE41<2, 1>(data, out, w, h, l, u); break;
@@ -363,7 +367,7 @@ void scaleBicubicMitchell(int factor, u32* data, u32* out, int w, int h, int l,
 		case 5: scaleBicubicT<5, 1>(data, out, w, h, l, u); break;
 		default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
 		}
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
 	}
 #endif
 }
@@ -114,7 +114,9 @@ __m128 SSENormalizeMultiplierSSE2(__m128 v)
 	return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));
 }
 
-#if _M_SSE >= 0x401
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
 __m128 SSENormalizeMultiplierSSE4(__m128 v)
 {
 	return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0xFF));
@@ -126,12 +128,7 @@ __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
 		return SSENormalizeMultiplierSSE4(v);
 	return SSENormalizeMultiplierSSE2(v);
 }
-#else
-__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
-{
-	return SSENormalizeMultiplierSSE2(v);
-}
-#endif
+
 template<>
 Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
 {
@@ -25,10 +25,8 @@
 
 #if defined(_M_SSE)
 #include <emmintrin.h>
-#if _M_SSE >= 0x401
 #include <smmintrin.h>
 #endif
-#endif
 
 #if PPSSPP_ARCH(ARM_NEON)
 #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
@@ -40,9 +40,6 @@
 
 #if defined(_M_SSE)
 #include <emmintrin.h>
-#endif
-
-#if _M_SSE >= 0x401
 #include <smmintrin.h>
 #endif
 
@@ -583,6 +580,17 @@ struct TriangleEdge {
 	Vec4<int> stepY;
 };
 
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline __m128i SOFTRAST_CALL TriangleEdgeStartSSE4(__m128i initX, __m128i initY, int xf, int yf, int c) {
+	initX = _mm_mullo_epi32(initX, _mm_set1_epi32(xf));
+	initY = _mm_mullo_epi32(initY, _mm_set1_epi32(yf));
+	return _mm_add_epi32(_mm_add_epi32(initX, initY), _mm_set1_epi32(c));
+}
+#endif
+
 template <bool useSSE4>
 Vec4<int> TriangleEdge<useSSE4>::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) {
 	// Start at pixel centers.
@@ -597,12 +605,9 @@ Vec4<int> TriangleEdge<useSSE4>::Start(const ScreenCoord
 	stepX = Vec4<int>::AssignToAll(xf * 16 * 2);
 	stepY = Vec4<int>::AssignToAll(yf * 16 * 2);
 
-#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
-	if (useSSE4) {
-		initX.ivec = _mm_mullo_epi32(initX.ivec, _mm_set1_epi32(xf));
-		initY.ivec = _mm_mullo_epi32(initY.ivec, _mm_set1_epi32(yf));
-		return _mm_add_epi32(_mm_add_epi32(initX.ivec, initY.ivec), _mm_set1_epi32(c));
-	}
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+	if (useSSE4)
+		return TriangleEdgeStartSSE4(initX.ivec, initY.ivec, xf, yf, c);
 #endif
 	return Vec4<int>::AssignToAll(xf) * initX + Vec4<int>::AssignToAll(yf) * initY + Vec4<int>::AssignToAll(c);
 }
@@ -625,14 +630,23 @@ inline Vec4<int> TriangleEdge<useSSE4>::StepY(const Vec4<int> &w) {
 #endif
 }
 
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline int SOFTRAST_CALL MaxWeightSSE4(__m128i w) {
+	__m128i max2 = _mm_max_epi32(w, _mm_shuffle_epi32(w, _MM_SHUFFLE(3, 2, 3, 2)));
+	__m128i max1 = _mm_max_epi32(max2, _mm_shuffle_epi32(max2, _MM_SHUFFLE(1, 1, 1, 1)));
+	return _mm_cvtsi128_si32(max1);
+}
+#endif
+
 template <bool useSSE4>
 void TriangleEdge<useSSE4>::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX) {
 	int wmax;
-#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
 	if (useSSE4) {
-		__m128i max01 = _mm_max_epi32(w.ivec, _mm_shuffle_epi32(w.ivec, _MM_SHUFFLE(3, 2, 3, 2)));
-		__m128i max0 = _mm_max_epi32(max01, _mm_shuffle_epi32(max01, _MM_SHUFFLE(1, 1, 1, 1)));
-		wmax = _mm_cvtsi128_si32(max0);
+		wmax = MaxWeightSSE4(w.ivec);
 	} else {
 		wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));
 	}
@@ -654,11 +668,20 @@ void TriangleEdge<useSSE4>::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int6
 	}
 }
 
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline __m128i SOFTRAST_CALL StepTimesSSE4(__m128i w, __m128i step, int c) {
+	return _mm_add_epi32(w, _mm_mullo_epi32(_mm_set1_epi32(c), step));
+}
+#endif
+
 template <bool useSSE4>
 inline Vec4<int> TriangleEdge<useSSE4>::StepXTimes(const Vec4<int> &w, int c) {
-#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
 	if (useSSE4)
-		return _mm_add_epi32(w.ivec, _mm_mullo_epi32(_mm_set1_epi32(c), stepX.ivec));
+		return StepTimesSSE4(w.ivec, stepX.ivec, c);
 #endif
 	return w + stepX * c;
 }
@@ -675,15 +698,22 @@ static inline Vec4<int> MakeMask(const Vec4<int> &w0, const Vec4<int> &w1, const
 #endif
 }
 
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline bool SOFTRAST_CALL AnyMaskSSE4(__m128i mask) {
+	__m128i sig = _mm_srai_epi32(mask, 31);
+	return _mm_test_all_ones(sig) == 0;
+}
+#endif
+
 template <bool useSSE4>
 static inline bool AnyMask(const Vec4<int> &mask) {
 #if defined(_M_SSE) && !PPSSPP_ARCH(X86)
-#if _M_SSE >= 0x401
 	if (useSSE4) {
-		__m128i sig = _mm_srai_epi32(mask.ivec, 31);
-		return _mm_test_all_ones(sig) == 0;
+		return AnyMaskSSE4(mask.ivec);
 	}
-#endif
 
 	// In other words: !(mask.x < 0 && mask.y < 0 && mask.z < 0 && mask.w < 0)
 	__m128i low2 = _mm_and_si128(mask.ivec, _mm_shuffle_epi32(mask.ivec, _MM_SHUFFLE(3, 2, 3, 2)));