ARM32: Backport a lot of previously 64-bit-only NEON optimizations to ARM32.
This commit is contained in:
parent
d58f826c8d
commit
45aae7b9da
3 changed files with 60 additions and 28 deletions
|
@ -335,7 +335,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
||||||
{
|
{
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
|
_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
|
vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
|
||||||
#else
|
#else
|
||||||
memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
|
memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
|
||||||
|
@ -347,7 +347,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
||||||
{
|
{
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
|
_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
|
vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
|
||||||
#else
|
#else
|
||||||
for (int i = 0; i < 4; i++)
|
for (int i = 0; i < 4; i++)
|
||||||
|
@ -360,7 +360,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
||||||
{
|
{
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
|
_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
|
vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
|
||||||
#else
|
#else
|
||||||
for (int i = 0; i < 4; i++)
|
for (int i = 0; i < 4; i++)
|
||||||
|
@ -373,7 +373,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
||||||
{
|
{
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
|
_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
|
vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
|
||||||
#else
|
#else
|
||||||
for (int i = 0; i < 4; i++)
|
for (int i = 0; i < 4; i++)
|
||||||
|
@ -408,7 +408,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
||||||
{
|
{
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
|
_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
|
vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
|
||||||
#else
|
#else
|
||||||
for (int i = 0; i < 4; i++)
|
for (int i = 0; i < 4; i++)
|
||||||
|
@ -421,7 +421,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
||||||
{
|
{
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
|
_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
|
vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
|
||||||
#else
|
#else
|
||||||
for (int i = 0; i < 4; i++)
|
for (int i = 0; i < 4; i++)
|
||||||
|
|
74
GPU/Math3D.h
74
GPU/Math3D.h
|
@ -219,7 +219,7 @@ public:
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
__m128i ivec;
|
__m128i ivec;
|
||||||
__m128 vec;
|
__m128 vec;
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
int32x4_t ivec;
|
int32x4_t ivec;
|
||||||
float32x4_t vec;
|
float32x4_t vec;
|
||||||
#endif
|
#endif
|
||||||
|
@ -238,7 +238,7 @@ public:
|
||||||
Vec3(const Vec3Packed<T> &_xyz) {
|
Vec3(const Vec3Packed<T> &_xyz) {
|
||||||
vec = _mm_loadu_ps(_xyz.AsArray());
|
vec = _mm_loadu_ps(_xyz.AsArray());
|
||||||
}
|
}
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
Vec3(const float32x4_t &_vec) : vec(_vec) {}
|
Vec3(const float32x4_t &_vec) : vec(_vec) {}
|
||||||
#if !defined(_MSC_VER)
|
#if !defined(_MSC_VER)
|
||||||
Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
|
Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
|
||||||
|
@ -578,7 +578,7 @@ public:
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
__m128i ivec;
|
__m128i ivec;
|
||||||
__m128 vec;
|
__m128 vec;
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
int32x4_t ivec;
|
int32x4_t ivec;
|
||||||
float32x4_t vec;
|
float32x4_t vec;
|
||||||
#endif
|
#endif
|
||||||
|
@ -595,7 +595,7 @@ public:
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
Vec4(const __m128 &_vec) : vec(_vec) {}
|
Vec4(const __m128 &_vec) : vec(_vec) {}
|
||||||
Vec4(const __m128i &_ivec) : ivec(_ivec) {}
|
Vec4(const __m128i &_ivec) : ivec(_ivec) {}
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
Vec4(const float32x4_t &_vec) : vec(_vec) {}
|
Vec4(const float32x4_t &_vec) : vec(_vec) {}
|
||||||
#if !defined(_MSC_VER)
|
#if !defined(_MSC_VER)
|
||||||
Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
|
Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
|
||||||
|
@ -607,14 +607,14 @@ public:
|
||||||
if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
|
if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
return _mm_cvtps_epi32(SAFE_M128(vec));
|
return _mm_cvtps_epi32(SAFE_M128(vec));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
return vcvtq_s32_f32(vec);
|
return vcvtq_s32_f32(vec);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
|
if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
return _mm_cvtepi32_ps(SAFE_M128I(ivec));
|
return _mm_cvtepi32_ps(SAFE_M128I(ivec));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
return vcvtq_f32_s32(ivec);
|
return vcvtq_f32_s32(ivec);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -922,7 +922,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, c
|
||||||
_mm_add_ps(_mm_mul_ps(col2, z), col3));
|
_mm_add_ps(_mm_mul_ps(col2, z), col3));
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
|
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||||
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
||||||
float32x4_t col0 = vld1q_f32(m);
|
float32x4_t col0 = vld1q_f32(m);
|
||||||
float32x4_t col1 = vld1q_f32(m + 3);
|
float32x4_t col1 = vld1q_f32(m + 3);
|
||||||
|
@ -933,6 +933,17 @@ inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
||||||
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
|
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
|
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
||||||
|
float32x4_t col0 = vld1q_f32(m);
|
||||||
|
float32x4_t col1 = vld1q_f32(m + 3);
|
||||||
|
float32x4_t col2 = vld1q_f32(m + 6);
|
||||||
|
float32x4_t col3 = vld1q_f32(m + 9);
|
||||||
|
float32x4_t sum = vaddq_f32(
|
||||||
|
vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
|
||||||
|
vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// v and vecOut must point to different memory.
|
// v and vecOut must point to different memory.
|
||||||
|
@ -947,7 +958,7 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
|
||||||
vecOut[0] = _mm_cvtss_f32(sum);
|
vecOut[0] = _mm_cvtss_f32(sum);
|
||||||
vecOut[1] = vectorGetByIndex<1>(sum);
|
vecOut[1] = vectorGetByIndex<1>(sum);
|
||||||
vecOut[2] = vectorGetByIndex<2>(sum);
|
vecOut[2] = vectorGetByIndex<2>(sum);
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
float vecIn[4] = {v[0], v[1], v[2], 1.0f};
|
float vecIn[4] = {v[0], v[1], v[2], 1.0f};
|
||||||
float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
|
float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
|
||||||
vecOut[0] = vgetq_lane_f32(sum, 0);
|
vecOut[0] = vgetq_lane_f32(sum, 0);
|
||||||
|
@ -967,7 +978,7 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
|
||||||
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
|
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
|
||||||
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
|
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
return Vec3ByMatrix43Internal(x, y, z, m);
|
return Vec3ByMatrix43Internal(x, y, z, m);
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
return Vec3ByMatrix43Internal(v.vec, m);
|
return Vec3ByMatrix43Internal(v.vec, m);
|
||||||
#else
|
#else
|
||||||
Vec3f vecOut;
|
Vec3f vecOut;
|
||||||
|
@ -999,6 +1010,17 @@ inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
|
||||||
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
|
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
|
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
|
||||||
|
float32x4_t col0 = vld1q_f32(m);
|
||||||
|
float32x4_t col1 = vld1q_f32(m + 4);
|
||||||
|
float32x4_t col2 = vld1q_f32(m + 8);
|
||||||
|
float32x4_t col3 = vld1q_f32(m + 12);
|
||||||
|
float32x4_t sum = vaddq_f32(
|
||||||
|
vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
|
||||||
|
vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
|
inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
|
||||||
|
@ -1008,7 +1030,7 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
|
||||||
__m128 z = _mm_set1_ps(v[2]);
|
__m128 z = _mm_set1_ps(v[2]);
|
||||||
__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
|
__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
|
||||||
_mm_storeu_ps(vecOut, sum);
|
_mm_storeu_ps(vecOut, sum);
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
float vecIn[4] = {v[0], v[1], v[2], 1.0f};
|
float vecIn[4] = {v[0], v[1], v[2], 1.0f};
|
||||||
float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
|
float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
|
||||||
vst1q_f32(vecOut, sum);
|
vst1q_f32(vecOut, sum);
|
||||||
|
@ -1027,7 +1049,7 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
|
||||||
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
|
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
|
||||||
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
|
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
return Vec3ByMatrix44Internal(x, y, z, m);
|
return Vec3ByMatrix44Internal(x, y, z, m);
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
return Vec3ByMatrix44Internal(v.vec, m);
|
return Vec3ByMatrix44Internal(v.vec, m);
|
||||||
#else
|
#else
|
||||||
Vec4f vecOut;
|
Vec4f vecOut;
|
||||||
|
@ -1057,6 +1079,16 @@ inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
||||||
vmulq_laneq_f32(col2, vec, 2));
|
vmulq_laneq_f32(col2, vec, 2));
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
|
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
||||||
|
float32x4_t col0 = vld1q_f32(m);
|
||||||
|
float32x4_t col1 = vld1q_f32(m + 3);
|
||||||
|
float32x4_t col2 = vld1q_f32(m + 6);
|
||||||
|
float32x4_t sum = vaddq_f32(
|
||||||
|
vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
|
||||||
|
vmulq_lane_f32(col2, vget_high_f32(vec), 2));
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
|
inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
|
||||||
|
@ -1068,7 +1100,7 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
|
||||||
vecOut[0] = _mm_cvtss_f32(sum);
|
vecOut[0] = _mm_cvtss_f32(sum);
|
||||||
vecOut[1] = vectorGetByIndex<1>(sum);
|
vecOut[1] = vectorGetByIndex<1>(sum);
|
||||||
vecOut[2] = vectorGetByIndex<2>(sum);
|
vecOut[2] = vectorGetByIndex<2>(sum);
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
|
float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
|
||||||
vecOut[0] = vgetq_lane_f32(sum, 0);
|
vecOut[0] = vgetq_lane_f32(sum, 0);
|
||||||
vecOut[1] = vgetq_lane_f32(sum, 1);
|
vecOut[1] = vgetq_lane_f32(sum, 1);
|
||||||
|
@ -1087,7 +1119,7 @@ inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
|
||||||
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
|
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
|
||||||
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
|
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
return Norm3ByMatrix43Internal(x, y, z, m);
|
return Norm3ByMatrix43Internal(x, y, z, m);
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
return Norm3ByMatrix43Internal(v.vec, m);
|
return Norm3ByMatrix43Internal(v.vec, m);
|
||||||
#else
|
#else
|
||||||
Vec3f vecOut;
|
Vec3f vecOut;
|
||||||
|
@ -1209,7 +1241,7 @@ inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)
|
||||||
__m128i c = _mm_cvtsi32_si128(rgb);
|
__m128i c = _mm_cvtsi32_si128(rgb);
|
||||||
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
||||||
return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
|
return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
|
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
|
||||||
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
||||||
return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
|
return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
|
||||||
|
@ -1228,7 +1260,7 @@ inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
|
||||||
__m128i c = _mm_cvtsi32_si128(rgb);
|
__m128i c = _mm_cvtsi32_si128(rgb);
|
||||||
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
||||||
return Vec3<int>(c);
|
return Vec3<int>(c);
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
|
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
|
||||||
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
||||||
return Vec3<int>(vreinterpretq_s32_u32(u));
|
return Vec3<int>(vreinterpretq_s32_u32(u));
|
||||||
|
@ -1244,7 +1276,7 @@ __forceinline unsigned int Vec3<float>::ToRGB() const
|
||||||
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
|
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
|
||||||
__m128i c16 = _mm_packs_epi32(c, c);
|
__m128i c16 = _mm_packs_epi32(c, c);
|
||||||
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
|
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
|
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
|
||||||
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
||||||
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
||||||
|
@ -1261,7 +1293,7 @@ __forceinline unsigned int Vec3<int>::ToRGB() const
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
|
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
|
||||||
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
|
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
|
uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
|
||||||
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
||||||
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
||||||
|
@ -1278,7 +1310,7 @@ inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
|
||||||
__m128i c = _mm_cvtsi32_si128(rgba);
|
__m128i c = _mm_cvtsi32_si128(rgba);
|
||||||
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
||||||
return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
|
return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
|
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
|
||||||
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
||||||
return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
|
return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
|
||||||
|
@ -1304,7 +1336,7 @@ inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
|
||||||
__m128i c = _mm_cvtsi32_si128(rgba);
|
__m128i c = _mm_cvtsi32_si128(rgba);
|
||||||
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
||||||
return Vec4<int>(c);
|
return Vec4<int>(c);
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
|
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
|
||||||
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
||||||
return Vec4<int>(vreinterpretq_s32_u32(u));
|
return Vec4<int>(vreinterpretq_s32_u32(u));
|
||||||
|
@ -1320,7 +1352,7 @@ __forceinline unsigned int Vec4<float>::ToRGBA() const
|
||||||
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
|
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
|
||||||
__m128i c16 = _mm_packs_epi32(c, c);
|
__m128i c16 = _mm_packs_epi32(c, c);
|
||||||
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
|
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
|
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
|
||||||
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
||||||
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
||||||
|
@ -1338,7 +1370,7 @@ __forceinline unsigned int Vec4<int>::ToRGBA() const
|
||||||
#if defined(_M_SSE)
|
#if defined(_M_SSE)
|
||||||
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
|
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
|
||||||
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
|
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
|
||||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
uint16x4_t c16 = vqmovun_s32(ivec);
|
uint16x4_t c16 = vqmovun_s32(ivec);
|
||||||
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
||||||
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||||
#define PPSSPP_ARCH_ARM64 1
|
#define PPSSPP_ARCH_ARM64 1
|
||||||
#define PPSSPP_ARCH_64BIT 1
|
#define PPSSPP_ARCH_64BIT 1
|
||||||
#define PPSSPP_ARCH_ARM_NEON 1
|
#define PPSSPP_ARCH_ARM_NEON 1 // Applies to both ARM32 and ARM64
|
||||||
#define PPSSPP_ARCH_ARM64_NEON 1
|
#define PPSSPP_ARCH_ARM64_NEON 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue