Merge pull request #18450 from hrydgard/neon-arm32

Enable some NEON optimizations on ARM32 that we only had on ARM64 before
This commit is contained in:
Henrik Rydgård 2023-11-28 00:28:39 +01:00 committed by GitHub
commit 8ad0ef6901
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 77 additions and 29 deletions

View file

@ -335,7 +335,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{ {
#if defined(_M_SSE) #if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1])); _mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1])); vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
#else #else
memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float)); memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
@ -347,7 +347,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{ {
#if defined(_M_SSE) #if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); _mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2]))); vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else #else
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
@ -360,7 +360,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{ {
#if defined(_M_SSE) #if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); _mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2]))); vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else #else
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
@ -373,7 +373,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{ {
#if defined(_M_SSE) #if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); _mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2]))); vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else #else
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
@ -408,7 +408,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{ {
#if defined(_M_SSE) #if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits))); _mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1]))); vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
#else #else
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
@ -421,7 +421,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{ {
#if defined(_M_SSE) #if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask))); _mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1]))); vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
#else #else
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)

View file

@ -1944,7 +1944,7 @@ bool GPUCommon::DescribeCodePtr(const u8 *ptr, std::string &name) {
} }
void GPUCommon::UpdateUVScaleOffset() { void GPUCommon::UpdateUVScaleOffset() {
#ifdef _M_SSE #if defined(_M_SSE)
__m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)&gstate.texscaleu), 8); __m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)&gstate.texscaleu), 8);
_mm_storeu_si128((__m128i *)&gstate_c.uv, values); _mm_storeu_si128((__m128i *)&gstate_c.uv, values);
#elif PPSSPP_ARCH(ARM_NEON) #elif PPSSPP_ARCH(ARM_NEON)

View file

@ -219,7 +219,7 @@ public:
#if defined(_M_SSE) #if defined(_M_SSE)
__m128i ivec; __m128i ivec;
__m128 vec; __m128 vec;
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
int32x4_t ivec; int32x4_t ivec;
float32x4_t vec; float32x4_t vec;
#endif #endif
@ -238,7 +238,7 @@ public:
Vec3(const Vec3Packed<T> &_xyz) { Vec3(const Vec3Packed<T> &_xyz) {
vec = _mm_loadu_ps(_xyz.AsArray()); vec = _mm_loadu_ps(_xyz.AsArray());
} }
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
Vec3(const float32x4_t &_vec) : vec(_vec) {} Vec3(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER) #if !defined(_MSC_VER)
Vec3(const int32x4_t &_ivec) : ivec(_ivec) {} Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
@ -578,7 +578,7 @@ public:
#if defined(_M_SSE) #if defined(_M_SSE)
__m128i ivec; __m128i ivec;
__m128 vec; __m128 vec;
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
int32x4_t ivec; int32x4_t ivec;
float32x4_t vec; float32x4_t vec;
#endif #endif
@ -595,7 +595,7 @@ public:
#if defined(_M_SSE) #if defined(_M_SSE)
Vec4(const __m128 &_vec) : vec(_vec) {} Vec4(const __m128 &_vec) : vec(_vec) {}
Vec4(const __m128i &_ivec) : ivec(_ivec) {} Vec4(const __m128i &_ivec) : ivec(_ivec) {}
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
Vec4(const float32x4_t &_vec) : vec(_vec) {} Vec4(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER) #if !defined(_MSC_VER)
Vec4(const int32x4_t &_ivec) : ivec(_ivec) {} Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
@ -607,14 +607,14 @@ public:
if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) { if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
#if defined(_M_SSE) #if defined(_M_SSE)
return _mm_cvtps_epi32(SAFE_M128(vec)); return _mm_cvtps_epi32(SAFE_M128(vec));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
return vcvtq_s32_f32(vec); return vcvtq_s32_f32(vec);
#endif #endif
} }
if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) { if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
#if defined(_M_SSE) #if defined(_M_SSE)
return _mm_cvtepi32_ps(SAFE_M128I(ivec)); return _mm_cvtepi32_ps(SAFE_M128I(ivec));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
return vcvtq_f32_s32(ivec); return vcvtq_f32_s32(ivec);
#endif #endif
} }
@ -922,7 +922,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, c
_mm_add_ps(_mm_mul_ps(col2, z), col3)); _mm_add_ps(_mm_mul_ps(col2, z), col3));
return sum; return sum;
} }
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) #elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) { inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
float32x4_t col0 = vld1q_f32(m); float32x4_t col0 = vld1q_f32(m);
float32x4_t col1 = vld1q_f32(m + 3); float32x4_t col1 = vld1q_f32(m + 3);
@ -933,6 +933,17 @@ inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
return sum; return sum;
} }
#elif PPSSPP_ARCH(ARM_NEON)
// Transforms a position vector (x, y, z, 1) by a 4x3 matrix stored with a
// 3-float stride; the fourth column (m + 9) is the translation term.
// ARM32 NEON fallback: vmulq_laneq_f32 (lane select from a 128-bit register)
// is ARM64-only, so lanes are picked from the 64-bit halves instead.
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	const float32x4_t c0 = vld1q_f32(m);
	const float32x4_t c1 = vld1q_f32(m + 3);
	const float32x4_t c2 = vld1q_f32(m + 6);
	const float32x4_t c3 = vld1q_f32(m + 9);
	const float32x2_t xy = vget_low_f32(vec);   // lanes 0, 1 of vec
	const float32x2_t zw = vget_high_f32(vec);  // lanes 2, 3 of vec
	const float32x4_t xScaled = vmulq_lane_f32(c0, xy, 0);
	const float32x4_t yScaled = vmulq_lane_f32(c1, xy, 1);
	const float32x4_t zScaled = vmulq_lane_f32(c2, zw, 0);
	return vaddq_f32(vaddq_f32(xScaled, yScaled), vaddq_f32(zScaled, c3));
}
#endif #endif
// v and vecOut must point to different memory. // v and vecOut must point to different memory.
@ -947,7 +958,7 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
vecOut[0] = _mm_cvtss_f32(sum); vecOut[0] = _mm_cvtss_f32(sum);
vecOut[1] = vectorGetByIndex<1>(sum); vecOut[1] = vectorGetByIndex<1>(sum);
vecOut[2] = vectorGetByIndex<2>(sum); vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
float vecIn[4] = {v[0], v[1], v[2], 1.0f}; float vecIn[4] = {v[0], v[1], v[2], 1.0f};
float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m); float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
vecOut[0] = vgetq_lane_f32(sum, 0); vecOut[0] = vgetq_lane_f32(sum, 0);
@ -967,7 +978,7 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1)); __m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2)); __m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
return Vec3ByMatrix43Internal(x, y, z, m); return Vec3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
return Vec3ByMatrix43Internal(v.vec, m); return Vec3ByMatrix43Internal(v.vec, m);
#else #else
Vec3f vecOut; Vec3f vecOut;
@ -999,6 +1010,17 @@ inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
return sum; return sum;
} }
#elif PPSSPP_ARCH(ARM_NEON)
// Transforms a position vector (x, y, z, 1) by a full 4x4 matrix (4-float
// column stride); the fourth column (m + 12) is the translation term.
// ARM32 NEON fallback: no vmulq_laneq_f32 here, so lane selection goes
// through the 64-bit halves of the input register.
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	const float32x4_t c0 = vld1q_f32(m);
	const float32x4_t c1 = vld1q_f32(m + 4);
	const float32x4_t c2 = vld1q_f32(m + 8);
	const float32x4_t c3 = vld1q_f32(m + 12);
	const float32x2_t xy = vget_low_f32(vec);   // lanes 0, 1 of vec
	const float32x2_t zw = vget_high_f32(vec);  // lanes 2, 3 of vec
	const float32x4_t xScaled = vmulq_lane_f32(c0, xy, 0);
	const float32x4_t yScaled = vmulq_lane_f32(c1, xy, 1);
	const float32x4_t zScaled = vmulq_lane_f32(c2, zw, 0);
	return vaddq_f32(vaddq_f32(xScaled, yScaled), vaddq_f32(zScaled, c3));
}
#endif #endif
inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) { inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
@ -1008,7 +1030,7 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
__m128 z = _mm_set1_ps(v[2]); __m128 z = _mm_set1_ps(v[2]);
__m128 sum = Vec3ByMatrix44Internal(x, y, z, m); __m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
_mm_storeu_ps(vecOut, sum); _mm_storeu_ps(vecOut, sum);
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
float vecIn[4] = {v[0], v[1], v[2], 1.0f}; float vecIn[4] = {v[0], v[1], v[2], 1.0f};
float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m); float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
vst1q_f32(vecOut, sum); vst1q_f32(vecOut, sum);
@ -1027,7 +1049,7 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1)); __m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2)); __m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
return Vec3ByMatrix44Internal(x, y, z, m); return Vec3ByMatrix44Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
return Vec3ByMatrix44Internal(v.vec, m); return Vec3ByMatrix44Internal(v.vec, m);
#else #else
Vec4f vecOut; Vec4f vecOut;
@ -1057,6 +1079,16 @@ inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
vmulq_laneq_f32(col2, vec, 2)); vmulq_laneq_f32(col2, vec, 2));
return sum; return sum;
} }
#elif PPSSPP_ARCH(ARM_NEON)
// Transforms a normal/direction vector (x, y, z) by the upper 3x3 of a 4x3
// matrix (3-float column stride); unlike the position transform, the
// translation column (m + 9) is intentionally NOT added.
// ARM32 NEON fallback for the ARM64-only vmulq_laneq_f32 path above.
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	// BUGFIX: vec lane 2 lives in lane 0 of the high 64-bit half, so the
	// vmulq_lane_f32 lane index must be 0 — an index of 2 is out of range
	// for a float32x2_t and does not compile. Matches the pattern used by
	// Vec3ByMatrix43Internal/Vec3ByMatrix44Internal above.
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
		vmulq_lane_f32(col2, vget_high_f32(vec), 0));
	return sum;
}
#endif #endif
inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) { inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
@ -1068,7 +1100,7 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
vecOut[0] = _mm_cvtss_f32(sum); vecOut[0] = _mm_cvtss_f32(sum);
vecOut[1] = vectorGetByIndex<1>(sum); vecOut[1] = vectorGetByIndex<1>(sum);
vecOut[2] = vectorGetByIndex<2>(sum); vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m); float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
vecOut[0] = vgetq_lane_f32(sum, 0); vecOut[0] = vgetq_lane_f32(sum, 0);
vecOut[1] = vgetq_lane_f32(sum, 1); vecOut[1] = vgetq_lane_f32(sum, 1);
@ -1087,7 +1119,7 @@ inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1)); __m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2)); __m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
return Norm3ByMatrix43Internal(x, y, z, m); return Norm3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
return Norm3ByMatrix43Internal(v.vec, m); return Norm3ByMatrix43Internal(v.vec, m);
#else #else
Vec3f vecOut; Vec3f vecOut;
@ -1120,6 +1152,13 @@ inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
} }
inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) { inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
// vld3q is a perfect match here!
float32x4x3_t packed = vld3q_f32(m4x3);
vst1q_f32(m4x4, packed.val[0]);
vst1q_f32(m4x4 + 4, packed.val[1]);
vst1q_f32(m4x4 + 8, packed.val[2]);
#else
m4x4[0] = m4x3[0]; m4x4[0] = m4x3[0];
m4x4[1] = m4x3[3]; m4x4[1] = m4x3[3];
m4x4[2] = m4x3[6]; m4x4[2] = m4x3[6];
@ -1132,6 +1171,7 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
m4x4[9] = m4x3[5]; m4x4[9] = m4x3[5];
m4x4[10] = m4x3[8]; m4x4[10] = m4x3[8];
m4x4[11] = m4x3[11]; m4x4[11] = m4x3[11];
#endif
m4x4[12] = 0.0f; m4x4[12] = 0.0f;
m4x4[13] = 0.0f; m4x4[13] = 0.0f;
m4x4[14] = 0.0f; m4x4[14] = 0.0f;
@ -1147,6 +1187,13 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
// 89AB // 89AB
// Don't see a way to SIMD that. Should be pretty fast anyway. // Don't see a way to SIMD that. Should be pretty fast anyway.
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) { inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
// vld3q is a perfect match here!
float32x4x3_t packed = vld3q_f32(m4x3);
vst1q_f32(m4x4, packed.val[0]);
vst1q_f32(m4x4 + 4, packed.val[1]);
vst1q_f32(m4x4 + 8, packed.val[2]);
#else
m4x4[0] = m4x3[0]; m4x4[0] = m4x3[0];
m4x4[1] = m4x3[3]; m4x4[1] = m4x3[3];
m4x4[2] = m4x3[6]; m4x4[2] = m4x3[6];
@ -1159,6 +1206,7 @@ inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
m4x4[9] = m4x3[5]; m4x4[9] = m4x3[5];
m4x4[10] = m4x3[8]; m4x4[10] = m4x3[8];
m4x4[11] = m4x3[11]; m4x4[11] = m4x3[11];
#endif
} }
inline void Transpose4x4(float out[16], const float in[16]) { inline void Transpose4x4(float out[16], const float in[16]) {
@ -1209,7 +1257,7 @@ inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)
__m128i c = _mm_cvtsi32_si128(rgb); __m128i c = _mm_cvtsi32_si128(rgb);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z); c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f))); return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb)); uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c))); uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f))); return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
@ -1228,7 +1276,7 @@ inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
__m128i c = _mm_cvtsi32_si128(rgb); __m128i c = _mm_cvtsi32_si128(rgb);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z); c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec3<int>(c); return Vec3<int>(c);
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb)); uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c))); uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec3<int>(vreinterpretq_s32_u32(u)); return Vec3<int>(vreinterpretq_s32_u32(u));
@ -1244,7 +1292,7 @@ __forceinline unsigned int Vec3<float>::ToRGB() const
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f))); __m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
__m128i c16 = _mm_packs_epi32(c, c); __m128i c16 = _mm_packs_epi32(c, c);
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF; return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f)))); uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16)); uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0); return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
@ -1261,7 +1309,7 @@ __forceinline unsigned int Vec3<int>::ToRGB() const
#if defined(_M_SSE) #if defined(_M_SSE)
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec)); __m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF; return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3)); uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16)); uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0); return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
@ -1278,7 +1326,7 @@ inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
__m128i c = _mm_cvtsi32_si128(rgba); __m128i c = _mm_cvtsi32_si128(rgba);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z); c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f))); return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba)); uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c))); uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f))); return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
@ -1304,7 +1352,7 @@ inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
__m128i c = _mm_cvtsi32_si128(rgba); __m128i c = _mm_cvtsi32_si128(rgba);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z); c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec4<int>(c); return Vec4<int>(c);
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba)); uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c))); uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec4<int>(vreinterpretq_s32_u32(u)); return Vec4<int>(vreinterpretq_s32_u32(u));
@ -1320,7 +1368,7 @@ __forceinline unsigned int Vec4<float>::ToRGBA() const
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f))); __m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
__m128i c16 = _mm_packs_epi32(c, c); __m128i c16 = _mm_packs_epi32(c, c);
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)); return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f)))); uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16)); uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0); return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
@ -1338,7 +1386,7 @@ __forceinline unsigned int Vec4<int>::ToRGBA() const
#if defined(_M_SSE) #if defined(_M_SSE)
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec)); __m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)); return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
#elif PPSSPP_ARCH(ARM64_NEON) #elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(ivec); uint16x4_t c16 = vqmovun_s32(ivec);
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16)); uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0); return vget_lane_u32(vreinterpret_u32_u8(c8), 0);

View file

@ -57,7 +57,7 @@
#if defined(__aarch64__) || defined(_M_ARM64) #if defined(__aarch64__) || defined(_M_ARM64)
#define PPSSPP_ARCH_ARM64 1 #define PPSSPP_ARCH_ARM64 1
#define PPSSPP_ARCH_64BIT 1 #define PPSSPP_ARCH_64BIT 1
#define PPSSPP_ARCH_ARM_NEON 1 #define PPSSPP_ARCH_ARM_NEON 1 // Applies to both ARM32 and ARM64
#define PPSSPP_ARCH_ARM64_NEON 1 #define PPSSPP_ARCH_ARM64_NEON 1
#endif #endif