Merge pull request #18450 from hrydgard/neon-arm32
Enable some NEON optimizations on ARM32 that we only had on ARM64 before
This commit is contained in:
commit
8ad0ef6901
4 changed files with 77 additions and 29 deletions
|
@ -335,7 +335,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
|||
{
|
||||
#if defined(_M_SSE)
|
||||
_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
|
||||
#else
|
||||
memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
|
||||
|
@ -347,7 +347,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
|||
{
|
||||
#if defined(_M_SSE)
|
||||
_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
|
||||
#else
|
||||
for (int i = 0; i < 4; i++)
|
||||
|
@ -360,7 +360,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
|||
{
|
||||
#if defined(_M_SSE)
|
||||
_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
|
||||
#else
|
||||
for (int i = 0; i < 4; i++)
|
||||
|
@ -373,7 +373,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
|||
{
|
||||
#if defined(_M_SSE)
|
||||
_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
|
||||
#else
|
||||
for (int i = 0; i < 4; i++)
|
||||
|
@ -408,7 +408,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
|||
{
|
||||
#if defined(_M_SSE)
|
||||
_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
|
||||
#else
|
||||
for (int i = 0; i < 4; i++)
|
||||
|
@ -421,7 +421,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
|
|||
{
|
||||
#if defined(_M_SSE)
|
||||
_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
|
||||
#else
|
||||
for (int i = 0; i < 4; i++)
|
||||
|
|
|
@ -1944,7 +1944,7 @@ bool GPUCommon::DescribeCodePtr(const u8 *ptr, std::string &name) {
|
|||
}
|
||||
|
||||
void GPUCommon::UpdateUVScaleOffset() {
|
||||
#ifdef _M_SSE
|
||||
#if defined(_M_SSE)
|
||||
__m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)&gstate.texscaleu), 8);
|
||||
_mm_storeu_si128((__m128i *)&gstate_c.uv, values);
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
|
|
90
GPU/Math3D.h
90
GPU/Math3D.h
|
@ -219,7 +219,7 @@ public:
|
|||
#if defined(_M_SSE)
|
||||
__m128i ivec;
|
||||
__m128 vec;
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
int32x4_t ivec;
|
||||
float32x4_t vec;
|
||||
#endif
|
||||
|
@ -238,7 +238,7 @@ public:
|
|||
Vec3(const Vec3Packed<T> &_xyz) {
|
||||
vec = _mm_loadu_ps(_xyz.AsArray());
|
||||
}
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
Vec3(const float32x4_t &_vec) : vec(_vec) {}
|
||||
#if !defined(_MSC_VER)
|
||||
Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
|
||||
|
@ -578,7 +578,7 @@ public:
|
|||
#if defined(_M_SSE)
|
||||
__m128i ivec;
|
||||
__m128 vec;
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
int32x4_t ivec;
|
||||
float32x4_t vec;
|
||||
#endif
|
||||
|
@ -595,7 +595,7 @@ public:
|
|||
#if defined(_M_SSE)
|
||||
Vec4(const __m128 &_vec) : vec(_vec) {}
|
||||
Vec4(const __m128i &_ivec) : ivec(_ivec) {}
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
Vec4(const float32x4_t &_vec) : vec(_vec) {}
|
||||
#if !defined(_MSC_VER)
|
||||
Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
|
||||
|
@ -607,14 +607,14 @@ public:
|
|||
if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
|
||||
#if defined(_M_SSE)
|
||||
return _mm_cvtps_epi32(SAFE_M128(vec));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
return vcvtq_s32_f32(vec);
|
||||
#endif
|
||||
}
|
||||
if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
|
||||
#if defined(_M_SSE)
|
||||
return _mm_cvtepi32_ps(SAFE_M128I(ivec));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
return vcvtq_f32_s32(ivec);
|
||||
#endif
|
||||
}
|
||||
|
@ -922,7 +922,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, c
|
|||
_mm_add_ps(_mm_mul_ps(col2, z), col3));
|
||||
return sum;
|
||||
}
|
||||
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
||||
float32x4_t col0 = vld1q_f32(m);
|
||||
float32x4_t col1 = vld1q_f32(m + 3);
|
||||
|
@ -933,6 +933,17 @@ inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
|||
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
|
||||
return sum;
|
||||
}
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
||||
float32x4_t col0 = vld1q_f32(m);
|
||||
float32x4_t col1 = vld1q_f32(m + 3);
|
||||
float32x4_t col2 = vld1q_f32(m + 6);
|
||||
float32x4_t col3 = vld1q_f32(m + 9);
|
||||
float32x4_t sum = vaddq_f32(
|
||||
vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
|
||||
vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
// v and vecOut must point to different memory.
|
||||
|
@ -947,7 +958,7 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
|
|||
vecOut[0] = _mm_cvtss_f32(sum);
|
||||
vecOut[1] = vectorGetByIndex<1>(sum);
|
||||
vecOut[2] = vectorGetByIndex<2>(sum);
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
float vecIn[4] = {v[0], v[1], v[2], 1.0f};
|
||||
float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
|
||||
vecOut[0] = vgetq_lane_f32(sum, 0);
|
||||
|
@ -967,7 +978,7 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
|
|||
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
return Vec3ByMatrix43Internal(x, y, z, m);
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
return Vec3ByMatrix43Internal(v.vec, m);
|
||||
#else
|
||||
Vec3f vecOut;
|
||||
|
@ -999,6 +1010,17 @@ inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
|
|||
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
|
||||
return sum;
|
||||
}
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
|
||||
float32x4_t col0 = vld1q_f32(m);
|
||||
float32x4_t col1 = vld1q_f32(m + 4);
|
||||
float32x4_t col2 = vld1q_f32(m + 8);
|
||||
float32x4_t col3 = vld1q_f32(m + 12);
|
||||
float32x4_t sum = vaddq_f32(
|
||||
vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
|
||||
vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
|
||||
|
@ -1008,7 +1030,7 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
|
|||
__m128 z = _mm_set1_ps(v[2]);
|
||||
__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
|
||||
_mm_storeu_ps(vecOut, sum);
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
float vecIn[4] = {v[0], v[1], v[2], 1.0f};
|
||||
float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
|
||||
vst1q_f32(vecOut, sum);
|
||||
|
@ -1027,7 +1049,7 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
|
|||
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
return Vec3ByMatrix44Internal(x, y, z, m);
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
return Vec3ByMatrix44Internal(v.vec, m);
|
||||
#else
|
||||
Vec4f vecOut;
|
||||
|
@ -1057,6 +1079,16 @@ inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
|||
vmulq_laneq_f32(col2, vec, 2));
|
||||
return sum;
|
||||
}
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
|
||||
float32x4_t col0 = vld1q_f32(m);
|
||||
float32x4_t col1 = vld1q_f32(m + 3);
|
||||
float32x4_t col2 = vld1q_f32(m + 6);
|
||||
float32x4_t sum = vaddq_f32(
|
||||
vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
|
||||
vmulq_lane_f32(col2, vget_high_f32(vec), 2));
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
|
||||
|
@ -1068,7 +1100,7 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
|
|||
vecOut[0] = _mm_cvtss_f32(sum);
|
||||
vecOut[1] = vectorGetByIndex<1>(sum);
|
||||
vecOut[2] = vectorGetByIndex<2>(sum);
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
|
||||
vecOut[0] = vgetq_lane_f32(sum, 0);
|
||||
vecOut[1] = vgetq_lane_f32(sum, 1);
|
||||
|
@ -1087,7 +1119,7 @@ inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
|
|||
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
return Norm3ByMatrix43Internal(x, y, z, m);
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
return Norm3ByMatrix43Internal(v.vec, m);
|
||||
#else
|
||||
Vec3f vecOut;
|
||||
|
@ -1120,6 +1152,13 @@ inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
|
|||
}
|
||||
|
||||
inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
|
||||
#if PPSSPP_ARCH(ARM_NEON)
|
||||
// vld3q is a perfect match here!
|
||||
float32x4x3_t packed = vld3q_f32(m4x3);
|
||||
vst1q_f32(m4x4, packed.val[0]);
|
||||
vst1q_f32(m4x4 + 4, packed.val[1]);
|
||||
vst1q_f32(m4x4 + 8, packed.val[2]);
|
||||
#else
|
||||
m4x4[0] = m4x3[0];
|
||||
m4x4[1] = m4x3[3];
|
||||
m4x4[2] = m4x3[6];
|
||||
|
@ -1132,6 +1171,7 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
|
|||
m4x4[9] = m4x3[5];
|
||||
m4x4[10] = m4x3[8];
|
||||
m4x4[11] = m4x3[11];
|
||||
#endif
|
||||
m4x4[12] = 0.0f;
|
||||
m4x4[13] = 0.0f;
|
||||
m4x4[14] = 0.0f;
|
||||
|
@ -1147,6 +1187,13 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
|
|||
// 89AB
|
||||
// Don't see a way to SIMD that. Should be pretty fast anyway.
|
||||
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
|
||||
#if PPSSPP_ARCH(ARM_NEON)
|
||||
// vld3q is a perfect match here!
|
||||
float32x4x3_t packed = vld3q_f32(m4x3);
|
||||
vst1q_f32(m4x4, packed.val[0]);
|
||||
vst1q_f32(m4x4 + 4, packed.val[1]);
|
||||
vst1q_f32(m4x4 + 8, packed.val[2]);
|
||||
#else
|
||||
m4x4[0] = m4x3[0];
|
||||
m4x4[1] = m4x3[3];
|
||||
m4x4[2] = m4x3[6];
|
||||
|
@ -1159,6 +1206,7 @@ inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
|
|||
m4x4[9] = m4x3[5];
|
||||
m4x4[10] = m4x3[8];
|
||||
m4x4[11] = m4x3[11];
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void Transpose4x4(float out[16], const float in[16]) {
|
||||
|
@ -1209,7 +1257,7 @@ inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)
|
|||
__m128i c = _mm_cvtsi32_si128(rgb);
|
||||
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
||||
return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
|
||||
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
||||
return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
|
||||
|
@ -1228,7 +1276,7 @@ inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
|
|||
__m128i c = _mm_cvtsi32_si128(rgb);
|
||||
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
||||
return Vec3<int>(c);
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
|
||||
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
||||
return Vec3<int>(vreinterpretq_s32_u32(u));
|
||||
|
@ -1244,7 +1292,7 @@ __forceinline unsigned int Vec3<float>::ToRGB() const
|
|||
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
|
||||
__m128i c16 = _mm_packs_epi32(c, c);
|
||||
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
|
||||
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
||||
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
||||
|
@ -1261,7 +1309,7 @@ __forceinline unsigned int Vec3<int>::ToRGB() const
|
|||
#if defined(_M_SSE)
|
||||
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
|
||||
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
|
||||
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
||||
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
||||
|
@ -1278,7 +1326,7 @@ inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
|
|||
__m128i c = _mm_cvtsi32_si128(rgba);
|
||||
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
||||
return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
|
||||
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
||||
return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
|
||||
|
@ -1304,7 +1352,7 @@ inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
|
|||
__m128i c = _mm_cvtsi32_si128(rgba);
|
||||
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
|
||||
return Vec4<int>(c);
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
|
||||
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
|
||||
return Vec4<int>(vreinterpretq_s32_u32(u));
|
||||
|
@ -1320,7 +1368,7 @@ __forceinline unsigned int Vec4<float>::ToRGBA() const
|
|||
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
|
||||
__m128i c16 = _mm_packs_epi32(c, c);
|
||||
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
|
||||
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
||||
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
||||
|
@ -1338,7 +1386,7 @@ __forceinline unsigned int Vec4<int>::ToRGBA() const
|
|||
#if defined(_M_SSE)
|
||||
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
|
||||
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
uint16x4_t c16 = vqmovun_s32(ivec);
|
||||
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
|
||||
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
|
||||
|
|
|
@ -57,7 +57,7 @@
|
|||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#define PPSSPP_ARCH_ARM64 1
|
||||
#define PPSSPP_ARCH_64BIT 1
|
||||
#define PPSSPP_ARCH_ARM_NEON 1
|
||||
#define PPSSPP_ARCH_ARM_NEON 1 // Applies to both ARM32 and ARM64
|
||||
#define PPSSPP_ARCH_ARM64_NEON 1
|
||||
#endif
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue