Merge pull request #17905 from unknownbrackets/irjit-opt

irjit: Implement some missing, handle partial Vec4s more
Henrik Rydgård 2023-08-14 07:49:45 +02:00 committed by GitHub
commit 1beb01af6a
4 changed files with 193 additions and 48 deletions

Core/MIPS/IR/IRCompVFPU.cpp

@@ -66,10 +66,12 @@ namespace MIPSComp {
return regs[1] == regs[0] + 1;
}
static bool IsConsecutive3(const u8 regs[3]) {
return IsConsecutive2(regs) && regs[2] == regs[1] + 1;
}
static bool IsConsecutive4(const u8 regs[4]) {
return regs[1] == regs[0] + 1 &&
regs[2] == regs[1] + 1 &&
regs[3] == regs[2] + 1;
return IsConsecutive3(regs) && regs[3] == regs[2] + 1;
}
static bool IsVec2(VectorSize sz, const u8 regs[2]) {
@@ -80,6 +82,10 @@ namespace MIPSComp {
return sz == V_Quad && IsConsecutive4(regs) && (regs[0] & 3) == 0;
}
static bool IsVec3of4(VectorSize sz, const u8 regs[4]) {
return sz == V_Triple && IsConsecutive3(regs) && (regs[0] & 3) == 0;
}
static bool IsMatrixVec4(MatrixSize sz, const u8 regs[16]) {
if (sz != M_4x4)
return false;
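
The new IsVec3of4 predicate is the workhorse of this PR: it recognizes a V_Triple that sits in the low three lanes of a 4-aligned register group, so the backend can operate on the whole quad and blend lane w back afterwards. A minimal standalone sketch of the check (plain C++, illustrative register numbers, not part of the commit):

#include <cassert>
typedef unsigned char u8;

static bool IsConsecutive3(const u8 regs[3]) {
	return regs[1] == regs[0] + 1 && regs[2] == regs[1] + 1;
}
// Mirrors IsVec3of4 above, with the sz == V_Triple check already assumed.
static bool IsVec3of4Regs(const u8 regs[4]) {
	return IsConsecutive3(regs) && (regs[0] & 3) == 0;
}

int main() {
	u8 aligned[4] = { 8, 9, 10, 0 };     // lanes 0-2 of an aligned quad: accepted
	u8 misaligned[4] = { 9, 10, 11, 0 }; // starts mid-quad: rejected
	u8 scattered[4] = { 8, 9, 11, 0 };   // not consecutive: rejected
	assert(IsVec3of4Regs(aligned));
	assert(!IsVec3of4Regs(misaligned) && !IsVec3of4Regs(scattered));
	return 0;
}
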
@@ -330,7 +336,7 @@ namespace MIPSComp {
if (js.prefixD == 0)
return;
if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0) {
if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0 && opts.preferVec4) {
// Use temps for all, we'll blend in the end (keeping in Vec4.)
for (int i = 0; i < 4; ++i)
regs[i] = IRVTEMP_PFX_D + i;
@@ -372,7 +378,7 @@ namespace MIPSComp {
}
void IRFrontend::ApplyPrefixDMask(u8 *vregs, VectorSize sz, int vectorReg) {
if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0) {
if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0 && opts.preferVec4) {
u8 origV[4];
GetVectorRegs(origV, sz, vectorReg);
@@ -418,8 +424,42 @@ namespace MIPSComp {
CheckMemoryBreakpoint(rs, imm);
enum class LSVType {
INVALID,
LVQ,
SVQ,
LVLQ,
LVRQ,
SVLQ,
SVRQ,
};
LSVType optype = LSVType::INVALID;
switch (op >> 26) {
case 54: //lv.q
case 54: optype = LSVType::LVQ; break; // lv.q
case 62: optype = LSVType::SVQ; break; // sv.q
case 53: // lvl/lvr.q - highly unusual
optype = (op & 2) == 0 ? LSVType::LVLQ : LSVType::LVRQ;
break;
case 61: // svl/svr.q - highly unusual
optype = (op & 2) == 0 ? LSVType::SVLQ : LSVType::SVRQ;
break;
}
if (optype == LSVType::INVALID)
INVALIDOP;
if ((optype == LSVType::LVRQ || optype == LSVType::SVRQ) && opts.unalignedLoadStoreVec4) {
// We don't bother with an op for this, but we do fuse unaligned stores which happen.
MIPSOpcode nextOp = GetOffsetInstruction(1);
if ((nextOp.encoding ^ op.encoding) == 0x0000000E) {
// Okay, it's an svr.q/svl.q pair, same registers. Treat as lv.q/sv.q.
EatInstruction(nextOp);
optype = optype == LSVType::LVRQ ? LSVType::LVQ : LSVType::SVQ;
}
}
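
Why XOR == 0x0000000E identifies a fusable pair: in these quad load/store encodings, bit 0 carries vt's high bit, bit 1 selects the left/right variant (set for the r form, per the decode above), and bits 15:2 hold the byte offset. This is my reading of the encoding from the decode logic here, not an authoritative spec:

// The classic compiler sequence for an unaligned quad store is
//   svr.q C000, 0(a0)    bit 1 = 1, offset bits = 0b0000
//   svl.q C000, 12(a0)   bit 1 = 0, offset bits = 0b1100
// XOR of the two encodings == 0x0000000E: same vt, same rs, selector bit
// flipped, offsets 12 bytes apart. Together they cover exactly one 16-byte
// access, so the pair can be rewritten as a single LoadVec4/StoreVec4.
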
switch (optype) {
case LSVType::LVQ:
if (IsVec4(V_Quad, vregs)) {
ir.Write(IROp::LoadVec4, vregs[0], rs, ir.AddConstant(imm));
} else {
@@ -433,7 +473,7 @@ namespace MIPSComp {
}
break;
case 62: //sv.q
case LSVType::SVQ:
if (IsVec4(V_Quad, vregs)) {
ir.Write(IROp::StoreVec4, vregs[0], rs, ir.AddConstant(imm));
} else {
@@ -447,8 +487,11 @@ namespace MIPSComp {
}
break;
case 53: // lvl/lvr.q - highly unusual
case 61: // svl/svr.q - highly unusual
case LSVType::LVLQ:
case LSVType::LVRQ:
case LSVType::SVLQ:
case LSVType::SVRQ:
// These are pretty uncommon unless paired.
DISABLE;
break;
@@ -704,10 +747,21 @@ namespace MIPSComp {
GetVectorRegsPrefixT(tregs, sz, vt);
GetVectorRegsPrefixD(dregs, V_Single, vd);
if (IsVec4(sz, sregs) && IsVec4(sz, tregs) && IsOverlapSafe(dregs[0], n, sregs, n, tregs)) {
ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]);
ApplyPrefixD(dregs, V_Single, vd);
return;
if (IsOverlapSafe(dregs[0], n, sregs, n, tregs)) {
if (IsVec4(sz, sregs) && IsVec4(sz, tregs)) {
ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]);
ApplyPrefixD(dregs, V_Single, vd);
return;
} else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
// Nice example of this in Fat Princess (US) in block 088181A0 (hot.)
// Create a temporary copy of S with the last element zeroed.
ir.Write(IROp::Vec4Init, IRVTEMP_0, (int)Vec4Init::AllZERO);
ir.Write({ IROp::Vec4Blend, IRVTEMP_0, IRVTEMP_0, sregs[0], 0x7 });
// Now we can just dot like normal, with the last element effectively masked.
ir.Write(IROp::Vec4Dot, dregs[0], IRVTEMP_0, sregs[0] == tregs[0] ? IRVTEMP_0 : tregs[0]);
ApplyPrefixD(dregs, V_Single, vd);
return;
}
}
int temp0 = IRVTEMP_0;
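
Why zeroing only one operand's w lane suffices in the Vec3of4 dot above: the lane-3 term of the 4-wide dot is 0 * t.w, which is 0 for any finite garbage in t's dead lane (NaN or Inf there would still leak, so this assumes sane register contents). A standalone check with illustrative values:

#include <cassert>

static float Dot4(const float a[4], const float b[4]) {
	return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}

int main() {
	float s[4] = { 1, 2, 3, 99 };   // 99 and -77 stand in for stale data in lane w
	float t[4] = { 4, 5, 6, -77 };
	// Vec4Init AllZERO + Vec4Blend mask 0x7, modeled: (s.x, s.y, s.z, 0).
	float sMasked[4] = { s[0], s[1], s[2], 0.0f };
	// The 0 in sMasked[3] annihilates t's w term, giving the 3-element dot.
	assert(Dot4(sMasked, t) == 1*4 + 2*5 + 3*6);
	return 0;
}
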
@@ -741,6 +795,8 @@ namespace MIPSComp {
VSLT,
};
VecDo3Op type = VecDo3Op::INVALID;
VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);
// Check that we can support the ops, and prepare temporary values for ops that need it.
switch (op >> 26) {
@@ -778,9 +834,11 @@ namespace MIPSComp {
case VecDo3Op::VMUL:
break;
case VecDo3Op::VDIV:
if (!js.HasNoPrefix()) {
if (js.HasUnknownPrefix() || (sz != V_Single && !js.HasNoPrefix()))
DISABLE;
// If it's single, we just need to check the prefixes are within the size.
if (!IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op))
DISABLE;
}
break;
case VecDo3Op::VMIN:
case VecDo3Op::VMAX:
@@ -790,9 +848,6 @@ namespace MIPSComp {
break;
}
VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);
u8 sregs[4], tregs[4], dregs[4];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixT(tregs, sz, _VT);
@@ -808,7 +863,7 @@ namespace MIPSComp {
}
// If all three are consecutive 4, we're safe regardless of if we use temps so we should not check that here.
if (allowSIMD && IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {
if (allowSIMD) {
IROp opFunc = IROp::Nop;
switch (type) {
case VecDo3Op::VADD: // d[i] = s[i] + t[i]; break; //vadd
@@ -828,13 +883,24 @@ namespace MIPSComp {
break;
}
if (opFunc != IROp::Nop) {
ir.Write(opFunc, dregs[0], sregs[0], tregs[0]);
} else {
DISABLE;
if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {
if (opFunc != IROp::Nop) {
ir.Write(opFunc, dregs[0], sregs[0], tregs[0]);
} else {
DISABLE;
}
ApplyPrefixD(dregs, sz, _VD);
return;
} else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
// This is actually pretty common. Use a temp + blend.
// We could post-process this, but it's easier to do it here.
if (opFunc == IROp::Nop)
DISABLE;
ir.Write(opFunc, IRVTEMP_0, sregs[0], tregs[0]);
ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 });
ApplyPrefixD(dregs, sz, _VD);
return;
}
ApplyPrefixD(dregs, sz, _VD);
return;
}
if (type == VecDo3Op::VSGE || type == VecDo3Op::VSLT) {
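
The Vec3of4 paths in this hunk, and several more below, all lean on the same Vec4Blend idiom. Judging from how it is used (mask 0x7 overwrites lanes x/y/z and keeps w), the op behaves like the following scalar sketch; this is inferred from usage here, not quoted from IR documentation:

// Mask bit i set: take lane i from the second source, else from the first.
static void Vec4BlendModel(float d[4], const float a[4], const float b[4], unsigned mask) {
	for (int i = 0; i < 4; ++i)
		d[i] = (mask & (1u << i)) ? b[i] : a[i];
}
// So ir.Write({ IROp::Vec4Blend, d, d, temp, 0x7 }) merges a result computed
// into a temp back over x/y/z while preserving whatever the guest keeps in d.w.
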
@@ -901,10 +967,8 @@ namespace MIPSComp {
// D prefix is fine for these, and used sometimes.
if (js.HasUnknownPrefix() || js.HasSPrefix())
DISABLE;
} else {
// Many of these apply the D prefix strangely or override parts of the S prefix.
if (!js.HasNoPrefix())
DISABLE;
} else if (optype == 5 && js.HasDPrefix()) {
DISABLE;
}
// Vector unary operation
@@ -912,13 +976,19 @@ namespace MIPSComp {
int vs = _VS;
int vd = _VD;
VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);
if (optype >= 16 && !js.HasNoPrefix()) {
DISABLE;
} else if ((optype == 1 || optype == 2) && js.HasSPrefix()) {
DISABLE;
} else if (optype == 5 && js.HasDPrefix()) {
DISABLE;
// Many of these apply the D prefix strangely or override parts of the S prefix.
if (js.HasUnknownPrefix() || sz != V_Single)
DISABLE;
// If it's single, we just need to check the prefixes are within the size.
if (!IsPrefixWithinSize(js.prefixS, op))
DISABLE;
// The negative ones seem to use negate flags as a prefix hack.
if (optype >= 24 && (js.prefixS & 0x000F0000) != 0)
DISABLE;
}
// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
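
For context on the 0x000F0000 test in the hunk above: the VFPU source-prefix word packs per-lane controls in the standard layout (stated from memory, worth verifying against VFPU docs):

// bits  7:0   two swizzle bits per lane (source lane select)
// bits 11:8   abs flag per lane
// bits 15:12  constant-select flag per lane
// bits 19:16  negate flag per lane
// So (js.prefixS & 0x000F0000) != 0 means "at least one lane negated", which
// the negative-result ops (vnrcp, vnsin, and friends at optype >= 24) seem to
// repurpose as part of their own encoding -- hence the conservative DISABLE.
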
@@ -926,9 +996,6 @@ namespace MIPSComp {
return;
}
VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);
u8 sregs[4]{}, dregs[4]{};
GetVectorRegsPrefixS(sregs, sz, vs);
GetVectorRegsPrefixD(dregs, sz, vd);
@@ -954,20 +1021,34 @@ namespace MIPSComp {
break;
}
if (canSIMD && !usingTemps && IsVec4(sz, sregs) && IsVec4(sz, dregs)) {
if (canSIMD && !usingTemps) {
IROp irop = IROp::Nop;
switch (optype) {
case 0: // vmov
ir.Write(IROp::Vec4Mov, dregs[0], sregs[0]);
irop = IROp::Vec4Mov;
break;
case 1: // vabs
ir.Write(IROp::Vec4Abs, dregs[0], sregs[0]);
irop = IROp::Vec4Abs;
break;
case 2: // vneg
ir.Write(IROp::Vec4Neg, dregs[0], sregs[0]);
irop = IROp::Vec4Neg;
break;
}
ApplyPrefixD(dregs, sz, vd);
return;
if (IsVec4(sz, sregs) && IsVec4(sz, dregs) && irop != IROp::Nop) {
ir.Write(irop, dregs[0], sregs[0]);
ApplyPrefixD(dregs, sz, vd);
return;
} else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, dregs) && irop != IROp::Nop && opts.preferVec4) {
// This is a simple case of vmov.t, just blend.
if (irop == IROp::Vec4Mov) {
ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], sregs[0], 0x7 });
} else {
ir.Write(irop, IRVTEMP_0, sregs[0]);
ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 });
}
ApplyPrefixD(dregs, sz, vd);
return;
}
}
for (int i = 0; i < n; ++i) {
@@ -1378,11 +1459,16 @@ namespace MIPSComp {
}
}
if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) {
if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) {
if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) {
if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) {
ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg);
ApplyPrefixD(dregs, sz, vd);
return;
} else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, dregs) && opts.preferVec4) {
ir.Write(IROp::Vec4Scale, IRVTEMP_0, sregs[0], treg);
ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 });
ApplyPrefixD(dregs, sz, vd);
return;
}
}
@@ -1627,8 +1713,46 @@ namespace MIPSComp {
// d[0] = s[y]*t[z], d[1] = s[z]*t[x], d[2] = s[x]*t[y]
// To do a full cross product: vcrs tmp1, s, t; vcrs tmp2, t, s; vsub d, tmp1, tmp2;
// (or just use vcrsp.)
// Note: this is possibly just a swizzle prefix hack for vmul.
DISABLE;
VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);
if (sz != V_Triple)
DISABLE;
u8 sregs[4], dregs[4], tregs[4];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixT(tregs, sz, _VT);
GetVectorRegsPrefixD(dregs, sz, _VD);
if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
// Use Vec4 where we can. First, apply shuffles.
ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], VFPU_SWIZZLE(1, 2, 0, 3));
ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, tregs[0], VFPU_SWIZZLE(2, 0, 1, 3));
ir.Write(IROp::Vec4Mul, IRVTEMP_0, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
// Now just retain w and blend in our values.
ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 });
} else {
u8 tempregs[4]{};
if (!IsOverlapSafe(n, dregs, n, sregs, n, tregs)) {
for (int i = 0; i < n; ++i)
tempregs[i] = IRVTEMP_0 + i;
} else {
for (int i = 0; i < n; ++i)
tempregs[i] = dregs[i];
}
ir.Write(IROp::FMul, tempregs[0], sregs[1], tregs[2]);
ir.Write(IROp::FMul, tempregs[1], sregs[2], tregs[0]);
ir.Write(IROp::FMul, tempregs[2], sregs[0], tregs[1]);
for (int i = 0; i < n; i++) {
if (tempregs[i] != dregs[i])
ir.Write(IROp::FMov, dregs[i], tempregs[i]);
}
}
ApplyPrefixD(dregs, sz, _VD);
}
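
The Vec4 path above implements vcrs with two shuffles and a multiply: shuffling s by (1, 2, 0, 3) yields (s.y, s.z, s.x, s.w), shuffling t by (2, 0, 1, 3) yields (t.z, t.x, t.y, t.w), and the lanewise product is exactly (s.y*t.z, s.z*t.x, s.x*t.y), with w discarded by the 0x7 blend. A scalar check of that, plus the full-cross-product identity from the comment:

#include <cassert>

// Scalar model of vcrs as compiled above (w lane omitted; it gets blended away).
static void Vcrs(float d[3], const float s[3], const float t[3]) {
	d[0] = s[1] * t[2];
	d[1] = s[2] * t[0];
	d[2] = s[0] * t[1];
}

int main() {
	float s[3] = { 1, 2, 3 }, t[3] = { 4, 5, 6 };
	float a[3], b[3];
	Vcrs(a, s, t);
	Vcrs(b, t, s);
	// Per the comment in the code: cross(s, t) == vcrs(s, t) - vcrs(t, s).
	assert(a[0] - b[0] == 2*6 - 3*5);  // s.y*t.z - s.z*t.y
	assert(a[1] - b[1] == 3*4 - 1*6);  // s.z*t.x - s.x*t.z
	assert(a[2] - b[2] == 1*5 - 2*4);  // s.x*t.y - s.y*t.x
	return 0;
}
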
void IRFrontend::Comp_VDet(MIPSOpcode op) {
@@ -2040,6 +2164,10 @@ namespace MIPSComp {
if (IsVec4(sz, dregs)) {
ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum]));
ir.Write(IROp::Vec4Shuffle, dregs[0], IRVTEMP_0, 0);
} else if (IsVec3of4(sz, dregs) && opts.preferVec4) {
ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum]));
ir.Write(IROp::Vec4Shuffle, IRVTEMP_0, IRVTEMP_0, 0);
ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 });
} else {
for (int i = 0; i < n; i++) {
// Most of the time, materializing a float is slower than copying from another float.
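
In the Vec3of4 vcst branch above, the shuffle immediate 0 is a broadcast: the immediate packs four 2-bit lane selectors, and all zeros selects lane 0 for every output lane. A scalar model of my reading of Vec4Shuffle (the copy through tmp reflects that the destination may alias the source):

static void Vec4ShuffleModel(float d[4], const float s[4], int sel) {
	float tmp[4];
	for (int i = 0; i < 4; ++i)
		tmp[i] = s[(sel >> (2 * i)) & 3];
	for (int i = 0; i < 4; ++i)
		d[i] = tmp[i];
}
// Vec4ShuffleModel(v, v, 0) splats v[0] across all four lanes; the 0x7 blend
// then writes the constant into x/y/z and leaves the guest's w untouched.
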
@@ -2190,6 +2318,9 @@ namespace MIPSComp {
if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {
ir.Write(IROp::Vec4Add, dregs[0], tregs[0], sregs[0]);
} else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
ir.Write(IROp::Vec4Add, IRVTEMP_0, tregs[0], sregs[0]);
ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 });
} else {
u8 tempregs[4];
for (int i = 0; i < n; ++i) {

Core/MIPS/IR/IRInst.h

@@ -385,6 +385,8 @@ private:
struct IROptions {
uint32_t disableFlags;
bool unalignedLoadStore;
bool unalignedLoadStoreVec4;
bool preferVec4;
};
const IRMeta *GetIRMeta(IROp op);
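
A commented restatement of the options struct; the meanings of the two new fields are inferred from how the IRJit constructor sets them and how the frontend consults them above, so treat the comments as my reading rather than official documentation:

struct IROptions {
	uint32_t disableFlags;        // JitDisable bits from the user config.
	bool unalignedLoadStore;      // Host copes with scalar unaligned accesses.
	bool unalignedLoadStoreVec4;  // Fused lvl/lvr (svl/svr) quad pairs may emit
	                              // a single unaligned LoadVec4/StoreVec4.
	bool preferVec4;              // Host SIMD is strong enough that Vec4 and
	                              // Vec3of4-with-blend IR beats scalar expansion.
};
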

Core/MIPS/IR/IRJit.cpp

@@ -50,9 +50,20 @@ IRJit::IRJit(MIPSState *mipsState) : frontend_(mipsState->HasDefaultPrefix()), m
IROptions opts{};
opts.disableFlags = g_Config.uJitDisableFlags;
// Assume that RISC-V always has very slow unaligned memory accesses.
#if !PPSSPP_ARCH(RISCV64)
#if PPSSPP_ARCH(RISCV64)
// Assume RISC-V always has very slow unaligned memory accesses.
opts.unalignedLoadStore = false;
opts.unalignedLoadStoreVec4 = true;
opts.preferVec4 = cpu_info.RiscV_V;
#elif PPSSPP_ARCH(ARM)
opts.unalignedLoadStore = (opts.disableFlags & (uint32_t)JitDisable::LSU_UNALIGNED) == 0;
opts.unalignedLoadStoreVec4 = true;
opts.preferVec4 = cpu_info.bASIMD || cpu_info.bNEON;
#else
opts.unalignedLoadStore = (opts.disableFlags & (uint32_t)JitDisable::LSU_UNALIGNED) == 0;
// TODO: Could allow on x86 pretty easily...
opts.unalignedLoadStoreVec4 = false;
opts.preferVec4 = true;
#endif
frontend_.SetOptions(opts);
}
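
The TODO above hints that x86 could also allow unaligned Vec4 accesses. A hypothetical sketch of that branch, not part of this commit (the PPSSPP_ARCH(X86)/PPSSPP_ARCH(AMD64) spelling and the reliance on movups-style unaligned loads are my assumptions):

#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
	opts.unalignedLoadStore = (opts.disableFlags & (uint32_t)JitDisable::LSU_UNALIGNED) == 0;
	// SSE unaligned 128-bit moves are cheap on modern cores, so fused
	// svl.q/svr.q pairs could become a single LoadVec4/StoreVec4 here too.
	opts.unalignedLoadStoreVec4 = true;
	opts.preferVec4 = true;
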

Core/MIPS/JitCommon/JitState.h

@@ -233,6 +233,7 @@ namespace MIPSComp {
bool downcountInRegister;
// ARM64 only
bool useASIMDVFPU;
// ARM64 and RV64
bool useStaticAlloc;
bool enablePointerify;