Revert "VertexLoader: Remove now-unused weights translation code"

This reverts commit 44100c6c1d.
Henrik Rydgård 2018-04-10 12:30:49 +02:00
parent 0345e1a7d4
commit ccd594dae7
5 changed files with 355 additions and 0 deletions

View file

@@ -111,6 +111,9 @@ static const ARMReg srcNEON = Q2;
static const ARMReg accNEON = Q3;

static const JitLookup jitLookup[] = {
	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
@@ -323,6 +326,55 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
	return (JittedVertexDecoder)start;
}

void VertexDecoderJitCache::Jit_WeightsU8() {
	// Basic implementation - a byte at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRB(tempReg1, srcReg, dec_->weightoff + j);
		STRB(tempReg1, dstReg, dec_->decFmt.w0off + j);
	}
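	// Zero-pad the decoded weights out to a multiple of four, since the decoded
	// format always stores them in groups of four.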
	if (j & 3) {
		// Create a zero register. Might want to make a fixed one.
		EOR(scratchReg, scratchReg, scratchReg);
	}
	while (j & 3) {
		STRB(scratchReg, dstReg, dec_->decFmt.w0off + j);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU16() {
	// Basic implementation - a short at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
		STRH(tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
	}
	if (j & 3) {
		// Create a zero register. Might want to make a fixed one.
		EOR(scratchReg, scratchReg, scratchReg);
	}
	while (j & 3) {
		STRH(scratchReg, dstReg, dec_->decFmt.w0off + j * 2);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsFloat() {
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDR(tempReg1, srcReg, dec_->weightoff + j * 4);
		STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
	}
	if (j & 3) {
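		// Create a zero register for the padding stores, as in the integer variants above.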
		EOR(tempReg1, tempReg1, tempReg1);
	}
	while (j & 3) { // Zero additional weights rounding up to 4.
		STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
		j++;
	}
}

static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 };
static const ARMReg neonWeightRegsD[4] = { D4, D5, D6, D7 };
static const ARMReg neonWeightRegsQ[2] = { Q2, Q3 };

View file

@@ -85,6 +85,9 @@ static const ARM64Reg neonWeightRegsQ[2] = { Q3, Q2 }; // reverse order to prev
// Q16+ are free-for-all for matrices. In 16 registers, we can fit 4 4x4 matrices.

static const JitLookup jitLookup[] = {
	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
@@ -353,6 +356,44 @@ void VertexDecoderJitCache::Jit_ApplyWeights() {
	}
}

void VertexDecoderJitCache::Jit_WeightsU8() {
	// Basic implementation - a byte at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j);
		STRB(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j);
	}
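	// On ARM64 we can store straight from WZR, the hardwired zero register,
	// so no explicit zeroing pass is needed for the padding.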
	while (j & 3) {
		STRB(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU16() {
	// Basic implementation - a short at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 2);
		STRH(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
	}
	while (j & 3) {
		STRH(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 2);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsFloat() {
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 4);
		STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
	}
	while (j & 3) { // Zero additional weights rounding up to 4.
		STR(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 4);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU8Skin() {
	// Weight is first so srcReg is correct.
	switch (dec_->nweights) {

View file

@@ -177,6 +177,67 @@ void PrintDecodedVertex(VertexReader &vtx) {
VertexDecoder::VertexDecoder() : decoded_(nullptr), ptr_(nullptr), jitted_(0), jittedSize_(0) {
}

void VertexDecoder::Step_WeightsU8() const
{
	u8 *wt = (u8 *)(decoded_ + decFmt.w0off);
	const u8 *wdata = (const u8*)(ptr_);
	int j;
	for (j = 0; j < nweights; j++)
		wt[j] = wdata[j];
	while (j & 3) // Zero additional weights rounding up to 4.
		wt[j++] = 0;
}

void VertexDecoder::Step_WeightsU16() const
{
	u16 *wt = (u16 *)(decoded_ + decFmt.w0off);
	const u16 *wdata = (const u16*)(ptr_);
	int j;
	for (j = 0; j < nweights; j++)
		wt[j] = wdata[j];
	while (j & 3) // Zero additional weights rounding up to 4.
		wt[j++] = 0;
}

void VertexDecoder::Step_WeightsU8ToFloat() const
{
	float *wt = (float *)(decoded_ + decFmt.w0off);
	const u8 *wdata = (const u8*)(ptr_);
	int j;
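	// U8 weights are 1.7 fixed point (128 = 1.0), matching the PSP's 0.0-2.0
	// weight range; e.g. 64 decodes to 64 / 128 = 0.5.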
	for (j = 0; j < nweights; j++) {
		wt[j] = (float)wdata[j] * (1.0f / 128.0f);
	}
	while (j & 3) // Zero additional weights rounding up to 4.
		wt[j++] = 0;
}

void VertexDecoder::Step_WeightsU16ToFloat() const
{
	float *wt = (float *)(decoded_ + decFmt.w0off);
	const u16 *wdata = (const u16*)(ptr_);
	int j;
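	// Likewise, U16 weights are 1.15 fixed point: 32768 = 1.0.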
	for (j = 0; j < nweights; j++) {
		wt[j] = (float)wdata[j] * (1.0f / 32768.0f);
	}
	while (j & 3) // Zero additional weights rounding up to 4.
		wt[j++] = 0;
}

// Float weights should be uncommon, so we can live with having to multiply these by 2.0
// to avoid special checks in the vertex shader generator.
// (PSP uses 0.0-2.0 fixed point numbers for weights.)
void VertexDecoder::Step_WeightsFloat() const
{
	float *wt = (float *)(decoded_ + decFmt.w0off);
	const float *wdata = (const float*)(ptr_);
	int j;
	for (j = 0; j < nweights; j++) {
		wt[j] = wdata[j];
	}
	while (j & 3) // Zero additional weights rounding up to 4.
		wt[j++] = 0.0f;
}

void VertexDecoder::ComputeSkinMatrix(const float weights[8]) const {
	memset(skinMatrix, 0, sizeof(skinMatrix));
	for (int j = 0; j < nweights; j++) {
@@ -809,6 +870,20 @@ void VertexDecoder::Step_PosFloatMorphSkin() const {
	Vec3ByMatrix43(v, pos, skinMatrix);
}

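// Indexed by the weight format from the vertex type: 0 = no weights, 1 = u8, 2 = u16, 3 = float.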
static const StepFunction wtstep[4] = {
	0,
	&VertexDecoder::Step_WeightsU8,
	&VertexDecoder::Step_WeightsU16,
	&VertexDecoder::Step_WeightsFloat,
};

static const StepFunction wtstepToFloat[4] = {
	0,
	&VertexDecoder::Step_WeightsU8ToFloat,
	&VertexDecoder::Step_WeightsU16ToFloat,
	&VertexDecoder::Step_WeightsFloat,
};

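// Illustration only (not part of this commit): the decoder setup is expected to
// index these tables with the weight format, picking the ToFloat variants when
// the backend wants weights pre-converted to float. Roughly (wantFloatWeights
// is a hypothetical stand-in for the real condition):
//
//   steps_[numSteps_++] = wantFloatWeights ? wtstepToFloat[weighttype] : wtstep[weighttype];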
// TODO: Morph weights correctly! This is missing. Not sure if any game actually
// uses this functionality at all.

View file

@@ -461,6 +461,12 @@ public:
	std::string GetString(DebugShaderStringType stringType);

	void Step_WeightsU8() const;
	void Step_WeightsU16() const;
	void Step_WeightsU8ToFloat() const;
	void Step_WeightsU16ToFloat() const;
	void Step_WeightsFloat() const;

	void ComputeSkinMatrix(const float weights[8]) const;

	void Step_WeightsU8Skin() const;
@@ -613,6 +619,12 @@ public:
	JittedVertexDecoder Compile(const VertexDecoder &dec, int32_t *jittedSize);
	void Clear();

	void Jit_WeightsU8();
	void Jit_WeightsU16();
	void Jit_WeightsU8ToFloat();
	void Jit_WeightsU16ToFloat();
	void Jit_WeightsFloat();
	void Jit_WeightsU8Skin();
	void Jit_WeightsU16Skin();
	void Jit_WeightsFloatSkin();

View file

@@ -94,10 +94,16 @@ static const X64Reg fpScratchReg4 = XMM4;
// on the interpreter if the compiler fails.

static const JitLookup jitLookup[] = {
	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},

	{&VertexDecoder::Step_WeightsU8ToFloat, &VertexDecoderJitCache::Jit_WeightsU8ToFloat},
	{&VertexDecoder::Step_WeightsU16ToFloat, &VertexDecoderJitCache::Jit_WeightsU16ToFloat},

	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
@@ -276,6 +282,175 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
	return (JittedVertexDecoder)start;
}

void VertexDecoderJitCache::Jit_WeightsU8() {
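	// Copy the weights with at most two 32-bit loads. For counts that aren't a
	// multiple of four, the MOVZX/AND variants also zero the padding bytes
	// within the last group of four.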
	switch (dec_->nweights) {
	case 1:
		MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff));
		break;
	case 2:
		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
		break;
	case 3:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		AND(32, R(tempReg1), Imm32(0x00FFFFFF));
		break;
	case 4:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		break;
	case 5:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
		break;
	case 6:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
		break;
	case 7:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
		AND(32, R(tempReg2), Imm32(0x00FFFFFF));
		break;
	case 8:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
		break;
	}
	if (dec_->nweights <= 4) {
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
	} else {
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
		MOV(32, MDisp(dstReg, dec_->decFmt.w1off), R(tempReg2));
	}
}

void VertexDecoderJitCache::Jit_WeightsU16() {
	switch (dec_->nweights) {
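	// For 1-3 weights, the dword stores below also zero the remainder of the
	// 8-byte group; for 4+ weights, four are copied here and any remainder is
	// handled in the loop after the switch.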
	case 1:
		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
		return;
	case 2:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
		return;
	case 3:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
		return;
	case 4:
		// Anything above 4 will do 4 here, and then the rest after.
	case 5:
	case 6:
	case 7:
	case 8:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
		break;
	}
	// Basic implementation - a short at a time. TODO: Optimize
	int j;
	for (j = 4; j < dec_->nweights; j++) {
		MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2));
		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1));
	}
	while (j & 3) {
		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0));
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
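	// Jit_AnyU8ToFloat converts the span given by (offset, size in bits) and,
	// as the stores below rely on, leaves the resulting floats in XMM3.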
	if (dec_->nweights >= 4) {
		Jit_AnyU8ToFloat(dec_->weightoff, 32);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		if (dec_->nweights > 4) {
			Jit_AnyU8ToFloat(dec_->weightoff + 4, (dec_->nweights - 4) * 8);
			MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
		}
	} else {
		Jit_AnyU8ToFloat(dec_->weightoff, dec_->nweights * 8);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
	}
}

void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
	if (dec_->nweights >= 4) {
		Jit_AnyU16ToFloat(dec_->weightoff, 64);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		if (dec_->nweights > 4) {
			Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2, (dec_->nweights - 4) * 16);
			MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
		}
	} else {
		Jit_AnyU16ToFloat(dec_->weightoff, dec_->nweights * 16);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
	}
}

void VertexDecoderJitCache::Jit_WeightsFloat() {
	int j;
	switch (dec_->nweights) {
	case 1:
		// MOVSS: When the source operand is a memory location and destination operand is an XMM register, the three high-order doublewords of the destination operand are cleared to all 0s.
		MOVSS(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		break;
	case 2:
		MOVQ_xmm(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		break;
	case 4:
		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		break;
	case 5:
		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVSS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
		break;
	case 6:
		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVQ_xmm(XMM4, MDisp(srcReg, dec_->weightoff + 16));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
		break;
	case 8:
		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVUPS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
		break;
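	// 3 and 7 weights fall through to the generic copy below, which also
	// zero-pads up to a multiple of four.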
	default:
		for (j = 0; j < dec_->nweights; j++) {
			MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
		}
		while (j & 3) { // Zero additional weights rounding up to 4.
			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
			j++;
		}
		break;
	}
}

void VertexDecoderJitCache::Jit_WeightsU8Skin() {
	MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));